gpt4 book ai didi

python - sklearn GridSearchCV 不在评分函数中使用 sample_weight

转载 作者:太空狗 更新时间:2023-10-29 22:24:31 26 4
gpt4 key购买 nike

我的数据中每个样本具有不同的权重。在我的应用中,在估计模型和比较备选模型时考虑这些权重非常重要。

我正在使用 sklearn 来估计模型并比较备选超参数选择。但是这个单元测试表明 GridSearchCV 没有应用 sample_weights 来估计分数。

有没有办法让 sklearn 使用 sample_weight 对模型进行评分?

单元测试:

from __future__ import division

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, RepeatedKFold


def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
    """Cross-validate a random forest by hand for each ``max_features`` value.

    For every value in ``max_features_grid`` a ``RandomForestClassifier`` is
    fit on each training split produced by ``cv`` (always with the training
    sample weights) and scored with ``log_loss`` on the matching test split.

    Args:
        X_in: Feature array, indexed as ``X_in[rows, :]``.
        y_in: Label array, indexed by row.
        w_in: Per-sample weight array, indexed by row.
        cv: Splitter exposing ``split(X=..., y=...)`` that yields
            ``(train_indices, test_indices)`` pairs.
        max_features_grid: Iterable of ``max_features`` values to compare.
        use_weighting: If true, each fold's test-weighted log loss is scaled
            by that fold's share of the total weight and the contributions
            are summed over folds. Otherwise the unweighted per-fold log
            losses are averaged with ``np.mean``.

    Returns:
        Tuple ``(best_score, best_param)``: the lowest aggregated loss and the
        ``max_features`` value that achieved it.

    Note:
        Reads the module-level ``RANDOM_STATE`` to seed the classifier.
    """
    out_results = dict()
    # Loop-invariant: total weight over all samples.
    total_weight = w_in.sum()

    for k in max_features_grid:
        clf = RandomForestClassifier(n_estimators=256,
                                     criterion="entropy",
                                     warm_start=False,
                                     n_jobs=-1,
                                     random_state=RANDOM_STATE,
                                     max_features=k)
        for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
            X_train = X_in[train_ndx, :]
            y_train = y_in[train_ndx]
            w_train = w_in[train_ndx]
            # Use the y_in argument; the previous code indexed the global
            # `y`, which only worked when the caller passed that same array.
            y_test = y_in[test_ndx]

            clf.fit(X=X_train, y=y_train, sample_weight=w_train)

            y_hat = clf.predict_proba(X=X_in[test_ndx, :])
            if use_weighting:
                w_test = w_in[test_ndx]
                w_i_sum = w_test.sum()
                score = w_i_sum / total_weight * log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
            else:
                score = log_loss(y_true=y_test, y_pred=y_hat)

            out_results.setdefault(k, []).append(score)

    # Weighted fold scores are already scaled by their weight share, so they
    # are summed; unweighted fold scores are averaged.
    aggregate = sum if use_weighting else np.mean
    out_results = {k: aggregate(v) for k, v in out_results.items()}

    best_score = min(out_results.values())
    best_param = min(out_results, key=out_results.get)
    return best_score, best_param


if __name__ == "__main__":
    # make_scorer is called below but is not among this listing's file-level
    # imports; import it here so the script does not fail with a NameError.
    from sklearn.metrics import make_scorer

    RANDOM_STATE = 1337
    X, y = load_iris(return_X_y=True)
    # Strongly non-uniform weights: 1, 101, ..., 2401, repeating every 25 rows.
    sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
    # Alternative: uniform weights.
    # sample_weight = np.array([1 for _ in range(len(X))])

    inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)

    outer_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)

    rfc = RandomForestClassifier(n_estimators=256,
                                 criterion="entropy",
                                 warm_start=False,
                                 n_jobs=-1,
                                 random_state=RANDOM_STATE)
    search_params = {"max_features": [1, 2, 3, 4]}

    # The weights are handed to fit() only; the scorer built below receives
    # no sample_weight.
    fit_params = {"sample_weight": sample_weight}
    my_scorer = make_scorer(log_loss,
                            greater_is_better=False,
                            needs_proba=True,
                            needs_threshold=False)

    # NOTE: the `iid` argument was removed from GridSearchCV in newer
    # scikit-learn releases; drop it when running on such a version.
    grid_clf = GridSearchCV(estimator=rfc,
                            scoring=my_scorer,
                            cv=inner_cv,
                            param_grid=search_params,
                            refit=True,
                            return_train_score=False,
                            iid=False)  # in this usage, the results are the same for `iid=True` and `iid=False`
    grid_clf.fit(X, y, **fit_params)
    # The scorer negates the loss (greater_is_better=False), so flip the sign
    # back for display.
    print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)

    msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""
    score_with_weights, param_with_weights = grid_cv(X_in=X,
                                                     y_in=y,
                                                     w_in=sample_weight,
                                                     cv=inner_cv,
                                                     max_features_grid=search_params.get(
                                                         "max_features"),
                                                     use_weighting=True)
    print(msg % ("WITH", score_with_weights))

    score_without_weights, param_without_weights = grid_cv(X_in=X,
                                                           y_in=y,
                                                           w_in=sample_weight,
                                                           cv=inner_cv,
                                                           max_features_grid=search_params.get(
                                                               "max_features"),
                                                           use_weighting=False)
    print(msg % ("WITHOUT", score_without_weights))

产生输出:

This is the best out-of-sample score using GridSearchCV: 0.135692.
This is the best out-of-sample score WITH weighting using grid_cv: 0.099367.
This is the best out-of-sample score WITHOUT weighting using grid_cv: 0.135692.

说明:由于手动计算不带权重的损失会产生与 GridSearchCV 相同的评分,因此我们知道未使用样本权重。

最佳答案

GridSearchCV 接受一个 scoring 参数作为输入,它可以是一个可调用对象。关于如何更改评分函数以及如何传入自定义评分函数的详细信息,请参见原答案中 “here” 所指向的链接。为了完整起见,以下是该页面的相关代码:

enter image description here

编辑:fit_params 仅传递给拟合函数,而不会传递给评分函数。如果有应该传递给 scorer 的参数,则应将它们传递给 make_scorer。但这仍然没有解决这里的问题,因为这意味着整个 sample_weight 参数都会被传递给 log_loss,而实际上在计算损失时,只应传入与 y_test 对应的那一部分权重。

sklearn 不直接支持这种做法,但您可以借助 pandas.DataFrame 绕过这一限制。好消息是,sklearn 能够识别 DataFrame,并在传递过程中保持其类型不变。这意味着您可以利用 DataFrame 的 index,如下面的代码所示:

  # more code

# Load the data and build the per-sample weights (1, 101, ..., 2401, cycling
# every 25 rows).
X, y = load_iris(return_X_y=True)
sample_weight = 1 + 100 * (np.arange(len(X)) % 25)

# Give labels and weights the same string row index so that a scorer can look
# up the weights belonging to whichever rows it is handed.
index = ['r%d' % row for row in range(len(y))]
y_frame = pd.DataFrame(y, index=index)
sample_weight_frame = pd.DataFrame(sample_weight, index=index)

# more code

def score_f(y_true, y_pred, sample_weight):
    """Return the normalized log loss weighted by the rows present in ``y_true``.

    ``y_true`` and ``sample_weight`` are expected to be pandas objects: the
    index of ``y_true`` is used to select the matching rows of
    ``sample_weight`` before both are converted to plain arrays.
    """
    row_labels = y_true.index.values
    fold_weights = sample_weight.loc[row_labels].values.reshape(-1)
    return log_loss(y_true.values, y_pred, sample_weight=fold_weights, normalize=True)

# The full weight frame is bound to the scorer as a keyword argument; score_f
# picks out the rows it needs by index.
score_params = dict(sample_weight=sample_weight_frame)
my_scorer = make_scorer(
    score_f,
    needs_threshold=False,
    needs_proba=True,
    greater_is_better=False,
    **score_params
)

# in this usage, the results are the same for `iid=True` and `iid=False`
grid_clf = GridSearchCV(
    estimator=rfc,
    param_grid=search_params,
    scoring=my_scorer,
    cv=inner_cv,
    iid=False,
    refit=True,
    return_train_score=False,
)
# Labels are passed as a DataFrame so that the row index reaches score_f.
grid_clf.fit(X, y_frame)

# more code

如您所见,score_f 使用 y_true 的 index 来查找应使用 sample_weight 中的哪些部分。为了完整起见,以下是完整代码:

from __future__ import division

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.metrics import make_scorer
import pandas as pd

def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
    """Score a random forest per ``max_features`` value with manual CV.

    Each candidate is fit on every training split from ``cv`` using the
    training weights, then evaluated with ``log_loss`` on the held-out split.
    With ``use_weighting`` the held-out loss is weighted, scaled by the
    split's share of the total weight, and summed over splits; without it the
    plain losses are averaged.

    Returns ``(best_score, best_param)``, where "best" means the smallest
    aggregated loss. The classifier seed comes from the module-level
    ``RANDOM_STATE``.
    """
    fold_scores = {}

    for max_features in max_features_grid:
        forest = RandomForestClassifier(
            n_estimators=256,
            criterion="entropy",
            warm_start=False,
            n_jobs=1,
            random_state=RANDOM_STATE,
            max_features=max_features,
        )
        for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
            forest.fit(
                X=X_in[train_ndx, :],
                y=y_in[train_ndx],
                sample_weight=w_in[train_ndx],
            )
            truth = y_in[test_ndx]
            proba = forest.predict_proba(X=X_in[test_ndx, :])

            if not use_weighting:
                fold_loss = log_loss(y_true=truth, y_pred=proba)
            else:
                held_out_w = w_in[test_ndx]
                weight_share = held_out_w.sum() / w_in.sum()
                fold_loss = weight_share * log_loss(y_true=truth, y_pred=proba, sample_weight=held_out_w)

            fold_scores.setdefault(max_features, []).append(fold_loss)

    # Collapse the per-fold losses into one number per candidate.
    summary = {}
    for param, losses in fold_scores.items():
        summary[param] = sum(losses) if use_weighting else np.mean(losses)

    best_param = min(summary, key=summary.get)
    best_score = min(summary.values())
    return best_score, best_param


# Runs unconditionally: the usual `if __name__ == "__main__":` guard is
# replaced by `if True:` in this listing.
if True:
    RANDOM_STATE = 1337

    # Data, per-sample weights (1, 101, ..., 2401, cycling every 25 rows), and
    # DataFrame copies that share a string row index.
    X, y = load_iris(return_X_y=True)
    sample_weight = 1 + 100 * (np.arange(len(X)) % 25)
    # Alternative: uniform weights.
    # sample_weight = np.array([1 for _ in range(len(X))])
    index = ['r%d' % row for row in range(len(y))]
    y_frame = pd.DataFrame(y, index=index)
    sample_weight_frame = pd.DataFrame(sample_weight, index=index)

    inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
    outer_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)

    search_params = {"max_features": [1, 2, 3, 4]}
    rfc = RandomForestClassifier(
        n_estimators=256,
        criterion="entropy",
        warm_start=False,
        n_jobs=1,
        random_state=RANDOM_STATE,
    )

    def score_f(y_true, y_pred, sample_weight):
        """Return the normalized log loss weighted by the rows in ``y_true``.

        The index of ``y_true`` selects the matching rows of ``sample_weight``.
        """
        row_labels = y_true.index.values
        fold_weights = sample_weight.loc[row_labels].values.reshape(-1)
        return log_loss(y_true.values, y_pred, sample_weight=fold_weights, normalize=True)

    # The full weight frame is bound to the scorer; score_f slices it by index.
    score_params = dict(sample_weight=sample_weight_frame)
    my_scorer = make_scorer(
        score_f,
        needs_threshold=False,
        needs_proba=True,
        greater_is_better=False,
        **score_params
    )

    # in this usage, the results are the same for `iid=True` and `iid=False`
    grid_clf = GridSearchCV(
        estimator=rfc,
        param_grid=search_params,
        scoring=my_scorer,
        cv=inner_cv,
        iid=False,
        refit=True,
        return_train_score=False,
    )
    # Labels go in as a DataFrame so that the row index reaches score_f.
    grid_clf.fit(X, y_frame)
    # The scorer negates the loss, so flip the sign back for display.
    print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)

    msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""

    # Manual cross-validation with weighted scoring.
    score_with_weights, param_with_weights = grid_cv(
        X_in=X,
        y_in=y,
        w_in=sample_weight,
        cv=inner_cv,
        max_features_grid=search_params.get("max_features"),
        use_weighting=True,
    )
    print(msg % ("WITH", score_with_weights))

    # Manual cross-validation with unweighted scoring.
    score_without_weights, param_without_weights = grid_cv(
        X_in=X,
        y_in=y,
        w_in=sample_weight,
        cv=inner_cv,
        max_features_grid=search_params.get("max_features"),
        use_weighting=False,
    )
    print(msg % ("WITHOUT", score_without_weights))

代码的输出是:

This is the best out-of-sample score using GridSearchCV: 0.095439.
This is the best out-of-sample score WITH weighting using grid_cv: 0.099367.
This is the best out-of-sample score WITHOUT weighting using grid_cv: 0.135692.

编辑 2:正如下面的评论所说:

the difference in my score and the sklearn score using this solution originates in the way that I was computing a weighted average of scores. If you omit the weighted average portion of the code, the two outputs match to machine precision.

关于python - sklearn GridSearchCV 不在评分函数中使用 sample_weight,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/49581104/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com