- html - 出于某种原因,IE8 对我的 Sass 文件中继承的 html5 CSS 不友好?
- JMeter 在响应断言中使用 span 标签的问题
- html - 在 :hover and :active? 上具有不同效果的 CSS 动画
- html - 相对于居中的 html 内容固定的 CSS 重复背景?
我正在尝试使用一些自定义转换器优化 scikit-learn 管道中的超参数,但我不断收到错误消息:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
class RollingMeanTransform(BaseEstimator, TransformerMixin):
    """Append a lagged rolling-mean feature computed from one column.

    Parameters
    ----------
    col : hashable
        Label of the column the rolling mean is computed from.
    window : int, default=3
        Window size handed to ``rolling``.
    """

    def __init__(self, col, window=3):
        # NOTE(review): the values are stored under underscore-prefixed names
        # that differ from the ``__init__`` parameter names, so code that
        # rebuilds the estimator from its ``__init__`` parameters cannot read
        # them back. This is the behaviour the question is about, so it is
        # left exactly as posted.
        self._window = window
        self._col = col

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``'<col>_rolling_mean'`` column added.

        The source column is shifted by one row before the rolling mean is
        taken, and missing values in the result are replaced with 0.0.
        """
        df = X.copy()
        df['{}_rolling_mean'.format(self._col)] = (
            df[self._col].shift(1).rolling(self._window).mean().fillna(0.0)
        )
        return df
class TimeEncoding(BaseEstimator, TransformerMixin):
    """Replace a column by a sine/cosine pair derived from its values.

    Parameters
    ----------
    col : hashable
        Label of the column to encode.
    drop_original : bool, default=True
        Whether the source column is removed after encoding.
    """

    def __init__(self, col, drop_original=True):
        # NOTE(review): the values are stored under underscore-prefixed names
        # that differ from the ``__init__`` parameter names, so code that
        # rebuilds the estimator from its ``__init__`` parameters cannot read
        # them back. This is the behaviour the question is about, so it is
        # left exactly as posted.
        self._col = col
        self._drop_original = drop_original

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``'sin_<col>'`` and ``'cos_<col>'`` added.

        The angle is ``2 * pi * value / n``, where ``n`` is the number of
        distinct values of the column in the frame passed to this call.
        """
        X = X.copy()
        unique_vals = float(len(X[self._col].unique()))
        X['sin_{}'.format(self._col)] = np.sin(2 * np.pi * X[self._col] / unique_vals)
        X['cos_{}'.format(self._col)] = np.cos(2 * np.pi * X[self._col] / unique_vals)
        if self._drop_original:
            # errors='ignore' keeps this a no-op when the column is absent.
            X.drop([self._col], axis=1, inplace=True, errors='ignore')
        return X
# Final estimator of the pipeline; the grid below tunes its hyper-parameters.
huber = HuberRegressor()

# Candidate values for each tuned hyper-parameter.
huber_max_iter = [100, 200, 500, 1000]
huber_alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100]
huber_epsilon = [1.15, 1.25, 1.35, 1.5]

# Keys follow the '<step name>__<parameter>' form and target the 'clf' step.
huber_grid = {
    'clf__alpha': huber_alpha,
    'clf__epsilon': huber_epsilon,
    'clf__max_iter': huber_max_iter,
}

# Feature engineering, column selection and scaling, then the regressor.
regression_pipeline = Pipeline(steps=[
    ('encoding', TimeEncoding('my_col')),
    ('mean', RollingMeanTransform('my_other_col')),
    ('select', Treshold()),
    ('scale', Scale()),
    ('clf', huber),
])
我试着用它来拟合:
# Search over every combination in huber_grid, using TimeSeriesSplit(n_splits=5) as the cross-validation splitter.
grid = GridSearchCV(regression_pipeline, huber_grid, cv=TimeSeriesSplit(n_splits=5))
# X_train and y_train are not defined in this excerpt.
grid.fit(X_train, y_train)
但我得到以下异常:
ValueError Traceback (most recent call last)
<ipython-input-14-3949096c802a> in <module>()
----> 1 grid.fit(X_train, y_train)
~/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
637 error_score=self.error_score)
638 for parameters, (train, test) in product(candidate_params,
--> 639 cv.split(X, y, groups)))
640
641 # if one choose to see train score, "out" will contain train score info
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
456 estimator.fit(X_train, **fit_params)
457 else:
--> 458 estimator.fit(X_train, y_train, **fit_params)
459
460 except Exception as e:
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
246 This estimator
247 """
--> 248 Xt, fit_params = self._fit(X, y, **fit_params)
249 if self._final_estimator is not None:
250 self._final_estimator.fit(Xt, y, **fit_params)
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
211 Xt, fitted_transformer = fit_transform_one_cached(
212 cloned_transformer, None, Xt, y,
--> 213 **fit_params_steps[name])
214 # Replace the transformer of the step with the fitted
215 # transformer. This is necessary when loading the transformer
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/memory.py in __call__(self, *args, **kwargs)
360
361 def __call__(self, *args, **kwargs):
--> 362 return self.func(*args, **kwargs)
363
364 def call_and_shelve(self, *args, **kwargs):
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
579 **fit_params):
580 if hasattr(transformer, 'fit_transform'):
--> 581 res = transformer.fit_transform(X, y, **fit_params)
582 else:
583 res = transformer.fit(X, y, **fit_params).transform(X)
~/anaconda3/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
518 else:
519 # fit method of arity 2 (supervised transformation)
--> 520 return self.fit(X, y, **fit_params).transform(X)
521
522
~/my_project/my_model.py in transform(self, X)
126 def transform(self, X):
127 X = X.copy()
--> 128 unique_vals = float(len(X[self._col].unique()))
129 X['sin_{}'.format(self._col)] = np.sin(2 * np.pi * X[self._col] / unique_vals)
130 X['cos_{}'.format(self._col)] = np.cos(2 * np.pi * X[self._col] / unique_vals)
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
2137 return self._getitem_multilevel(key)
2138 else:
-> 2139 return self._getitem_column(key)
2140
2141 def _getitem_column(self, key):
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2144 # get column
2145 if self.columns.is_unique:
-> 2146 return self._get_item_cache(key)
2147
2148 # duplicate columns & possible reduce dimensionality
~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1840 res = cache.get(item)
1841 if res is None:
-> 1842 values = self._data.get(item)
1843 res = self._box_item_values(item, values)
1844 cache[item] = res
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
3850 loc = indexer.item()
3851 else:
-> 3852 raise ValueError("cannot label index with a null key")
3853
3854 return self.iget(loc, fastpath=fastpath)
ValueError: cannot label index with a null key
我不知道发生了什么,也不知道如何解决。如果我移除这些自定义转换器,它就可以正常工作,但我的管道中需要它们。
如果我将管道更改为
# Same five steps as the earlier pipeline, with 'mean' placed ahead of 'encoding'.
regression_pipeline = Pipeline([('mean', RollingMeanTransform('my_other_col')),
('encoding', TimeEncoding('my_col')),
('select', Treshold()),
('scale', Scale()),
('clf', huber)
])
我得到了同样的错误,但这次是在调用 mean 转换器时出现的。
完整的代码示例:
from sklearn.linear_model import HuberRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
class RollingMeanTransform(BaseEstimator, TransformerMixin):
    """Append a lagged rolling-mean feature computed from one column.

    Parameters
    ----------
    col : hashable
        Label of the column the rolling mean is computed from.
    window : int, default=3
        Window size handed to ``rolling``.
    """

    def __init__(self, col, window=3):
        # NOTE(review): the values are stored under underscore-prefixed names
        # that differ from the ``__init__`` parameter names, so code that
        # rebuilds the estimator from its ``__init__`` parameters cannot read
        # them back. This is the behaviour the question is about, so it is
        # left exactly as posted.
        self._window = window
        self._col = col

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``'<col>_rolling_mean'`` column added.

        The source column is shifted by one row before the rolling mean is
        taken, and missing values in the result are replaced with 0.0.
        """
        df = X.copy()
        df['{}_rolling_mean'.format(self._col)] = (
            df[self._col].shift(1).rolling(self._window).mean().fillna(0.0)
        )
        return df
class TimeEncoding(BaseEstimator, TransformerMixin):
    """Replace a column by a sine/cosine pair derived from its values.

    Parameters
    ----------
    col : hashable
        Label of the column to encode.
    drop_original : bool, default=True
        Whether the source column is removed after encoding.
    """

    def __init__(self, col, drop_original=True):
        # NOTE(review): the values are stored under underscore-prefixed names
        # that differ from the ``__init__`` parameter names, so code that
        # rebuilds the estimator from its ``__init__`` parameters cannot read
        # them back. This is the behaviour the question is about, so it is
        # left exactly as posted.
        self._col = col
        self._drop_original = drop_original

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``'sin_<col>'`` and ``'cos_<col>'`` added.

        The angle is ``2 * pi * value / n``, where ``n`` is the number of
        distinct values of the column in the frame passed to this call.
        """
        X = X.copy()
        unique_vals = float(len(X[self._col].unique()))
        X['sin_{}'.format(self._col)] = np.sin(2 * np.pi * X[self._col] / unique_vals)
        X['cos_{}'.format(self._col)] = np.cos(2 * np.pi * X[self._col] / unique_vals)
        if self._drop_original:
            # errors='ignore' keeps this a no-op when the column is absent.
            X.drop([self._col], axis=1, inplace=True, errors='ignore')
        return X
class Treshold(BaseEstimator, TransformerMixin):
    """Drop constant-valued columns while keeping the data as a DataFrame.

    After ``fit``, ``to_keep`` lists the labels of the columns that held at
    least two distinct values and ``colname_original`` holds the columns of
    the frame that was fitted.
    """

    def __init__(self):
        self.to_keep = list()

    def fit(self, X, y=None):
        """Record which columns of ``X`` hold at least two distinct values."""
        self.to_keep = list()
        self.colname_original = X.columns
        # Materialise the underlying array once rather than once per column.
        values = X.values
        for i, col in enumerate(X):
            if len(np.unique(values[:, i])) >= 2:
                self.to_keep.append(col)
        return self

    def transform(self, X, copy=None):
        """Return ``X`` restricted to the columns selected by ``fit``.

        ``copy`` is accepted but not used.
        """
        return X[self.to_keep]
class Scale(BaseEstimator, TransformerMixin):
    """Standardise non-binary columns while keeping the data as a DataFrame.

    Columns with at most two distinct values in the fitted frame are passed
    through unchanged; all other columns go through a ``StandardScaler``.

    Parameters
    ----------
    copy, with_mean, with_std
        Forwarded to the wrapped ``StandardScaler``.
    """

    def __init__(self, copy=True, with_mean=True, with_std=True):
        # Keep every __init__ argument as a same-named attribute so that
        # get_params() / clone() can read the configuration back instead of
        # rebuilding the estimator with missing values.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Keyword arguments: the options do not depend on positional order.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.bin_vars_index = list()
        self.cont_vars_index = list()
        self.colnames_original = list()

    def fit(self, X, y=None):
        """Split the columns of ``X`` into binary/continuous and fit the scaler."""
        # Re-sync the wrapped scaler in case set_params() changed the options
        # after construction.
        self.scaler.set_params(
            copy=self.copy, with_mean=self.with_mean, with_std=self.with_std
        )
        self.bin_vars_index = list()
        self.cont_vars_index = list()
        self.colnames_original = X.columns
        # Materialise the underlying array once rather than once per column.
        values = X.values
        for i in range(values.shape[1]):
            if len(np.unique(values[:, i])) <= 2:
                self.bin_vars_index.append(i)
            else:
                self.cont_vars_index.append(i)
        self.scaler.fit(values[:, self.cont_vars_index])
        return self

    def transform(self, X, copy=None):
        """Return ``X`` with its continuous columns scaled.

        The result is a DataFrame with the fitted column order and the index
        of ``X``.
        """
        values = X.values
        # ``copy`` is passed by keyword so it cannot land in another
        # positional slot of ``StandardScaler.transform``.
        X_tail = self.scaler.transform(values[:, self.cont_vars_index], copy=copy)
        res = np.concatenate((values[:, self.bin_vars_index], X_tail), axis=1)
        # Column labels in the order of the concatenated array: binary, then continuous.
        colnames_res = np.array(
            list(self.colnames_original[self.bin_vars_index])
            + list(self.colnames_original[self.cont_vars_index])
        )
        assert len(colnames_res) == len(self.colnames_original)
        res = pd.DataFrame(data=res, columns=colnames_res)
        # Restore the fitted column order and the caller's index.
        return res[[str(el) for el in self.colnames_original]].set_index(X.index)
# Final estimator of the pipeline; the grid below tunes its hyper-parameters.
huber = HuberRegressor()

# Candidate values for each tuned hyper-parameter.
huber_max_iter = [100, 200, 500, 1000]
huber_alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100]
huber_epsilon = [1.15, 1.25, 1.35, 1.5]

# Keys follow the '<step name>__<parameter>' form and target the 'clf' step.
huber_grid = {
    'clf__alpha': huber_alpha,
    'clf__epsilon': huber_epsilon,
    'clf__max_iter': huber_max_iter,
}

# Feature engineering, column selection and scaling, then the regressor.
regression_pipeline = Pipeline(steps=[
    ('encoding', TimeEncoding('my_col')),
    ('mean', RollingMeanTransform('my_other_col')),
    ('select', Treshold()),
    ('scale', Scale()),
    ('clf', huber),
])

# Random integer data: 20 rows, two feature columns and one target.
X = pd.DataFrame(
    np.random.randint(low=0, high=10, size=(20, 2)),
    columns=['my_col', 'my_other_col'],
)
y = pd.Series(np.random.randint(low=0, high=10, size=(20,)))

grid = GridSearchCV(
    estimator=regression_pipeline,
    param_grid=huber_grid,
    cv=TimeSeriesSplit(n_splits=5),
)
grid.fit(X, y)
最佳答案
GridSearchCV(以及 scikit-learn 中的大多数交叉验证工具)会克隆您提供的估计器来执行网格搜索。
在克隆时,它们会使用您所继承的 BaseEstimator 类中的 get_params() 和 set_params() 方法。而 get_params() 会从您声明的 __init__() 方法的签名中获取参数名:
init_signature = signature(init)
# Consider the constructor parameters excluding 'self'
parameters = [p for p in init_signature.parameters.values()
if p.name != 'self' and p.kind != p.VAR_KEYWORD]
...
...
接下来,要获取这些参数的值,使用的是[这段代码](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/base.py#L228):
for key in self._get_param_names():
value = getattr(self, key, None)
所以这将给出的参数是:
col = None
drop_original = None
而不是您所使用的带有前导下划线的那些属性。两者的值都是 None,因为您的对象上没有任何以这些名称命名的属性。
接下来,这些值为 None 的参数会在 clone() 中被用来实例化克隆对象:
...
new_object = klass(**new_object_params)
...
...
然后这些 None 值会被赋给您的 _col 和 _drop_original。这就是错误的根源。
这一点已记录在 scikit-learn 的开发者指南(developer guidelines)中:
The arguments accepted by __init__ should all be keyword arguments with a default value. In other words, a user should be able to instantiate an estimator without passing any arguments to it. The arguments should all correspond to hyperparameters describing the model or the optimisation problem the estimator tries to solve.
In addition, every keyword argument accepted by __init__ should correspond to an attribute on the instance. Scikit-learn relies on this to find the relevant attributes to set on an estimator when doing model selection.
因此,建议的解决方法是去掉属性名称中的前导下划线(使 __init__ 的参数名与 self 上的属性名保持一致):
class TimeEncoding(BaseEstimator, TransformerMixin):
    """Replace a column by a sine/cosine pair derived from its values.

    Parameters
    ----------
    col : hashable
        Label of the column to encode.
    drop_original : bool, default=True
        Whether the source column is removed after encoding.
    """

    # Changed the names from _col to col
    def __init__(self, col, drop_original=True):
        # Each __init__ argument is kept under an attribute of the same name
        # so that get_params() / clone() can read it back.
        self.col = col
        self.drop_original = drop_original

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``.

        Defined so that ``fit_transform`` and ``Pipeline.fit`` can call it.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``'sin_<col>'`` and ``'cos_<col>'`` added.

        The angle is ``2 * pi * value / n``, where ``n`` is the number of
        distinct values of the column in the frame passed to this call.
        """
        X = X.copy()
        # Updated the names to be used
        # NOTE(review): ``n`` is recomputed from whatever frame is being
        # transformed, so it can differ between calls - confirm this is
        # intended.
        unique_vals = float(len(X[self.col].unique()))
        X['sin_{}'.format(self.col)] = np.sin(2 * np.pi * X[self.col] / unique_vals)
        X['cos_{}'.format(self.col)] = np.cos(2 * np.pi * X[self.col] / unique_vals)
        if self.drop_original:
            # errors='ignore' keeps this a no-op when the column is absent.
            X.drop([self.col], axis=1, inplace=True, errors='ignore')
        return X
现在对所有自定义估算器执行此操作。
如果您出于某些限制必须使用带前导下划线的属性名(例如想把它们设为私有属性),那么第二个选择是覆盖 set_params() 方法,以显式设置这些参数。
关于python - 自定义转换器和 GridSearch - 管道中的 ValueError,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/50523930/
我正在尝试并行运行具有循环返回值的函数。但它似乎停留在 results = pool.map(algorithm_file.foo, population) 在 for 循环的第二次迭代中 r
Serving Flask 应用程序“服务器”(延迟加载) 环境:生产警告:这是一个开发服务器。不要在生产部署中使用它。请改用生产 WSGI 服务器。 Debug模式:开启 在 http://0.0.
我使用“product.pricelist”模型中的 get_product_price_rule() 函数。我的代码是: price = self._get_display_price(produ
我收到以下错误: Traceback (most recent call last): File "/home/odroid/trackAndFollow/getPositions.py", line
我正在尝试采用机器学习方法,但遇到了一些问题。这是我的代码: import sys import scipy import numpy import matplotlib import pandas
我尝试使用 tensorflow 1.4.0 对我的原始记录进行分类。过程如下。 拳头:读取图片和标签,输出“tfrecord”格式的文件。第二:读取tf记录和训练 编写tfrecord脚本是 !/u
我是新手,所以需要任何帮助,当我要求一个例子时,我的教授给我了这段代码,我希望有一个工作模型...... from numpy import loadtxt import numpy as np fr
我无法弄清楚为什么会出现此 ValueError...为了提供一些上下文,我正在使用 requests、BeautifulSoup 和 json 与 python 来抓取站点 json 数据。 我不确
我已经尝试使用这两个循环以及列表理解。即使我正在尝试将数字转换为列表中的整型,两者都无法解析整数。
我已经尝试使用这两个循环以及列表理解。即使我正在尝试将数字转换为列表中的整型,两者都无法解析整数。
我只有四个星期的 Python 经验。使用 Tkinter 创建一个工具,将新的公司 Logo 粘贴到现有图像之上。 下面的方法是获取给定目录中的所有图像并将新 Logo 粘贴到初始级别。现有图像、编
我只有四个星期的 Python 经验。使用 Tkinter 创建一个工具,将新的公司 Logo 粘贴到现有图像之上。 下面的方法是获取给定目录中的所有图像并将新 Logo 粘贴到初始级别。现有图像、编
我在尝试在 Keras 2.0.8、Python 3.6.1 和 Tensorflow 后端中训练模型时遇到问题。 错误消息: ValueError: Error when checking targ
我已经尝试使用这两个循环以及列表理解。即使我正在尝试将数字转换为列表中的整型,两者都无法解析整数。
我有这段代码: while True: try: start = int(input("Starting number: ")) fin = int(i
我是 python 的初学者编码员,试图制作一个“模具滚筒”,您可以在其中选择模具的大小,它在我的代码的第 20 行返回此错误 import sys import random import geto
我有以下代码: import fxcmpy import pandas as pd from pandas import datetime from pandas import DataFrame a
我正在尝试使用 django 和 python 制作一个博客应用程序。我也在尝试使用 s3 存储桶进行存储,使用 heroku 进行部署。我正在学习 coreymschafer 的在线教程。我正在按照
我创建了一个 numpy 数组(考虑输入数据)并想更改顺序(一些数值运算后的输出数据)。在使用转换后的数组时,我遇到错误并找到了根本原因。请在下面找到详细信息并使用 numpy 版本 1.19.1 i
我已经引用了之前的查询 All arguments should have the same length plotly但仍然没有得到我的问题的答案。 我有一个黄金价格数据集。 Date
我是一名优秀的程序员,十分优秀!