gpt4 book ai didi

python - 自定义 scikit 编码器抛出转换错误

转载 作者:行者123 更新时间:2023-11-30 09:17:32 27 4
gpt4 key购买 nike

我正在改编一些在线代码来创建我自己的 scikit-learn One Hot Encoder 版本。自定义类为我做了一些事情,主要是它允许设置一个阈值,低于该阈值,分类变量的罕见级别将被转储到“其他”类中。我可以正确适应,但当我尝试转换时,出现转换错误,就好像在 OHE 之前运行的嵌入式 LabelEncoder 无法正常工作一样。

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
import pandas as pd
import numpy as np

#my custom OHE-er
class CustomPandasTransformer(BaseEstimator, TransformerMixin):
def _validate_input(self, X):
if not isinstance(X, pd.DataFrame):
raise TypeError('X must be a DataFrame, but gor type=%s' % type(x))
return X

@staticmethod
def _validate_columns(X, cols):
scols = set(X.columns)
if not all(c in scols for c in cols):
raise ValueError("all columns must be present in X")

class DummyEncoder(CustomPandasTransformer):
def __init__(self, columns, sep='_', drop_one_level=True, tmp_nan_rep='MISSING',other_treshold=20):
self.columns=columns
self.sep = sep
self.drop_one_level = drop_one_level
self.tmp_nan_rep = tmp_nan_rep
self.other_treshold = other_treshold


def fit(self, X, y=None):
X = self._validate_input(X).copy()

tmp_nan = self.tmp_nan_rep
oth_thr = self.other_treshold

cols = self.columns
self._validate_columns(X, cols)

lab_encoders = {}
for col in cols:
#group low freq levels into a 'OTHER' level
tmp_vc = X[col].value_counts()
high_volume_levels = list(tmp_vc[tmp_vc>oth_thr].index)
vec = [v if v in high_volume_levels else 'OTHER' for v in X[col].tolist()]

vec = [tmp_nan if pd.isnull(v) else v for v in vec]
svec = list(set(vec))
if tmp_nan not in svec:
svec.append(tmp_nan)

le = LabelEncoder()
lab_encoders[col] = le.fit(svec)

X[col] = le.transform(vec)

ohe_set = X[cols]
ohe_nan_row = {c: lab_encoders[c].transform([tmp_nan])[0] for c in cols}
ohe_set = ohe_set.append(ohe_nan_row, ignore_index=True)
ohe = OneHotEncoder(sparse=False).fit(ohe_set)

self.ohe_ = ohe
self.le_ = lab_encoders
self.cols_ = cols

return self

def transform(self, X):
check_is_fitted(self, 'ohe_')
X = self._validate_input(X).copy()

ohe = self.ohe_
lenc = self.le_
cols = self.cols_
tmp_nan = self.tmp_nan_rep
sep = self.sep
drop = self.drop_one_level

self._validate_columns(X, cols)
col_order = []
drops = []

for col in cols:
le = lenc[col]

vec = [v if v in list(le.classes_) else 'OTHER' for v in X[col].tolist()]
vec = [tmp_nan if pd.isnull(v) else v for v in vec]

vec_trans = le.transform(vec)
X[col] = vec_trans

le_clz = le.classes_.tolist()
classes = ['%s%s%s' % (col,sep,clz) for clz in le_clz]
col_order.extend(classes)

if drop and len(le_clz)>1:
drops.append(classes[-1])

ohe_trans = pd.DataFrame.from_records(data=ohe.transform(X[cols]),
columns = col_order)

ohe_trans.index=X.index

if drops:
ohe_trans = ohe_trans.drop(drops, axis=1)

X = X.drop(cols, axis=1)

X = pd.concat([X, ohe_trans], axis=1)
return X

#the data
dicpd = {
u'BILL_CLASS': {0: np.nan, 1: np.nan},
u'CL_SUB_TYPE': {0: 'M', 1: 'M'},
u'COB_TYPE': {0: np.nan, 1: np.nan},
u'DUP_BILL_CLASS': {0: np.nan, 1: np.nan},
u'DUP_CL_SUB_TYPE': {0: 'M', 1: 'M'},
u'DUP_COB_TYPE': {0: np.nan, 1: np.nan},
u'DUP_DX_CD': {0: 'M9901', 1: 'Z0100'},
u'DUP_FAC_TYPE': {0: np.nan, 1: np.nan},
u'DUP_FREQUENCY': {0: np.nan, 1: np.nan},
u'DUP_LOB_ID': {0: 'PBC1', 1: 'PBC1'},
u'DUP_MOD1': {0: np.nan, 1: np.nan},
u'DUP_MOD2': {0: np.nan, 1: np.nan},
u'DUP_MOD3': {0: np.nan, 1: np.nan},
u'DUP_MOD4': {0: np.nan, 1: np.nan},
u'DUP_POS_CD': {0: '11', 1: '11'},
u'DUP_PROC_CD': {0: '98941', 1: 'V2020'},
u'DUP_REV_CD': {0: np.nan, 1: np.nan},
u'DX_CD': {0: 'M9901', 1: 'Z0100'},
u'FAC_TYPE': {0: np.nan, 1: np.nan},
u'FREQUENCY': {0: np.nan, 1: np.nan},
u'MBR_AGE': {0: 48, 1: 56},
u'MOD1': {0: '59', 1: np.nan},
u'MOD2': {0: np.nan, 1: np.nan},
u'MOD3': {0: np.nan, 1: np.nan},
u'MOD4': {0: np.nan, 1: np.nan},
u'POS_CD': {0: '11', 1: '11'},
u'PROC_CD': {0: '97140', 1: 'V2781'},
u'REV_CD': {0: np.nan, 1: np.nan},
u'RULE_1': {0: 0, 1: 0},
u'RULE_3': {0: 1, 1: 1},
u'RULE_4': {0: 1, 1: 1},
u'RULE_5': {0: 0, 1: 0},
u'RULE_6': {0: 1, 1: 1},
'SAME_DX': {0: 1, 1: 1},
'SAME_POS_CD': {0: 1, 1: 1},
'SAME_PROC': {0: 0, 1: 0},
'SAME_PROV': {0: 1, 1: 1},
'SAME_REV': {0: 0, 1: 0},
'SAME_TOT': {0: 3, 1: 3},
u'SYSTEM_GEN_DRG': {0: np.nan, 1: np.nan}}

#read into pandas
df1 = pd.DataFrame(dicpd)

#get object types
categorical_features = list(df1.select_dtypes(include=['object']).columns)

de = DummyEncoder(columns = categorical_features,other_treshold=10,drop_one_level=False)

de.fit(df1)

de.transform(df1)


ValueErrorTraceback (most recent call last)
<ipython-input-5-3a72a3fa8104> in <module>()
161 de.fit(df1)
162
--> 163 de.transform(df1)

<ipython-input-5-3a72a3fa8104> in transform(self, X)
95 drops.append(classes[-1])
96
---> 97 ohe_trans = pd.DataFrame.from_records(data=ohe.transform(X[cols]),
98 columns = col_order)
99

/data/dataiku-dss-4.2.3/python.packages/sklearn/preprocessing/data.pyc in transform(self, X)
2073 """
2074 return _transform_selected(X, self._transform,
-> 2075 self.categorical_features, copy=True)
2076
2077

/data/dataiku-dss-4.2.3/python.packages/sklearn/preprocessing/data.pyc in _transform_selected(X, transform, selected, copy)
1807 X : array or sparse matrix, shape=(n_samples, n_features_new)
1808 """
-> 1809 X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)
1810
1811 if isinstance(selected, six.string_types) and selected == "all":

/data/dataiku-dss-4.2.3/python.packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
431 force_all_finite)
432 else:
--> 433 array = np.array(array, dtype=dtype, order=order, copy=copy)
434
435 if ensure_2d:

ValueError: could not convert string to float: V2781

最佳答案

我发现了错误。整个部分

ohe_trans = pd.DataFrame.from_records(data=ohe.transform(X[cols]),
columns = col_order)

ohe_trans.index=X.index

if drops:
ohe_trans = ohe_trans.drop(drops, axis=1)

X = X.drop(cols, axis=1)

在列循环内。需要在上一个循环完成后运行。

关于python - 自定义 scikit 编码器抛出转换错误,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/51295392/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com