
python - Plotting ROC curves for a multiclass problem


I am trying to apply sklearn's ROC extension to multiclass to my dataset. Each of my per-class ROC curves looks like a straight line, unlike the sklearn example, whose curves fluctuate.

Here is an MWE to illustrate what I mean:

# all imports
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# dummy dataset
X, y = make_classification(10000, n_classes=5, n_informative=10, weights=[.04, .4, .12, .5, .04])
train, test, ytrain, ytest = train_test_split(X, y, test_size=.3, random_state=42)

# random forest model
model = RandomForestClassifier()
model.fit(train, ytrain)
yhat = model.predict(test)

The following function then plots the ROC curves:

def plot_roc_curve(y_test, y_pred):

    n_classes = len(np.unique(y_test))
    y_test = label_binarize(y_test, classes=np.arange(n_classes))
    y_pred = label_binarize(y_pred, classes=np.arange(n_classes))

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    #plt.figure(figsize=(10,5))
    plt.figure(dpi=600)
    lw = 2
    plt.plot(fpr["micro"], tpr["micro"],
             label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
             color="deeppink", linestyle=":", linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
             color="navy", linestyle=":", linewidth=4)

    colors = cycle(["aqua", "darkorange", "darkgreen", "yellow", "blue"])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], "k--", lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) curve")
    plt.legend()

Output:

plot_roc_curve(ytest, yhat)

[Output plot: each per-class ROC curve appears as a straight line]

Each of those straight lines bends only once. I would like to see the model's performance across different thresholds, not just one, similar to sklearn's illustration for 3 classes shown below:

[sklearn's example plot: 3-class ROC curves evaluated at many thresholds]

Best Answer

  • The point is that you are using predict() rather than predict_proba()/decision_function() to define your y_hat. This means that, since the threshold vector is defined by the number of distinct values in y_hat (see here for reference), you only have a few thresholds on which tpr and fpr are computed, which in turn means your curves are only evaluated at a few points (a quick check of this is sketched right after this list).

  • Indeed, consider what the doc says about the y_score to be passed to roc_curve(): either probability estimates or decision values. In sklearn's example, decision values are used to compute the scores. Given that you are considering a RandomForestClassifier(), probability estimates are the way to go for your y_hat.

  • What is the point of label-binarizing the output? The standard definition of ROC is in terms of binary classification. To move to a multiclass problem, you have to convert it into a binary one via the OneVsAll approach, so that you end up with n_classes ROC curves. (Indeed, since SVC() handles multiclass problems in an OvO fashion by default, in the example they had to force OvA by applying the OneVsRestClassifier constructor; with RandomForestClassifier you do not have this problem, since it is inherently multiclass, see here for reference.) In these terms, once you switch to predict_proba(), you will see that there is not much sense in label-binarizing the predictions.
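As a quick check of the first point (a minimal sketch, reusing the model, test and ytest objects from the MWE above and the 5 classes of the dummy dataset), you can compare how many thresholds roc_curve() actually evaluates for class 0 when it is fed hard predictions versus probability estimates:

    import numpy as np
    from sklearn.metrics import roc_curve
    from sklearn.preprocessing import label_binarize

    ytest_bin = label_binarize(ytest, classes=np.arange(5))

    # Hard labels: the score column only contains 0/1, so roc_curve() has
    # very few distinct values to use as thresholds
    yhat_labels = label_binarize(model.predict(test), classes=np.arange(5))
    _, _, thr_labels = roc_curve(ytest_bin[:, 0], yhat_labels[:, 0])

    # Probability estimates: roughly one threshold per distinct score
    yhat_proba = model.predict_proba(test)
    _, _, thr_proba = roc_curve(ytest_bin[:, 0], yhat_proba[:, 0])

    print(len(thr_labels), len(thr_proba))   # typically ~3 vs. hundreds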

     # all imports
    import numpy as np
    import matplotlib.pyplot as plt
    from itertools import cycle
    from sklearn import svm, datasets
    from sklearn.metrics import roc_curve, auc
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import label_binarize
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    # dummy dataset
    X, y = make_classification(10000, n_classes=5, n_informative=10, weights=[.04, .4, .12, .5, .04])
    train, test, ytrain, ytest = train_test_split(X, y, test_size=.3, random_state=42)

    # random forest model
    model = RandomForestClassifier()
    model.fit(train, ytrain)
    yhat = model.predict_proba(test)

    def plot_roc_curve(y_test, y_pred):
        n_classes = len(np.unique(y_test))
        y_test = label_binarize(y_test, classes=np.arange(n_classes))

        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        thresholds = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], thresholds[i] = roc_curve(y_test[:, i], y_pred[:, i], drop_intermediate=False)
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

        # Then interpolate all ROC curves at these points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

        # Finally average it and compute AUC
        mean_tpr /= n_classes

        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        # Plot all ROC curves
        #plt.figure(figsize=(10,5))
        plt.figure(dpi=600)
        lw = 2
        plt.plot(fpr["micro"], tpr["micro"],
                 label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
                 color="deeppink", linestyle=":", linewidth=4)

        plt.plot(fpr["macro"], tpr["macro"],
                 label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
                 color="navy", linestyle=":", linewidth=4)

        colors = cycle(["aqua", "darkorange", "darkgreen", "yellow", "blue"])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                     label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]))

        plt.plot([0, 1], [0, 1], "k--", lw=lw)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("Receiver Operating Characteristic (ROC) curve")
        plt.legend()
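With yhat now holding probability estimates of shape (n_samples, n_classes), the call is the same as in the question (a usage sketch, assuming the block above has been run):

    # each column of yhat feeds one per-class ROC curve
    plot_roc_curve(ytest, yhat)
    plt.show()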

Finally, note that roc_curve() also has a drop_intermediate parameter, which is meant for dropping suboptimal thresholds (it might be useful to know about); its effect is sketched below.
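For example (a minimal sketch on class 0, again assuming the ytest and the probability-based yhat from the code above), you can compare how many thresholds are kept with and without dropping the intermediate ones:

    ytest_bin = label_binarize(ytest, classes=np.arange(5))

    # Default behaviour: suboptimal thresholds that would not appear on a
    # plotted ROC curve are dropped
    _, _, thr_dropped = roc_curve(ytest_bin[:, 0], yhat[:, 0])

    # Keep every threshold (one per distinct score, plus an extra starting point)
    _, _, thr_all = roc_curve(ytest_bin[:, 0], yhat[:, 0], drop_intermediate=False)

    print(len(thr_dropped), len(thr_all))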

Regarding python - plotting ROC curves for a multiclass problem, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/70278059/
