gpt4 book ai didi

r - 教程中的错误(使用 iml 包解释机器学习模型)

转载 作者:行者123 更新时间:2023-12-05 03:21:34 24 4
gpt4 key购买 nike

在执行教程(https://uc-r.github.io/iml-pkg)“复现要求”一节中的以下代码时,出现了如下错误:

# classification data
# NOTE: this is the tutorial's original code, kept as-is to reproduce the
# error quoted right after it (`attrition` is not exported by rsample).
df <- rsample::attrition %>%
mutate_if(is.ordered, factor, ordered = FALSE) %>%
mutate(Attrition = recode(Attrition, "Yes" = "1", "No" = "0") %>% factor(levels = c("1", "0")))

> Error: 'attrition' is not an exported object from 'namespace:rsample'

使用以下代码解决了问题:

# data: load `attrition` from the modeldata package
library(modeldata)
data("attrition", package = "modeldata")

# classification data: turn ordered factors into plain factors, then recode
# the response to "1"/"0" with "1" as the first factor level
df <- attrition %>%
  mutate_if(is.ordered, factor, ordered = FALSE) %>%
  mutate(
    Attrition = factor(
      recode(Attrition, "Yes" = "1", "No" = "0"),
      levels = c("1", "0")
    )
  )

不幸的是,在执行“全局解释/特征重要性”一节(https://uc-r.github.io/iml-pkg)中的以下代码时,我又遇到了另一个错误:

# compute feature importance with specified loss metric
# NOTE: each of these three calls raised the "undefined columns selected"
# error quoted right after this snippet.
imp.glm <- FeatureImp$new(predictor.glm, loss = "mse")
imp.rf <- FeatureImp$new(predictor.rf, loss = "mse")
imp.gbm <- FeatureImp$new(predictor.gbm, loss = "mse")

> Error in [.data.frame(prediction, , self$class, drop = FALSE) : undefined columns selected

> Error in [.data.frame(prediction, , self$class, drop = FALSE) : undefined columns selected

> Error in [.data.frame(prediction, , self$class, drop = FALSE) : undefined columns selected

我用的是 R 4.2.0/Win10

最佳答案

教程中给出的参数需要稍作修改:不要使用 class = "classification",而应改为 class = 2(根据文档说明),这样示例即可按预期运行:

library(rsample)   # data splitting
library(ggplot2) # allows extension of visualizations
library(dplyr) # basic data transformation
library(h2o) # machine learning modeling
#install.packages("iml")
library(iml) # ML interpretation
#install.packages("modeldata")
library(modeldata) # source of the `attrition` data used below
library(R6)

# turn off H2O progress output (per the function name), then connect to the
# H2O cluster; the connection details are printed right after this call
h2o.no_progress()
h2o.init()
#> Connection successful!
#>
#> R is connected to the H2O cluster:
#> H2O cluster uptime: 9 minutes 18 seconds
#> H2O cluster timezone: Australia/Melbourne
#> H2O data parsing timezone: UTC
#> H2O cluster version: 3.36.0.1
#> H2O cluster version age: 6 months and 28 days !!!
#> H2O cluster name: H2O_started_from_R_jared_mpb432
#> H2O cluster total nodes: 1
#> H2O cluster total memory: 1.58 GB
#> H2O cluster total cores: 4
#> H2O cluster allowed cores: 4
#> H2O cluster healthy: TRUE
#> H2O Connection ip: localhost
#> H2O Connection port: 54321
#> H2O Connection proxy: NA
#> H2O Internal Security: FALSE
#> H2O API Extensions: Amazon S3, XGBoost, Algos, Infogram, AutoML, Core V3, TargetEncoder, Core V4
#> R Version: R version 4.1.3 (2022-03-10)

# Build the modelling data frame from modeldata::attrition:
#   * ordered factors become plain (unordered) factors
#   * Attrition is recoded "Yes" -> "1", "No" -> "0", with "1" as first level
df <- modeldata::attrition %>%
  mutate_if(is.ordered, factor, ordered = FALSE) %>%
  mutate(
    Attrition = factor(
      recode(Attrition, "Yes" = "1", "No" = "0"),
      levels = c("1", "0")
    )
  )

# convert the data frame to an h2o object
df.h2o <- as.h2o(df)

# split into three frames (ratios 0.7 and 0.15, the rest goes to the third
# frame) and label them train / valid / test
set.seed(123)
splits <- h2o.splitFrame(
  df.h2o,
  ratios = c(0.7, 0.15),
  destination_frames = c("train", "valid", "test")
)
names(splits) <- c("train", "valid", "test")

# variable names for response & features
y <- "Attrition"
x <- setdiff(names(df), y)

# elastic net model: binomial GLM fitted on the training frame, with the
# validation frame supplied for validation metrics
glm <- h2o.glm(
  x = x,
  y = y,
  training_frame = splits[["train"]],
  validation_frame = splits[["valid"]],
  family = "binomial",
  seed = 123
)

# random forest model: up to 1000 trees, with early stopping on AUC
# (10 stopping rounds, tolerance 0.005)
rf <- h2o.randomForest(
  x = x,
  y = y,
  training_frame = splits[["train"]],
  validation_frame = splits[["valid"]],
  ntrees = 1000,
  stopping_metric = "AUC",
  stopping_rounds = 10,
  stopping_tolerance = 0.005,
  seed = 123
)
#> Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.

# gradient boosting machine model: up to 1000 trees, with early stopping on
# AUC (10 stopping rounds, tolerance 0.005)
gbm <- h2o.gbm(
  x = x,
  y = y,
  training_frame = splits[["train"]],
  validation_frame = splits[["valid"]],
  ntrees = 1000,
  stopping_metric = "AUC",
  stopping_rounds = 10,
  stopping_tolerance = 0.005,
  seed = 123
)
#> Warning in .h2o.processResponseWarnings(res): early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!.

# model performance: AUC of each model, requested with valid = TRUE
# (each call is followed by its recorded output)
h2o.auc(glm, valid = TRUE)
#> [1] 0.7870935
## [1] 0.7870935
h2o.auc(rf, valid = TRUE)
#> [1] 0.7681021
## [1] 0.7681021
h2o.auc(gbm, valid = TRUE)
#> [1] 0.7468242
## [1] 0.7468242

# 1. Create a data frame with just the features (validation frame without
#    the Attrition column)
features <- splits$valid %>%
  as.data.frame() %>%
  select(-Attrition)

# 2. Create a vector with the actual responses
response <- splits$valid$Attrition %>%
  as.numeric() %>%
  as.vector()

# 3. Custom predict function: scores `newdata` with an H2O model and returns
# the third column of the resulting prediction frame as a plain vector.
#   model   - a fitted H2O model
#   newdata - data to score; it is converted with as.h2o() before predicting
pred <- function(model, newdata) {
  scored <- h2o.predict(model, as.h2o(newdata))
  as.data.frame(scored)[[3L]]
}

# example of prediction output
pred(glm, features) %>% head()
#> [1] 0.12243347 0.12887908 0.09674399 0.26008143 0.00672000 0.13741387

# The tutorial's original call, kept unchanged to reproduce the error:
# `class` is set to "classification".
predictor.glm <- Predictor$new(
model = glm,
data = features,
y = response,
predict.fun = pred,
class = "classification"
)
predictor.glm$predict(features[1:10,])
#> Error in `[.data.frame`(prediction, , self$class, drop = FALSE): undefined columns selected
# class = "classification" doesn't make sense here; the docs describe `class` as:
### The class column to be returned in case of multiclass output.
### You can either use numbers, e.g. class=2 would take the 2nd column
### from the predictions, or the column name of the predicted class,
### e.g. class="dog".
# so, in this case, 'class = 2' should work as expected

# Predictor for the GLM; `class = 2` selects the 2nd prediction column
predictor.glm <- Predictor$new(
  model = glm,
  data = features,
  y = response,
  predict.function = pred,
  class = 2
)
# predictions for the first ten rows of the feature data
predictor.glm$predict(features[seq_len(10), ])
#> p1
#> 1 0.12243347
#> 2 0.12887908
#> 3 0.09674399
#> 4 0.26008143
#> 5 0.00672000
#> 6 0.13741387
#> 7 0.47917917
#> 8 0.11775822
#> 9 0.11316964
#> 10 0.22963757

# Predictor for the random forest; `class = 2` selects the 2nd prediction
# column. The argument name `predict.function` is written out in full
# (matching the GLM predictor) instead of relying on partial matching of
# `predict.fun`.
predictor.rf <- Predictor$new(
  model = rf,
  data = features,
  y = response,
  predict.function = pred,
  class = 2
)

# Predictor for the GBM; `class = 2` selects the 2nd prediction column.
# The argument name `predict.function` is written out in full (matching the
# GLM predictor) instead of relying on partial matching of `predict.fun`.
predictor.gbm <- Predictor$new(
  model = gbm,
  data = features,
  y = response,
  predict.function = pred,
  class = 2
)

# feature importance for each predictor, with "mse" as the loss
imp.glm <- FeatureImp$new(
  predictor.glm,
  loss = "mse"
)
imp.rf <- FeatureImp$new(
  predictor.rf,
  loss = "mse"
)
imp.gbm <- FeatureImp$new(
  predictor.gbm,
  loss = "mse"
)

# one titled importance plot per model
p1 <- plot(imp.glm) +
  ggtitle("GLM")
p2 <- plot(imp.rf) +
  ggtitle("RF")
p3 <- plot(imp.gbm) +
  ggtitle("GBM")

# the plots are shown one at a time; the side-by-side layout is left disabled
#gridExtra::grid.arrange(p1, p2, p3, nrow = 1)
p1

p2

p3

由 reprex 包(v2.0.1)创建于 2022-07-28

关于r - 教程中的错误(使用 iml 包解释机器学习模型),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/72930868/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com