
r - xgboost Poisson regression: label must be nonnegative


I am using a Windows 10 laptop with R and xgboost version 0.6-4. I get a strange error when running the following code.

xgb_params <- list("objective" = "count:poisson",
                   "eval_metric" = "rmse")
regression <- xgboost(data = training_fold,
                      label = y_training_fold,
                      nrounds = 10,
                      params = xgb_params)

Error in xgb.iter.update(bst$handle, dtrain, iteration - 1, obj) :
amalgamation/../src/objective/regression_obj.cc:190: Check failed:
label_correct PoissonRegression: label must be nonnegative

But when I look at the summary of the label, it reports:

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
 0.1129  0.3387  0.7000  1.0987  1.5265  4.5405     287
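For reference, that summary and a couple of related checks can be produced along these lines (a minimal sketch, assuming y_training_fold is the numeric label vector passed to xgboost above):

summary(y_training_fold)                # distribution of the labels, including the NA count
sum(is.na(y_training_fold))             # number of missing labels
any(y_training_fold < 0, na.rm = TRUE)  # TRUE if any label is negative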

How can I fix this? I tried removing the NAs, but that did not help.

Thanks in advance!

Edit

Here is a sample of the training data:

dput(droplevels(head(train[, c(1,2,4,5,6,8,9,10,11)], 20)))

structure(list(VacancyId = structure(c(1L, 1L, 1L, 1L, 2L, 2L,
3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L), .Label = c("55288","56838", "57822", "57902", "57925", "58008"), class = "factor"),
VacancyBankId = c(2L, 1609L, 1611L, 147L, 17L, 1611L, 2L,
257L, 1611L, 2L, 147L, 17L, 1611L, 239L, 1609L, 2L, 1609L,
2L, 2L, 1609L), FunctionId = c(36L, 36L, 36L, 36L, 35L, 35L,
3L, 4L, 4L, 4L, 4L, 9L, 9L, 9L, 3L, 3L, 3L, 3L, 3L, 3L),
EducationLevel = c(6L, 6L, 6L, 6L, 6L, 6L, 4L, 6L, 6L, 6L,
6L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 6L), ProvinceId = c(22L,
22L, 22L, 22L, 24L, 24L, 19L, 16L, 16L, 16L, 16L, 19L, 19L,
19L, 21L, 21L, 16L, 16L, 22L, 22L), CandidatesCount = c(126L,
27L, 18L, 12L, 1L, 4L, 2L, 6L, 7L, 7L, 1L, 8L, 15L, 13L,
7L, 7L, 7L, 7L, 7L, 7L), DurationDays = c(62L, 62L, 62L,
62L, 18L, 18L, 43L, 61L, 61L, 61L, 61L, 60L, 60L, 60L, 62L,
62L, 62L, 62L, 62L, 62L), DurationWeeks = c(8.857142857,
8.857142857, 8.857142857, 8.857142857, 2.571428571, 2.571428571,
6.142857143, 8.714285714, 8.714285714, 8.714285714, 8.714285714,
8.571428571, 8.571428571, 8.571428571, 8.857142857, 8.857142857,
8.857142857, 8.857142857, 8.857142857, 8.857142857), CandidatesPerWeek = c(NA,
3.048387097, 2.032258065, 1.35483871, 0.388888889, 1.555555556,
0.325581395, 0.68852459, 0.803278689, 0.803278689, 0.114754098,
0.933333333, 1.75, 1.516666667, 0.790322581, 0.790322581,
0.790322581, 0.790322581, 0.790322581, 0.790322581)), .Names = c("VacancyId", "VacancyBankId", "FunctionId", "EducationLevel", "ProvinceId", "CandidatesCount", "DurationDays", "DurationWeeks", "CandidatesPerWeek"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 26L, 27L, 28L, 29L, 30L, 31L), class = "data.frame")

I want to predict the candidates per week using FunctionId, EducationLevel, ProvinceId and VacancyBankId. So y_training_fold is CandidatesPerWeek, and training_fold contains the function, education level, province and vacancy bank IDs.
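For illustration, constructing those two objects from the sample above might look like the following sketch (the column names come from the data; the exact selection is an assumption, not the code actually used):

predictors <- c("VacancyBankId", "FunctionId", "EducationLevel", "ProvinceId")
train_sub  <- na.omit(train[, c(predictors, "CandidatesPerWeek")])  # drop rows with missing values

training_fold   <- as.matrix(train_sub[, predictors])   # predictor matrix
y_training_fold <- train_sub$CandidatesPerWeek           # response vector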

I hope someone can help me!

Best Answer

The problem with your dataset is not that y_training_fold contains negative values, but that it contains non-integer values.
See the following simulation with a non-integer y_training_fold vector:

library(xgboost)

training_fold <- matrix(rnorm(1000),nrow=100)
y_training_fold <- matrix(rnorm(100),ncol=1)

xgb_params <- list("objective" = "count:poisson",
                   "eval_metric" = "rmse")
regression <- xgboost(data = training_fold,
                      label = y_training_fold,
                      nrounds = 10,
                      params = xgb_params)

The error message is exactly the same as the one you reported:

Error in xgb.iter.update(bst$handle, dtrain, iteration - 1, obj) : 
[11:46:28] amalgamation/../src/objective/regression_obj.cc:190:
Check failed: label_correct PoissonRegression: label must be nonnegative

Now, try with an integer-valued y_training_fold vector:

y_training_fold <- matrix(rpois(100,10),ncol=1)

xgb_params <- list("objective" = "count:poisson",
                   "eval_metric" = "rmse")
regression <- xgboost(data = training_fold,
                      label = y_training_fold,
                      nrounds = 10,
                      params = xgb_params)

Now xgboost works fine:

[1]  train-rmse:9.795855
[2]  train-rmse:9.660112
[3]  train-rmse:9.492991
[4]  train-rmse:9.287366
[5]  train-rmse:9.034582
[6]  train-rmse:8.724205
[7]  train-rmse:8.343800
[8]  train-rmse:7.878869
[9]  train-rmse:7.312294
[10] train-rmse:6.632671

Edit.

With your data, the problem can be solved as follows:

dts <- structure(list(VacancyId = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 
3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L), .Label = c("55288","56838", "57822", "57902", "57925", "58008"), class = "factor"),
VacancyBankId = c(2L, 1609L, 1611L, 147L, 17L, 1611L, 2L,
257L, 1611L, 2L, 147L, 17L, 1611L, 239L, 1609L, 2L, 1609L,
2L, 2L, 1609L), FunctionId = c(36L, 36L, 36L, 36L, 35L, 35L,
3L, 4L, 4L, 4L, 4L, 9L, 9L, 9L, 3L, 3L, 3L, 3L, 3L, 3L),
EducationLevel = c(6L, 6L, 6L, 6L, 6L, 6L, 4L, 6L, 6L, 6L,
6L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 6L), ProvinceId = c(22L,
22L, 22L, 22L, 24L, 24L, 19L, 16L, 16L, 16L, 16L, 19L, 19L,
19L, 21L, 21L, 16L, 16L, 22L, 22L), CandidatesCount = c(126L,
27L, 18L, 12L, 1L, 4L, 2L, 6L, 7L, 7L, 1L, 8L, 15L, 13L,
7L, 7L, 7L, 7L, 7L, 7L), DurationDays = c(62L, 62L, 62L,
62L, 18L, 18L, 43L, 61L, 61L, 61L, 61L, 60L, 60L, 60L, 62L,
62L, 62L, 62L, 62L, 62L), DurationWeeks = c(8.857142857,
8.857142857, 8.857142857, 8.857142857, 2.571428571, 2.571428571,
6.142857143, 8.714285714, 8.714285714, 8.714285714, 8.714285714,
8.571428571, 8.571428571, 8.571428571, 8.857142857, 8.857142857,
8.857142857, 8.857142857, 8.857142857, 8.857142857), CandidatesPerWeek = c(NA,
3.048387097, 2.032258065, 1.35483871, 0.388888889, 1.555555556,
0.325581395, 0.68852459, 0.803278689, 0.803278689, 0.114754098,
0.933333333, 1.75, 1.516666667, 0.790322581, 0.790322581,
0.790322581, 0.790322581, 0.790322581, 0.790322581)),
.Names = c("VacancyId", "VacancyBankId", "FunctionId", "EducationLevel",
"ProvinceId", "CandidatesCount", "DurationDays", "DurationWeeks",
"CandidatesPerWeek"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 26L, 27L, 28L, 29L, 30L, 31L),
class = "data.frame")

# Delete missing values
dts <- na.omit(dts)

# Build X matrix of potential predictors
# Important: do not use the first column (ID) and the last (response variable)
training_fold <- as.matrix(dts[,-c(1,9)])
# Extract the response variable and round it to the nearest integer
y_training_fold <- as.matrix(dts[,9])
y_training_fold <- round(y_training_fold)

xgb_params <- list("objective" = "count:poisson",
                   "eval_metric" = "rmse")
( regression <- xgboost(data = training_fold,
                        label = y_training_fold,
                        nrounds = 10,
                        params = xgb_params) )
# Output
##### xgb.Booster
# raw: 4.6 Kb
# call:
# xgb.train(params = params, data = dtrain, nrounds = nrounds,
# watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
# early_stopping_rounds = early_stopping_rounds, maximize = maximize,
# save_period = save_period, save_name = save_name, xgb_model = xgb_model,
# callbacks = callbacks)
# params (as set within xgb.train):
# objective = "count:poisson", eval_metric = "rmse", silent = "1"
# xgb.attributes:
# niter
# callbacks:
# cb.print.evaluation(period = print_every_n)
# cb.evaluation.log()
# cb.save.model(save_period = save_period, save_name = save_name)
# niter: 10
# evaluation_log:
#    iter train_rmse
#       1   0.914084
#       2   0.829741
# ---
#       9   0.332951
#      10   0.291877
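
Once the model is fitted, predictions of the expected candidates per week can be obtained with predict() (a short usage sketch, scoring the same matrix used for training):

pred <- predict(regression, training_fold)  # predicted counts per week, one per row
head(round(pred, 3))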

A similar question about "r - xgboost Poisson regression: label must be nonnegative" can be found on Stack Overflow: https://stackoverflow.com/questions/44820186/
