如何解决R 插入符号:train() 因具有因子预测变量的重复 cv 失败
以下函数应与 Caret 的 train()
函数一起使用。没有任何因素变量或没有交叉验证它工作正常。
使用因子作为预测变量和 repeatedcv
时会出现问题,因为在折叠中并非所有因子都存在但仍出现在因子水平内:
考虑以下改编的 cforest 模型(来自包 partykit
):
cforest_partykit <- list(label = "Conditional Inference Random Forest with partykit",library = c("partykit","party"),loop = NULL,type = c("Classification","Regression"),parameters = data.frame(parameter = 'mtry',class = 'numeric',label = "#Randomly Selected Predictors"),grid = function(x,y,len = NULL,search = "grid"){
if(search == "grid") {
out <- data.frame(mtry = caret::var_seq(p = ncol(x),classification = is.factor(y),len = len))
} else {
out <- data.frame(mtry = unique(sample(1:ncol(x),replace = TRUE,size = len)))
}
out
},fit = function(x,wts,param,lev,last,classprobs,...) {
# make consistent factor levels
if(any(sapply(x,is.factor))){
fac_col_names <- names(grep("factor",sapply(x,class),value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
x[,which(names(x) == fac_col_names[i])] <- factor(x[,which(names(x) == fac_col_names[i])],levels = as.character(unique(x[,which(names(x) == fac_col_names[i])])))
}
}
dat <- if(is.data.frame(x)) x else as.data.frame(x,stringsAsFactors = TRUE)
dat$.outcome <- y
theDots <- list(...)
if(any(names(theDots) == "mtry")) # # change controls to mtry?
{
theDots$mtry <- as.integer(param$mtry) # remove gtcrl
theDots$mtry
theDots$mtry <- NULL
} else mtry <- min(param$mtry,ncol(x))
## pass in any model weights
if(!is.null(wts)) theDots$weights <- wts
modelArgs <- c(list(formula = as.formula(.outcome ~ .),data = dat,mtry = mtry),# change controls to mtry?
theDots)
out <- do.call(partykit::cforest,modelArgs)
out
},predict = function(modelFit,newdata = NULL,submodels = NULL) {
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata,stringsAsFactors = TRUE)
# make consistent factor levels
if(any(sapply(newdata,sapply(newdata,value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
newdata[,which(names(newdata) == fac_col_names[i])] <- factor(newdata[,which(names(newdata) == fac_col_names[i])],levels = as.character(unique(newdata[,which(names(newdata) == fac_col_names[i])])))
}
}
## party builds the levels into the model object,so I'm
## going to assume that all the levels will be passed to
## the output
out <- partykit:::predict.cforest(modelFit,newdata = newdata,OOB = TRUE) # predict_party,id?
if(is.matrix(out)) out <- out[,1]
if(!is.null(modelFit$'(response)')) out <- as.character(out) # if(!is.null(modelFit@responses@levels$.outcome)) out <- as.character(out)
out
},prob = function(modelFit,submodels = NULL) { # submodels ?
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata,stringsAsFactors = TRUE)
obsLevels <- levels(modelFit$'(response)')
rawProbs <- partykit::predict.cforest(modelFit,OOB = TRUE) # predict(,type="prob) ? id?
probMatrix <- matrix(unlist(rawProbs),ncol = length(obsLevels),byrow = TRUE)
out <- data.frame(probMatrix)
colnames(out) <- obsLevels
rownames(out) <- NULL
out
},predictors = function(x,...) {
vi <- partykit::varimp(x,...)
names(vi)[vi != 0]
},varImp = function(object,...) {
variableImp <- partykit::varimp(object,...)
out <- data.frame(Overall = variableImp)
out
},tags = c("Random Forest","Ensemble Model","Bagging","Implicit Feature Selection","Accepts Case Weights"),levels = function(x) levels(x@data@get("response")[,1]),sort = function(x) x[order(x[,],oob = function(x) {
obs <- x@data@get("response")[,1]
pred <- partykit:::predict.cforest(x,OOB = TRUE,newdata = NULL)
postResample(pred,obs)
})
在train和repeatcv中使用带有因子预测变量的数据框应用它时,会出现错误:
library(caret)
library(party)
library(partykit)
dat <- as.data.frame(ChickWeight)[1:20,]
dat$class <- as.factor(rep(letters[seq( from = 1,to = 20)],each=1))
# specifiy folds with CreateMultiFolds
set.seed(43,kind = "Mersenne-Twister",normal.kind = "Inversion")
folds_train <- caret::createMultiFolds(y = dat$weight,k = 3,times = 2)
# specifiy trainControl for tuning mtry and with specified folds
finalcontrol <- caret::trainControl(search = "grid",method = "repeatedcv",number = 3,repeats = 2,index = folds_train,savePred = T)
preds <- dat[,2:5]
response <- dat[,1]
# tune hyperparameter mtry and build final model
tunegrid <- expand.grid(mtry=c(1,2,3,4))
#set.seed(42,normal.kind = "Inversion")
model <- caret::train(x = preds,# predictors
y = response,# response
method = cforest_partykit,metric = "RMSE",tuneGrid = tunegrid,trControl = finalcontrol,ntree = 150)
warnings()
1: predictions Failed for Fold1.Rep1: mtry=1 Error in model.frame.default(object$predictf,data = newdata,na.action = na.pass,: factor class has new levels a,c,g,k,m,p,s,t
目的是确定每个 fold.rep 的级别并仅分配存在于相应折叠中的那些级别:
for (i in 1:length(folds_train)) {
preds_temp <- preds[folds_train[[i]],]
# check levels
levels(preds_temp$class)
# which are actually present
unique(preds_temp$class)
# assign present levels to each subset
preds_temp$class <- factor(preds_temp$class,levels = as.character(unique(preds_temp$class)))
}
我尝试在 cforest_partykit
函数 (# make consistent factor levels
) 中包含正确因子水平的分配,但似乎没有效果。
如何在插入符号 train()
或 trainControl()
或 createDataPartition()
函数中实现此功能?
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。