I'm trying to use the caret function rfe for variable selection. One of the models I would like to use it on is a logistic regression. I'm trying to follow the example here: https://stats.stackexchange.com/questions/37830/how-do-you-get-lmfuncs-functions-of-the-rfe-function-in-caret-to-do-a-logistic-r
From what I've observed in the following models, I can get the rfe function to work with my binary response variable presence when using random forest, GAM or naive Bayes, but both the basic linear model function and my attempt at making a glm logistic regression function don't work when presence is the response. However, the basic linear regression works fine when I use a different, continuous variable as the response. Why would this be?
The error I'm getting is
Error in rbind(deparse.level, ...) : numbers of columns of arguments do not match
I'm not sure why this would be happening when the input predictors and response come from the same dataframe.
library(caret)
# Random-forest RFE: 10-fold cross-validation, repeated 5 times.
rfcontrol <- rfeControl(
  functions = rfFuncs,
  method = "repeatedcv",
  number = 10,
  repeats = 5
)
# Columns 2-6 of dat are the predictors and column 1 is the response;
# every subset size from 1 to 5 is evaluated.
out_rf <- rfe(
  x = dat[, 2:6],
  y = dat[, 1],
  sizes = 1:5,
  rfeControl = rfcontrol
)
# works
# Naive-Bayes RFE: 10-fold cross-validation, repeated 5 times.
nbcontrol <- rfeControl(
  functions = nbFuncs,
  method = "repeatedcv",
  number = 10,
  repeats = 5
)
# Columns 2-6 of dat are the predictors and column 1 is the response;
# every subset size from 1 to 5 is evaluated.
out_nb <- rfe(
  x = dat[, 2:6],
  y = dat[, 1],
  sizes = 1:5,
  rfeControl = nbcontrol
)
# works
# GAM RFE: 10-fold cross-validation, repeated 5 times.
gamcontrol <- rfeControl(
  functions = gamFuncs,
  method = "repeatedcv",
  number = 10,
  repeats = 5
)
# Columns 2-6 of dat are the predictors and column 1 is the response;
# every subset size from 1 to 5 is evaluated.
out_gam <- rfe(
  x = dat[, 2:6],
  y = dat[, 1],
  sizes = 1:5,
  rfeControl = gamcontrol
)
# works
# Linear-regression RFE: 10-fold cross-validation, repeated 5 times.
# This must use lmFuncs; the earlier gamFuncs here was a copy-paste slip that
# made this control object identical to gamcontrol.
lmcontrol <- rfeControl(
  functions = lmFuncs, # linear regression
  method = "repeatedcv", # repeated cv
  repeats = 5, # number of repeats
  number = 10 # number of folds
)
# Continuous response (column 2) with columns 3-6 as predictors.
out_lm <- rfe(dat[, 3:6], dat[, 2], sizes = 1:4, rfeControl = lmcontrol)
# works
# Factor response (column 1). lmFuncs is caret's regression helper set
# (fits lm() and returns numeric predictions), so it is not suited to a
# two-level factor outcome.
out_lm2 <- rfe(dat[, 2:6], dat[, 1], sizes = 1:5, rfeControl = lmcontrol)
# doesn't work
# Logistic-regression helper set for rfe(), derived from lmFuncs.
# lmFuncs supplies summary, rank, selectSize and selectVar; fit and pred are
# overridden because the response is a two-level factor. (caret also ships a
# ready-made lrFuncs list built along the same lines.)
glmFuncs <- lmFuncs

# Fit a binomial GLM of y on every column of x.
#   x           predictors (data frame or something coercible to one)
#   y           two-level factor response
#   first, last unused here; part of the signature rfe() calls
#   ...         unused
# Returns the fitted glm object.
glmFuncs$fit <- function(x, y, first, last, ...) {
  tmp <- as.data.frame(x)
  tmp$y <- y
  glm(y ~ ., data = tmp, family = binomial(link = "logit"))
}

# Class-aware predictions. The pred inherited from lmFuncs returns the numeric
# output of predict(), but for a factor response rfe() needs predicted classes
# that share the levels of y.
#   object  model returned by glmFuncs$fit
#   x       new predictors
# Returns a data frame with one probability column per class level plus a
# factor column `pred` holding the predicted class (0.5 probability cut-off).
glmFuncs$pred <- function(object, x) {
  if (!is.data.frame(x)) {
    x <- as.data.frame(x)
  }
  lvl <- levels(object$data$y)
  # type = "response" gives the probability of the second factor level.
  prob <- predict(object, x, type = "response")
  out <- data.frame(1 - prob, prob)
  colnames(out) <- lvl
  out$pred <- factor(ifelse(prob > 0.5, lvl[2], lvl[1]), levels = lvl)
  out
}

# Logistic-regression RFE: 10-fold cross-validation, repeated 5 times.
glmcontrol <- rfeControl(
  functions = glmFuncs, # logistic regression
  method = "repeatedcv", # repeated cv
  repeats = 5, # number of repeats
  number = 10 # number of folds
)
out_glm <- rfe(dat[, 2:6], dat[, 1], sizes = 1:5, rfeControl = glmcontrol)
# With pred returning factor predictions this is expected to run; with the
# pred inherited from lmFuncs it failed with the rbind() column-count error.
Here is a sample of the dataset I'm using
# Sample data used by the rfe() calls: a data frame with 100 rows
# (row.names = c(NA, -100L)) and six columns.
#   presence  factor with levels "FALSE" / "TRUE" (column 1, the response)
#   mean_annual_temp, min_temp_coldest_month, temp_annual_range,
#   mean_temp_wettest_Q, mean_temp_driest_Q  integer vectors (columns 2-6);
#   presumably temperature covariates, judging by their names.
dat <- structure(list(presence = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("FALSE",
"TRUE"), class = "factor"), mean_annual_temp = c(-10L, -9L, -6L,
-8L, -5L, -13L, -10L, -10L, -12L, -14L, -10L, -9L, -10L, -17L,
-10L, -12L, -17L, -11L, -17L, -9L, -13L, -11L, -17L, -10L, -10L,
-10L, -12L, -11L, -11L, -10L, -13L, -17L, -12L, -10L, -13L, -12L,
-12L, -11L, -15L, -6L, -10L, -14L, -12L, -12L, -5L, -12L, -16L,
-16L, -12L, -19L, -19L, -13L, -18L, -12L, -14L, -10L, -9L, -14L,
-11L, -15L, -14L, -12L, -11L, -11L, -10L, -6L, -14L, -5L, -6L,
-9L, -12L, -11L, -11L, -14L, -16L, -14L, -13L, -14L, -13L, -14L,
-13L, -10L, -12L, -13L, -9L, -13L, -13L, -14L, -12L, -17L, -15L,
-12L, -15L, -11L, -12L, -14L, -5L, -12L, -5L, -14L), min_temp_coldest_month = c(-33L,
-31L, -26L, -32L, -25L, -37L, -36L, -33L, -35L, -35L, -33L, -33L,
-32L, -37L, -30L, -36L, -37L, -32L, -38L, -30L, -36L, -32L, -38L,
-36L, -32L, -30L, -36L, -32L, -34L, -34L, -32L, -38L, -34L, -30L,
-34L, -35L, -35L, -33L, -37L, -27L, -36L, -36L, -35L, -36L, -26L,
-35L, -36L, -37L, -36L, -41L, -41L, -34L, -39L, -36L, -39L, -30L,
-30L, -38L, -33L, -37L, -38L, -35L, -34L, -34L, -34L, -27L, -38L,
-27L, -27L, -30L, -35L, -36L, -33L, -39L, -37L, -37L, -36L, -36L,
-34L, -39L, -34L, -31L, -36L, -37L, -31L, -33L, -36L, -38L, -34L,
-37L, -36L, -36L, -37L, -36L, -36L, -38L, -27L, -33L, -25L, -38L
), temp_annual_range = c(49L, 49L, 41L, 50L, 39L, 52L, 52L, 48L,
50L, 46L, 48L, 50L, 48L, 44L, 42L, 51L, 44L, 47L, 44L, 43L, 52L,
44L, 47L, 52L, 48L, 45L, 51L, 47L, 49L, 53L, 43L, 46L, 47L, 42L,
46L, 51L, 51L, 47L, 47L, 42L, 52L, 50L, 51L, 52L, 42L, 50L, 43L,
45L, 51L, 51L, 48L, 46L, 45L, 52L, 52L, 42L, 47L, 51L, 48L, 47L,
52L, 50L, 49L, 49L, 50L, 42L, 52L, 42L, 41L, 47L, 51L, 53L, 49L,
52L, 47L, 50L, 52L, 48L, 45L, 52L, 46L, 46L, 52L, 51L, 49L, 45L,
51L, 52L, 48L, 45L, 47L, 52L, 47L, 53L, 51L, 52L, 42L, 45L, 39L,
51L), mean_temp_wettest_Q = c(7L, 8L, 4L, 9L, 9L, 6L, 8L, 7L,
6L, 2L, 7L, 8L, 7L, 0L, 3L, 6L, 0L, 6L, 0L, 5L, 6L, 4L, 0L, 8L,
7L, 6L, 6L, 6L, 6L, 8L, 3L, 0L, 5L, 5L, 3L, 6L, 7L, 6L, 2L, 5L,
8L, 5L, 6L, 7L, 5L, 6L, 0L, 0L, 6L, 0L, 0L, 4L, 0L, 7L, 4L, 4L,
8L, 4L, 6L, 2L, 5L, 6L, 7L, 7L, 8L, 5L, 5L, 5L, 5L, 8L, 6L, 7L,
7L, 5L, 1L, 4L, 6L, 4L, 3L, 4L, 3L, 7L, 7L, 6L, 8L, 4L, 6L, 4L,
6L, 0L, 2L, 7L, 3L, 7L, 6L, 5L, 5L, 4L, 8L, 5L), mean_temp_driest_Q = c(-28L,
-21L, -22L, -26L, -17L, -31L, -22L, -22L, -30L, -30L, -22L, -21L,
-24L, -33L, -25L, -31L, -33L, -27L, -33L, -24L, -31L, -27L, -33L,
-22L, -27L, -22L, -31L, -28L, -29L, -28L, -27L, -33L, -29L, -25L,
-29L, -31L, -30L, -28L, -32L, -23L, -21L, -32L, -30L, -31L, -22L,
-30L, -31L, -32L, -30L, -37L, -36L, -28L, -34L, -31L, -33L, -25L,
-25L, -32L, -28L, -32L, -32L, -30L, -29L, -29L, -28L, -23L, -32L,
-22L, -23L, -24L, -31L, -30L, -27L, -33L, -32L, -32L, -31L, -31L,
-29L, -33L, -29L, -18L, -30L, -32L, -21L, -28L, -31L, -33L, -29L,
-33L, -31L, -31L, -32L, -30L, -30L, -33L, -22L, -29L, -19L, -33L
)), class = "data.frame", row.names = c(NA, -100L))