When using the tidymodels dataset, I run into a "subscript out of bounds error". I am using the Ames house prices dataset from Kaggle by the way.
First, I create my recipe:
data.split = initial_split(house.prices, prop=0.8)
train.data = training(data.split)
test.data = testing(data.split)
# Create recipe - OLS
sales.rec = recipe(SalePrice ~., data = train.data) %>%
step_log(SalePrice, LotArea, GrLivArea, TotRmsAbvGrd) %>%
update_role(Id, SalePrice, new_role = "ID") %>%
step_num2factor(MSSubClass, levels = as.character(unique(house.prices$SalePrice))) %>%
step_unknown(PoolQC, Fence, MiscFeature, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1,
FireplaceQu, GarageType, GarageQual, GarageCond, new_level = "None") %>%
step_mutate(PorchArea = OpenPorchSF+EnclosedPorch+`3SsnPorch`+ScreenPorch) %>%
step_mutate(garage.age = YrSold - GarageYrBlt,
house.age = YrSold - YearBuilt,
renovation.age = YrSold - YearRemodAdd
) %>%
step_mutate(has.garage = (GarageType != NA),
has.basement = (BsmtExposure != NA),
has.pool = (PoolQC != NA),
is.new = house.age==0
) %>%
step_cut(OverallQual, breaks = c(2.5, 6.5, 8.5)) %>%
step_cut(OverallCond, breaks = c(2.5, 6.5, 8.5)) %>%
step_mutate(house.age = log(1+house.age)) %>%
step_rm(-c(SalePrice, house.age, renovation.age,
has.garage, has.basement, has.pool, is.new,
MSZoning, LotArea, Alley, LotShape, Utilities, Neighborhood, OverallQual, OverallCond,
ExterCond, Foundation,
BsmtFinSF1, Heating, HeatingQC, CentralAir, GrLivArea, FullBath, KitchenQual, TotRmsAbvGrd,
PavedDrive, PorchArea, MiscVal,SaleCondition)) %>%
step_other(all_nominal_predictors(), all_factor(), all_string()) %>%
step_string2factor(all_string_predictors()) %>%
step_nzv(all_predictors()) %>%
step_impute_median(all_numeric_predictors()) %>%
step_unknown(all_factor_predictors()) %>%
step_normalize(all_numeric_predictors())
Then I set up a ranger random forest model:
my.rf = rand_forest(mtry = tune(), trees = 2000, min_n=tune()) %>%
set_engine("ranger") %>%
set_mode("regression")
Finally, I create a workflow and try to tune the model:
tree.grid = expand.grid(min_n=c(2,14,27,40), mtry = c(4, 8, 12))
folds = rsample::vfold_cv(train.data, v = 5)
metric = metric_set(rmse)
set_dependency("rand_forest", "ranger", "ranger", mode = "regression")
# Random forest
my.rf.rec = sales.rec %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_other(all_nominal_predictors())
my.rf.wflow = workflow() %>%
add_model(my.rf) %>%
add_recipe(my.rf.rec)
my.rf.res = my.rf.wflow %>%
tune_grid(
resamples = folds,
metrics = metric,
grid = tree.grid
)
However, upon running that final lines, I get the following error:
"x Fold1: preprocessor 1/1, model 1/12: Error in y.mat[, 2]: subscript out of bounds" (for lots of preprocessor and models), followed by "Warning: All models failed. Run show_notes(.Last.tune.result) for more information." (if I run the command indicated in the error, it just gives me "Error in y.mat[, 2]: subscript out of bounds")
I've tried googling everything, but cannot find the source of the error, any help would be appreciated :)
EDIT: Here's the head of the dataset, hopefully it helps?
structure(list(Id = c(1, 2, 3, 4, 5, 6), MSSubClass = c(60, 20,
60, 70, 60, 50), MSZoning = c("RL", "RL", "RL", "RL", "RL", "RL"
), LotFrontage = c(65, 80, 68, 60, 84, 85), LotArea = c(8450,
9600, 11250, 9550, 14260, 14115), Street = c("Pave", "Pave",
"Pave", "Pave", "Pave", "Pave"), Alley = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_),
LotShape = c("Reg", "Reg", "IR1", "IR1", "IR1", "IR1"), LandContour = c("Lvl",
"Lvl", "Lvl", "Lvl", "Lvl", "Lvl"), Utilities = c("AllPub",
"AllPub", "AllPub", "AllPub", "AllPub", "AllPub"), LotConfig = c("Inside",
"FR2", "Inside", "Corner", "FR2", "Inside"), LandSlope = c("Gtl",
"Gtl", "Gtl", "Gtl", "Gtl", "Gtl"), Neighborhood = c("CollgCr",
"Veenker", "CollgCr", "Crawfor", "NoRidge", "Mitchel"), Condition1 = c("Norm",
"Feedr", "Norm", "Norm", "Norm", "Norm"), Condition2 = c("Norm",
"Norm", "Norm", "Norm", "Norm", "Norm"), BldgType = c("1Fam",
"1Fam", "1Fam", "1Fam", "1Fam", "1Fam"), HouseStyle = c("2Story",
"1Story", "2Story", "2Story", "2Story", "1.5Fin"), OverallQual = c(7,
6, 7, 7, 8, 5), OverallCond = c(5, 8, 5, 5, 5, 5), YearBuilt = c(2003,
1976, 2001, 1915, 2000, 1993), YearRemodAdd = c(2003, 1976,
2002, 1970, 2000, 1995), RoofStyle = c("Gable", "Gable",
"Gable", "Gable", "Gable", "Gable"), RoofMatl = c("CompShg",
"CompShg", "CompShg", "CompShg", "CompShg", "CompShg"), Exterior1st = c("VinylSd",
"MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "VinylSd"), Exterior2nd = c("VinylSd",
"MetalSd", "VinylSd", "Wd Shng", "VinylSd", "VinylSd"), MasVnrType = c("BrkFace",
"None", "BrkFace", "None", "BrkFace", "None"), MasVnrArea = c(196,
0, 162, 0, 350, 0), ExterQual = c("Gd", "TA", "Gd", "TA",
"Gd", "TA"), ExterCond = c("TA", "TA", "TA", "TA", "TA",
"TA"), Foundation = c("PConc", "CBlock", "PConc", "BrkTil",
"PConc", "Wood"), BsmtQual = c("Gd", "Gd", "Gd", "TA", "Gd",
"Gd"), BsmtCond = c("TA", "TA", "TA", "Gd", "TA", "TA"),
BsmtExposure = c("No", "Gd", "Mn", "No", "Av", "No"), BsmtFinType1 = c("GLQ",
"ALQ", "GLQ", "ALQ", "GLQ", "GLQ"), BsmtFinSF1 = c(706, 978,
486, 216, 655, 732), BsmtFinType2 = c("Unf", "Unf", "Unf",
"Unf", "Unf", "Unf"), BsmtFinSF2 = c(0, 0, 0, 0, 0, 0), BsmtUnfSF = c(150,
284, 434, 540, 490, 64), TotalBsmtSF = c(856, 1262, 920,
756, 1145, 796), Heating = c("GasA", "GasA", "GasA", "GasA",
"GasA", "GasA"), HeatingQC = c("Ex", "Ex", "Ex", "Gd", "Ex",
"Ex"), CentralAir = c("Y", "Y", "Y", "Y", "Y", "Y"), Electrical = c("SBrkr",
"SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr"), `1stFlrSF` = c(856,
1262, 920, 961, 1145, 796), `2ndFlrSF` = c(854, 0, 866, 756,
1053, 566), LowQualFinSF = c(0, 0, 0, 0, 0, 0), GrLivArea = c(1710,
1262, 1786, 1717, 2198, 1362), BsmtFullBath = c(1, 0, 1,
1, 1, 1), BsmtHalfBath = c(0, 1, 0, 0, 0, 0), FullBath = c(2,
2, 2, 1, 2, 1), HalfBath = c(1, 0, 1, 0, 1, 1), BedroomAbvGr = c(3,
3, 3, 3, 4, 1), KitchenAbvGr = c(1, 1, 1, 1, 1, 1), KitchenQual = c("Gd",
"TA", "Gd", "Gd", "Gd", "TA"), TotRmsAbvGrd = c(8, 6, 6,
7, 9, 5), Functional = c("Typ", "Typ", "Typ", "Typ", "Typ",
"Typ"), Fireplaces = c(0, 1, 1, 1, 1, 0), FireplaceQu = c(NA,
"TA", "TA", "Gd", "TA", NA), GarageType = c("Attchd", "Attchd",
"Attchd", "Detchd", "Attchd", "Attchd"), GarageYrBlt = c(2003,
1976, 2001, 1998, 2000, 1993), GarageFinish = c("RFn", "RFn",
"RFn", "Unf", "RFn", "Unf"), GarageCars = c(2, 2, 2, 3, 3,
2), GarageArea = c(548, 460, 608, 642, 836, 480), GarageQual = c("TA",
"TA", "TA", "TA", "TA", "TA"), GarageCond = c("TA", "TA",
"TA", "TA", "TA", "TA"), PavedDrive = c("Y", "Y", "Y", "Y",
"Y", "Y"), WoodDeckSF = c(0, 298, 0, 0, 192, 40), OpenPorchSF = c(61,
0, 42, 35, 84, 30), EnclosedPorch = c(0, 0, 0, 272, 0, 0),
`3SsnPorch` = c(0, 0, 0, 0, 0, 320), ScreenPorch = c(0, 0,
0, 0, 0, 0), PoolArea = c(0, 0, 0, 0, 0, 0), PoolQC = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), Fence = c(NA, NA, NA, NA, NA, "MnPrv"), MiscFeature = c(NA,
NA, NA, NA, NA, "Shed"), MiscVal = c(0, 0, 0, 0, 0, 700),
MoSold = c(2, 5, 9, 2, 12, 10), YrSold = c(2008, 2007, 2008,
2006, 2008, 2009), SaleType = c("WD", "WD", "WD", "WD", "WD",
"WD"), SaleCondition = c("Normal", "Normal", "Normal", "Abnorml",
"Normal", "Normal"), SalePrice = c(208500, 181500, 223500,
140000, 250000, 143000)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))