Making tidymodels predictions when a custom recipe step requires the outcome variable

118 views Asked by At

I have written a custom recipe step function for a gene expression-based classifier that carries out feature selection by differential expression based on levels of a binary outcome variable. Genes are sorted by p-value, and the number of genes being included is tunable.

The actual feature selection step occurs in bake, where dplyr::select() is used to retain a) the outcome variable, and b) the selected features, which then gets passed on to be fit. So the outcome variable doesn't change, it's just required to define the differential expression model.

However, because the step function requires the outcome variable, I have issues making predictions because the predict functions through tidymodels excludes the outcome variable.

Perhaps there is a very straightforward fix but I haven't found it. Also possible I am confused about what the actual issue is here.

# HOUSEKEEPING ####
rm(list = ls(all = TRUE)) # clean house
# CRAN libraries
library(tidyverse) # install.packages("tidyverse")
library(tidymodels) # install.packages("tidymodels")
library(finetune) # install.packages("finetune")
#> Warning: package 'finetune' was built under R version 4.3.2
library(reprex) # install.packages("reprex")
#> Warning: package 'reprex' was built under R version 4.3.2
# Bioconductor libraries
library(limma) # BiocManager::install("limma")
# set conflict preference
tidymodels_prefer()


# CREATE THE CUSTOM STEP FUNCTION ####
# CREATE THE WRAPPER FUNCTION
step_limma <- function(
    recipe,
    ...,
    role = NA,
    trained = FALSE,
    target = NULL,
    n_features = NULL,
    features = NULL,
    skip = FALSE,
    id = rand_id("limma")) {
    add_step(
        recipe,
        step_limma_new(
            terms = enquos(...),
            role = role,
            trained = trained,
            target = target,
            n_features = n_features,
            features = features,
            skip = skip,
            id = id
        )
    )
}

# CREATE THE CONSTRUCTOR
step_limma_new <- function(terms, role, trained, target, n_features, features, skip, id) {
    step(
        subclass = "limma",
        terms = terms,
        role = role,
        trained = trained,
        target = target,
        n_features = n_features,
        features = features,
        skip = skip,
        id = id
    )
}

prep.step_limma <- function(x, training, info = NULL, ...) {
    target <- x$target
    df_limma <- training %>%
        dplyr::select(contains("X")) %>%
        t()
    tar <- training %>% pull(target)

    design <- model.matrix(~tar)

    fit <- limma::lmFit(df_limma, design)
    ebayes <- limma::eBayes(fit)
    tab <- limma::topTable(ebayes, coef = 2, adjust = "fdr", sort.by = "p", n = x$n_features)
    limma_features <- row.names(tab)

    step_limma_new(
        terms = x$terms,
        trained = TRUE,
        role = x$role,
        target = x$target,
        n_features = x$n_features,
        features = limma_features,
        skip = x$skip,
        id = x$id
    )
}

bake.step_limma <- function(object, new_data, ...) {
    new_data <- new_data %>% dplyr::select(object$target, dplyr::all_of(object$features))
    new_data
}

print.step_limma <- function(x, width = max(20, options()$width - 35), ...) {
    title <- "Variables selected "
    print_step(names(x$n_features), x$terms, x$trained, title, width)
    invisible(x)
}

# TODO fix the tuning for n_features
num_features <- function(range = c(10L, 200L), trans = NULL) {
    new_quant_param(
        type = "integer",
        range = range,
        inclusive = c(TRUE, TRUE),
        trans = trans,
        label = c(n_features = "# features"),
        finalize = NULL
    )
}

tunable.step_limma <- function(x, ...) {
    tibble::tibble(
        name = c("n_features"),
        call_info = list(list(fun = "num_features")),
        source = "recipe",
        component = "step_limma",
        component_id = x$id
    )
}

required_pkgs.step_limma <- function(x, ...) {
    c("limma")
}


# DEFINE SEED ####
seed <- 42


# DEFINE SET ####
set.seed(seed)
set <- matrix(runif(1000, min = 1, max = 10), nrow = 1000, ncol = 1000) %>%
    data.frame() %>%
    tibble() %>%
    mutate(
        target = sample(c(0, 1), 1000, replace = TRUE)  %>% factor, 
        .before = 1
    )


# DEFINE TRAINING AND VALIDATION SETS ####
set.seed(seed)
set_split <- initial_split(set, strata = target)
set_train <- training(set_split)
set_test <- testing(set_split)


# SET UP RESAMPLING ####
set.seed(seed)
cv_folds <- set_train %>% vfold_cv(v = 5, strata = target)


# BUILD MODELS ####
mod_svmliner <- svm_linear() %>%
    set_engine("kernlab") %>%
    set_mode("classification")


# BUILD RECIPE ####
recipe_set <- recipe(target ~ ., data = set_train) %>%
    step_limma(
        target = "target",
        n_features = tune()
    )


# CREATE WORKFLOW ####
workflow_set <- workflow() %>%
    add_recipe(recipe_set) %>% 
    add_model(mod_svmliner)


# TUNE AND TRAIN THE MODELS ####
set.seed(seed)
res_tune <- workflow_set %>%
    tune_race_win_loss(
        resamples = cv_folds,
        metrics = metric_set(roc_auc),
        grid = 5,
        control = control_race(
            save_pred = TRUE,
            event_level = "second",
            save_workflow = TRUE,
            verbose = FALSE
        )
    )
#> ℹ The workflow being saved contains a recipe, which is 6.34 Mb in ℹ memory. If
#> this was not intentional, please set the control setting ℹ `save_workflow =
#> FALSE`.
#> ℹ The workflow being saved contains a recipe, which is 6.34 Mb in ℹ memory. If
#> this was not intentional, please set the control setting ℹ `save_workflow =
#> FALSE`.
#> ℹ The workflow being saved contains a recipe, which is 6.34 Mb in ℹ memory. If
#> this was not intentional, please set the control setting ℹ `save_workflow =
#> FALSE`.


# FINALIZE WORKFLOW ####
workflow_set_final <- workflow_set %>% finalize_workflow(select_best(res_tune))


# FIT THE FINAL MODEL ####
fit <- fit(workflow_set_final, set_train)
#>  Setting default kernel parameters


# GET PREDICTIONS IN TEST SET ####
fit %>% predict(set_test)
#> Error in `dplyr::select()`:
#> ! Can't subset columns that don't exist.
#> ✖ Column `target` doesn't exist.
#> Backtrace:
#>      ▆
#>   1. ├─fit %>% predict(set_test)
#>   2. ├─stats::predict(., set_test)
#>   3. ├─stats::predict(., set_test)
#>   4. ├─workflows:::predict.workflow(., set_test)
#>   5. │ └─workflows:::forge_predictors(new_data, workflow)
#>   6. │   ├─hardhat::forge(new_data, blueprint = mold$blueprint)
#>   7. │   └─hardhat:::forge.data.frame(new_data, blueprint = mold$blueprint)
#>   8. │     ├─hardhat::run_forge(blueprint, new_data = new_data, outcomes = outcomes)
#>   9. │     └─hardhat:::run_forge.default_recipe_blueprint(...)
#>  10. │       └─hardhat:::forge_recipe_default_process(...)
#>  11. │         ├─recipes::bake(object = rec, new_data = new_data)
#>  12. │         └─recipes:::bake.recipe(object = rec, new_data = new_data)
#>  13. │           ├─recipes::bake(step, new_data = new_data)
#>  14. │           └─global bake.step_limma(step, new_data = new_data)
#>  15. │             └─new_data %>% ...
#>  16. ├─dplyr::select(., object$target, dplyr::all_of(object$features))
#>  17. ├─dplyr:::select.data.frame(., object$target, dplyr::all_of(object$features))
#>  18. │ └─tidyselect::eval_select(expr(c(...)), data = .data, error_call = error_call)
#>  19. │   └─tidyselect:::eval_select_impl(...)
#>  20. │     ├─tidyselect:::with_subscript_errors(...)
#>  21. │     │ └─rlang::try_fetch(...)
#>  22. │     │   └─base::withCallingHandlers(...)
#>  23. │     └─tidyselect:::vars_select_eval(...)
#>  24. │       └─tidyselect:::walk_data_tree(expr, data_mask, context_mask)
#>  25. │         └─tidyselect:::eval_c(expr, data_mask, context_mask)
#>  26. │           └─tidyselect:::reduce_sels(node, data_mask, context_mask, init = init)
#>  27. │             └─tidyselect:::walk_data_tree(new, data_mask, context_mask)
#>  28. │               └─tidyselect:::as_indices_sel_impl(...)
#>  29. │                 └─tidyselect:::as_indices_impl(...)
#>  30. │                   └─tidyselect:::chr_as_locations(x, vars, call = call, arg = arg)
#>  31. │                     └─vctrs::vec_as_location(...)
#>  32. └─vctrs (local) `<fn>`()
#>  33.   └─vctrs:::stop_subscript_oob(...)
#>  34.     └─vctrs:::stop_subscript(...)
#>  35.       └─rlang::abort(...)

Created on 2023-11-21 with reprex v2.0.2

Session info

sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.1 (2023-06-16 ucrt)
#>  os       Windows 11 x64 (build 22621)
#>  system   x86_64, mingw32
#>  ui       RTerm
#>  language (EN)
#>  collate  English_Canada.utf8
#>  ctype    English_Canada.utf8
#>  tz       America/Edmonton
#>  date     2023-11-21
#>  pandoc   3.1.6.1 @ C:/PROGRA~1/Pandoc/ (via rmarkdown)
1

There are 1 answers

2
EmilHvitfeldt On BEST ANSWER

To create a feature selection-like recipes step. I would recommend that your prep.*() methods returns information about which variables to remove rather than which once to keep. This way your bake.*() method just has to remove the columns that are selected out.

You avoid having to make sure that outcomes, id variables and case weights are kept. Which is the problem you are running into right now.

I added the developer function recipes_remove_cols() for this exact use-case which you can see done in step_zv() here: https://github.com/tidymodels/recipes/blob/ce4f505b55f4222c58f4fbadd3d4083b61efaa14/R/zv.R#L133

It assumes that the variables to remove is in the removals field.

Another note, is that you properly should make it so ... selects the variables to perform the feature selection on. If you don't, you are going to run into the same problem of non-predictors being used.