Applying function rowwise efficiently

Question

Applying function rowwise efficiently

254 views Asked by LulY At 18 July 2023 at 09:07

I have a dataframe with multiple columns containing information on one diagnosis. The entries are TRUE, FALSE or NA. I create a vector which summarizes those columns as follows: If a patient was diagnosed at some time (TRUE), then TRUE, if the only valid entry is FALSE, then FALSE and if there just missings, then NA. Written text as code:

data.frame(a= c(FALSE, TRUE, NA, FALSE, TRUE, NA, FALSE, TRUE, NA),
           b= c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, NA, NA, NA),
           expected= c(FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, NA))

I need to go trough all the columns rowwise and I do so using split. Unfortunatelly, my data is big and it takes a long while. What I do at the moment is

library(magrittr)
# big example data
df <- expand.grid(c(FALSE, TRUE, NA), c(FALSE, TRUE, NA)) %>%
  .[rep(1:nrow(.), 50000), ] %>%
  as.data.frame() %>%
  setNames(., nm= c("a", "b"))

# My approach
df$res <- df %>%
  split(., 1:nrow(.)) %>%
  lapply(., function(row_i){
    ifelse(all(is.na(row_i)), NA,
           ifelse(any(row_i, na.rm= TRUE), TRUE,
                  ifelse(any(!row_i, na.rm= TRUE), FALSE,
                         row_i)))
  }) %>%
  unlist()

Is there a more efficient way to solve this task?

Original Q&A

There are 6 answers

jared_mamrot On 18 July 2023 at 09:21

One option is to use the vectorised case_when() function from the dplyr package (https://dplyr.tidyverse.org/reference/case_when.html), e.g.

library(dplyr)

df <- expand.grid(c(FALSE, TRUE, NA), c(FALSE, TRUE, NA)) %>%
  .[rep(1:nrow(.), 50000), ] %>%
  as.data.frame() %>%
  setNames(., nm= c("a", "b"))

df$res <- df %>%
  split(., 1:nrow(.)) %>%
  lapply(., function(row_i){
    ifelse(all(is.na(row_i)), NA,
           ifelse(any(row_i, na.rm= TRUE), TRUE,
                  ifelse(any(!row_i, na.rm= TRUE), FALSE,
                         row_i)))
  }) %>%
  unlist()
current_output <- df

# load 'clean' example data
df <- expand.grid(c(FALSE, TRUE, NA), c(FALSE, TRUE, NA)) %>%
  .[rep(1:nrow(.), 50000), ] %>%
  as.data.frame() %>%
  setNames(., nm= c("a", "b"))

case_when_output <- df %>%
  mutate(res = case_when(if_any(everything(), ~.x == TRUE) ~ TRUE,
                              if_all(everything(), ~is.na(.x)) ~ NA,
                              TRUE ~ FALSE))

all.equal(current_output, case_when_output)
#> [1] TRUE

^{Created on 2023-07-18 with reprex v2.0.2}

Benchmark (6yo core-i5 macbook pro; updated 20-07-2023):

library(dplyr)
# install.packages("purrrlyr")
library(purrrlyr)
library(microbenchmark)
library(ggplot2)

df <- expand.grid(c(FALSE, TRUE, NA), c(FALSE, TRUE, NA)) %>%
  .[rep(1:nrow(.), 50000), ] %>%
  as.data.frame() %>%
  setNames(., nm= c("a", "b"))

ingo_pingo_func <- function(df) {
  df$res <- df %>%
  split(., 1:nrow(.)) %>%
  lapply(., function(row_i){
    ifelse(all(is.na(row_i)), NA,
           ifelse(any(row_i, na.rm= TRUE), TRUE,
                  ifelse(any(!row_i, na.rm= TRUE), FALSE,
                         row_i)))
  }) %>%
  unlist()
}

jared_mamrot_func <- function(df) {
  case_when_output <- df %>%
    mutate(res = case_when(if_any(1:2, ~.x == TRUE) ~ TRUE,
                              if_all(1:2, ~is.na(.x)) ~ NA,
                              TRUE ~ FALSE))
}

darren_tsai_func <- function(df) {
  df$result <- as.logical(do.call(\(...) pmax(..., na.rm = TRUE), df[1:2]))
}

roland_func <- function(df) {
  cols <- 1:2
  df$result <- Reduce(\(x, y) x | y, df[cols])
  df[is.na(df$result), "result"] <- Reduce(\(x, y) ifelse(!is.na(x) | !is.na(y), FALSE, NA), 
                                           df[is.na(df$result), cols])
}

yuriy_saraykin_func <- function(df) {
  whereNA <- rowSums(is.na(df)) == ncol(df)
  df$expected <- rowSums(df, na.rm = TRUE) > 0
  df$expected[whereNA] <- NA
}

efz_func <- function(df) {
  output <- df %>% by_row(..f=function(row_i){
    ifelse(all(is.na(row_i)), NA,
           ifelse(any(row_i, na.rm= TRUE), TRUE,
                  ifelse(any(!row_i, na.rm= TRUE), FALSE,
                         row_i)))
    
  }, .collate = 'rows')
}

TIC_func <- function(df) {
  df$result <- rowSums(df, na.rm = TRUE) > 0 * NA^(rowMeans(is.na(df)) == 1)
}

result <- microbenchmark(ingo_pingo_func(df),
                         jared_mamrot_func(df), 
                         darren_tsai_func(df),
                         roland_func(df),
                         yuriy_saraykin_func(df),
                         efz_func(df),
                         TIC_func(df),
                         times = 5)

result$expr <- forcats::fct_rev(forcats::fct_reorder(result$expr, result$time, mean))
autoplot(result)

Roland On 18 July 2023 at 09:25

cols <- 1:2
DF$result <- Reduce(\(x, y) x | y, DF[cols])
DF[is.na(DF$result), "result"] <- Reduce(\(x, y) ifelse(!is.na(x) | !is.na(y), FALSE, NA), 
                                         DF[is.na(DF$result), cols])
#      a     b expected result
#1 FALSE FALSE    FALSE  FALSE
#2  TRUE FALSE     TRUE   TRUE
#3    NA FALSE    FALSE  FALSE
#4 FALSE  TRUE     TRUE   TRUE
#5  TRUE  TRUE     TRUE   TRUE
#6    NA  TRUE     TRUE   TRUE
#7 FALSE    NA    FALSE  FALSE
#8  TRUE    NA     TRUE   TRUE
#9    NA    NA       NA     NA

efz On 18 July 2023 at 09:44

another possible approach:

library(purrrlyr)

df %>% by_row(..f=function(row_i){
  ifelse(all(is.na(row_i)), NA,
         ifelse(any(row_i, na.rm= TRUE), TRUE,
                ifelse(any(!row_i, na.rm= TRUE), FALSE,
                       row_i)))
  
}, .collate = 'rows')

which gives

a     b     .out 
   <lgl> <lgl> <lgl>
 1 FALSE FALSE FALSE
 2 TRUE  FALSE TRUE 
 3 NA    FALSE FALSE
 4 FALSE TRUE  TRUE 
 5 TRUE  TRUE  TRUE 
 6 NA    TRUE  TRUE 
 7 FALSE NA    FALSE
 8 TRUE  NA    TRUE 
 9 NA    NA    NA   
10 FALSE FALSE FALSE

Yuriy Saraykin On 18 July 2023 at 10:53

df <- data.frame(a= c(FALSE, TRUE, NA, FALSE, TRUE, NA, FALSE, TRUE, NA),
                 b= c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, NA, NA, NA))


whereNA <- rowSums(is.na(df)) == ncol(df)
df$expected <- rowSums(df, na.rm = TRUE) > 0
df$expected[whereNA] <- NA

df
#>       a     b expected
#> 1 FALSE FALSE    FALSE
#> 2  TRUE FALSE     TRUE
#> 3    NA FALSE    FALSE
#> 4 FALSE  TRUE     TRUE
#> 5  TRUE  TRUE     TRUE
#> 6    NA  TRUE     TRUE
#> 7 FALSE    NA    FALSE
#> 8  TRUE    NA     TRUE
#> 9    NA    NA       NA

^{Created on 2023-07-18 with reprex v2.0.2}

ThomasIsCoding On 20 July 2023 at 09:18

You can use rowSums + rowMean + is.na like below

> df$result <- rowSums(df, na.rm = TRUE) > 0 * NA^(rowMeans(is.na(df)) == 1)

> df
      a     b result
1 FALSE FALSE  FALSE
2  TRUE FALSE   TRUE
3    NA FALSE  FALSE
4 FALSE  TRUE   TRUE
5  TRUE  TRUE   TRUE
6    NA  TRUE   TRUE
7 FALSE    NA  FALSE
8  TRUE    NA   TRUE
9    NA    NA     NA

data

df <- data.frame(
    a = c(FALSE, TRUE, NA, FALSE, TRUE, NA, FALSE, TRUE, NA),
    b = c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, NA, NA, NA)
)

**Darren Tsai** · Accepted Answer · 2023-07-18T09:29:43+00:00

A vectorized solution using pmax():

df$result <- as.logical(do.call(\(...) pmax(..., na.rm = TRUE), df[1:2]))

df
#       a     b expected result
# 1 FALSE FALSE    FALSE  FALSE
# 2  TRUE FALSE     TRUE   TRUE
# 3    NA FALSE    FALSE  FALSE
# 4 FALSE  TRUE     TRUE   TRUE
# 5  TRUE  TRUE     TRUE   TRUE
# 6    NA  TRUE     TRUE   TRUE
# 7 FALSE    NA    FALSE  FALSE
# 8  TRUE    NA     TRUE   TRUE
# 9    NA    NA       NA     NA

You can also merge all the parameters into a list to avoid the anonymous function in do.call(). I rewrite it as a function rowAnys to complement rowSums/rowMeans in base.

rowAnys <- function(x) {
  as.logical(do.call(pmax, c(na.rm = TRUE, x)))
}

You could also use pmin to implement rowwise-all().

rowAlls <- function(x) {
  as.logical(do.call(pmin, c(na.rm = TRUE, x)))
}

df$any <- rowAnys(df[1:2])
df$all <- rowAlls(df[1:2])

df
#       a     b expected   any   all
# 1 FALSE FALSE    FALSE FALSE FALSE
# 2  TRUE FALSE     TRUE  TRUE FALSE
# 3    NA FALSE    FALSE FALSE FALSE
# 4 FALSE  TRUE     TRUE  TRUE FALSE
# 5  TRUE  TRUE     TRUE  TRUE  TRUE
# 6    NA  TRUE     TRUE  TRUE  TRUE
# 7 FALSE    NA    FALSE FALSE FALSE
# 8  TRUE    NA     TRUE  TRUE  TRUE
# 9    NA    NA       NA    NA    NA

TechQA.

Applying function rowwise efficiently

There are 6 answers

data

Related Questions in R

Related Questions in DATAFRAME

Related Questions in LOGICAL-OPERATORS

Related Questions in ROWWISE

Popular Questions

Trending Questions