simplifying R code using dplyr (or other) to rowSums while ignoring NA, unlss all is NA

909 views Asked by At

I've initially solved my NA-issue helped by this questions. However, I would like to simplify my code. In the past, I've enjoyed the way dplyr has helped me simplify R code.

Below is a minimal working example illustrating my current solution and where I am at with dplyr.

I have data like this,

dta <- data.frame(foo=c(1,NA,3,4), fooZ=c(4,NA,5,NA), fooQ2=c(7,0,9,NA))

I need to summarize quite a few rows, using 0 as a value that I sum and keeping NA's for rows with all NA. Like this,

dta$sum1 <- rowSums(dta[, c('fooZ', 'fooQ2') ], na.rm=TRUE) * ifelse(
      rowSums(is.na(dta[, c('fooZ', 'fooQ2') ])) == 
               ncol(dta[, c('fooZ', 'fooQ2') ]), NA, 1)
dta
# >   foo fooZ fooQ2 sum1
# > 1   1    4     7   11
# > 2  NA   NA     0    0
# > 3   3    5     9   14
# > 4   4   NA    NA   NA

This does the trick and creates sum1, but I have to repeat the reference to the data three times. Can I simplify this in some handy way? I've made the below code using dplyr, but maybe there's a better way of summarizing rows; while keeping NA for rows that have all NA, ignoring NA's in rows with one or more values, and treating 0 a value to be 'summarized'?

# install.packages(c("dplyr", "ggplot2"), dependencies = TRUE)
require(dplyr)
dta$sum2 = dta %>% select(fooZ, fooQ2) %>% rowSums(., na.rm = TRUE)
dta
# >   foo fooZ fooQ2 sum1 sum2
# > 1   1    4     7   11   11
# > 2  NA   NA     0    0    0
# > 3   3    5     9   14   14
# > 4   4   NA    NA   NA    0

This creates sum2, but generates a 0 if na.rm = TRUE and too many NA's if na.rm = F.

Update as of 16 22:18:33Z

I made this somewhat elaborate micro-benchmark comparison of the different answer. Please feel do not haste to optimize any of the function. Writing R functions is not my force. Regardless,

set.seed(667)
n <- 1e5+22
dta <- data.frame(
  foo = sample(c(1:10, NA), n, replace = TRUE),
  fooZ = sample(c(1:10, NA), n, replace = TRUE),
  fooQ2 = sample(c(1:10, NA), n, replace = TRUE))

slice <- c(902:907,979:984)
dta[slice,]
#>     foo fooZ fooQ2
#> 902  10    7     2
#> 903  10   10     9
#> 904  NA   NA     8
#> 905   6    4     3
#> 906   8    9    10
#> 907   1    5    NA
#> 979  NA    1     1
#> 980  10    2    NA
#> 981   7   NA    NA
#> 982   3    7     7
#> 983  NA    9     6
#> 984   7   10     7


# `baseline' solution
baseline <- function(z, ...) {W  <- z[, c(...)]; W <- rowSums(W, na.rm=TRUE) * ifelse(rowSums(is.na(W)) == ncol(W), NA, 1); W}

# install.packages(c("dplyr", "ggplot2"), dependencies = TRUE)
require(dplyr)
# G. G.Gro's dplyr solution
G.Gro_dplyr1 <- function(z, ...) z %>% mutate(sum2 = select(., ...) %>% { rowSums(., na.rm = TRUE) + ifelse(apply(is.na(.), 1, all), NA, 0) })

# G. G.Gro's Variation 1a solution
G.Gro_dplyr1a <- function(z, ...) z %>% mutate(sum2 = select(., fooZ, fooQ2) %>% apply(1, . %>% { sum(., na.rm = TRUE) + if (all(is.na(.))) NA else 0}))

# G. G.Gro's base solution
G.Gro_base <- function(z, ...) {W  <- z[, c(...)]; S = {X <- dta[, c("fooZ", "fooQ2")]; rowSums(X, na.rm = TRUE) + ifelse(apply(is.na(X), 1, all), NA, 0)}; S}

# Thierry's solution
Thierry_my_sum <- function(z, ...){z <- select(z, ...); sums <- rowSums(z, na.rm = TRUE); sums[apply(is.na(z), 1, all)] <- NA; sums}

# lmo's solution
lmo <- function(z, ...) {W  <- z[, c(...)]; rowSums(W, na.rm=TRUE) * (NA^(rowSums(is.na(W)) == ncol(W)))}

# Benjamin's solution
Benjamin <- function(..., na.rm = FALSE, all.na = NA){v <- list(...); all_na <- lapply(v, is.na); all_na <- Reduce(`&`, all_na); all_na; if (na.rm){v <- lapply(v, function(x) {x[is.na(x)] <- 0; x}); }; v <- Reduce(`+`, v); v[all_na] <- all.na; v;}

# Aramis7d's solution
Aramis7d <- function(z, ...) {z %>% select(...) %>% mutate(sum = rowSums(., na.rm=TRUE)) %>% mutate(s2 = rowSums(is.na(.))) %>% mutate(sum = if_else(s2 < 2, sum, as.double(NA))) %>%  select(sum) }

# Fail's solution combining from all
Fail <- function(z, ...){z <- select(z, ...); zTF <- rowMeans(is.na(z)) == 1; replace(rowSums(z, na.rm = TRUE), zTF, NA)}

# install.packages("microbenchmark", dependencies = TRUE)
require(microbenchmark)

# run test
res <- microbenchmark(
baseline(dta, c("fooZ", "fooQ2")),
Thierry_my_sum(dta, fooZ, fooQ2),
G.Gro_dplyr1(dta, fooZ, fooQ2)[,ncol(dta)+1],
G.Gro_dplyr1a(dta, fooZ, fooQ2)[, ncol(dta) + 1],
G.Gro_base(dta, c("fooZ", "fooQ2")),
(dta %>% mutate(sum99 = Benjamin(fooZ, fooQ2, na.rm = TRUE)))[,ncol(dta)+1],
lmo(dta, c("fooZ", "fooQ2")),
Aramis7d(dta, fooZ, fooQ2)[,1],
Fail(dta, fooZ, fooQ2),
 times = 25)

# clean up
levels(res[[1]]) <- c('baseline', 'Thierry', 'G.Gro1', 'G.Gro1a', 'G.Gro2', 'Benjamin', 'lmo', 'Aramis7d', 'Fail')

## Print results:
print(res)

 print(res)
#> Unit: milliseconds
#>      expr        min         lq        mean     median          uq        max neval cld
#>  baseline  12.729803  15.691060   31.141114  23.299101   48.694436   72.83702    25   a  
#>   Thierry 215.541035 241.795764  298.319826 263.822553  363.066476  494.90875    25   b 
#>    G.Gro1 226.761181 242.617099  295.413437 264.911513  307.339115  591.28424    25   b 
#>   G.Gro1a 935.176542 985.329298 1088.300741 997.788858 1030.085839 1736.51506    25   c
#>    G.Gro2 219.650080 227.464694  292.898566 246.188189  320.789036  505.08154    25   b 
#>  Benjamin   6.227054   9.327364   15.583907  11.230079   14.345366   55.44653    25   a  
#>       lmo   4.138434   5.970850    9.329506   6.851132    8.406799   39.40295    25   a  
#>  Aramis7d  33.966101  38.737671   60.777304  66.663967   72.686939  100.72799    25   a  
#>      Fail  11.464254  13.932386   20.476011  14.865245   25.156740   58.37730    25   a  

### Plot results:
boxplot(res)

box-

5

There are 5 answers

3
lmo On BEST ANSWER

Here's a base R trick using exponentiation of NA:

rowSums(dta[-1], na.rm=TRUE) * (NA^(rowSums(is.na(dta[-1])) == ncol(dta[-1])))
[1] 11  8 14 NA

Any number to the 0th power is 1, so any rows that contain a non-NA value return a 1 in the second term. Otherwise, NA is returned.

This assumes that you only want to take into account variables other than your first variable.

Combining the improvements the OP made to the code above with an additional step, we could improve the efficiency with

rowSumsNA <- function(dat, ...) {
    W <- data.matrix(dat[...])
    rowSums(W, na.rm=TRUE) * (NA^(rowSums(is.na(W)) == ncol(W)))
}

Most of the improvements are in the OP's method of storing the subset data.frame prior to the calculation (127ms vs 84ms on my machine), but a slight additional improvement can be had by converting that data.frame to a matrix prior to calling rowSums (84ms vs 77ms on my machine).

2
Benjamin On

Not as elegant as the other solutions, but it avoids having to drop variables from the data frame and then rejoin. So this is good if you're interested in keeping your data frame intact. It will lose it's advantage if you have a lot of variables to include.

dta %>% 
  mutate(all_na = Reduce(`&`, lapply(list(fooZ, fooQ2), is.na)),
         sum1 = Reduce(`+`, lapply(list(fooZ, fooQ2), function(x) {x[is.na(x)] <- 0; x})),
         sum1 = ifelse(all_na, NA, sum1)) %>% 
  select(-all_na)

Alternatively, you can bundle it into a function:

rsum <- function(..., na.rm = FALSE, all.na = NA){

  v <- list(...)

  all_na <- lapply(v, is.na)
  all_na <- Reduce(`&`, all_na)
  all_na

  if (na.rm){
    v <- lapply(v, function(x) {x[is.na(x)] <- 0; x})
  }

  v <- Reduce(`+`, v)

  v[all_na] <- all.na
  v
}

dta %>% 
  mutate(sum1 = rsum(fooZ, fooQ2, na.rm = TRUE))
4
Thierry On

Here is a simple dplyr solution

library(dplyr)
dta <- data.frame(foo=c(1,NA,3,4), fooZ=c(4,NA,5,NA), fooQ2=c(7,0,9,NA))
my_sum <- function(z, ...){
  z <- select(z, ...)
  sums <- rowSums(z, na.rm = TRUE)
  sums[apply(is.na(z), 1, all)] <- NA
  sums
}

dta %>%
  mutate(
    sum1 = my_sum(., fooZ, fooQ2),
    sum2 = my_sum(., foo, fooQ2),
    sum3 = my_sum(., foo, fooZ)
  )
7
G. Grothendieck On

1) dplyr This computes the row sums and then adds on NA or 0 depending on whether the entire row is NA or not.

dta %>%
    mutate(sum2 = select(., fooZ, fooQ2) %>%
                  { rowSums(., na.rm = TRUE) + ifelse(apply(is.na(.), 1, all), NA, 0) })

giving:

  foo fooZ fooQ2 sum2
1   1    4     7   11
2  NA   NA     8    8
3   3    5     9   14
4   4   NA    NA   NA

1a) Variation A variation of (1) is:

dta %>%
    mutate(sum2 = select(., fooZ, fooQ2) %>%
        apply(1, . %>% { sum(., na.rm = TRUE) + if (all(is.na(.))) NA else 0}))

2) base Using no packages we can do this:

transform(dta, sum2 = { 
      X <- data.frame(fooZ, fooQ2)
      rowSums(X, na.rm = TRUE) + ifelse(apply(is.na(X), 1, all), NA, 0)
})

3) data.table

library(data.table)
DT <- as.data.table(dta)
DT[, sum2 := rowSums(.SD, na.rm = TRUE) + ifelse(apply(is.na(.SD), 1, all), NA, 0) , .SDcols = c("fooZ", "fooQ2")]

Update: Moved select inside mutate to preserve foo column. Added additional solutions.

1
Aramis7d On

alternately, using dplyr, you can try something like:

dta %>%
  select(-foo) %>%
  mutate(sum1 = rowSums(., na.rm=TRUE)) %>%
  mutate(s2 = rowSums(is.na(.))) %>%  
  mutate(sum1 = if_else(s2 < 2, sum1, as.double(NA))) %>%
  bind_cols(dta) %>%
  select(foo, fooZ, fooQ2, sum1)

which gives:

  foo fooZ fooQ2 sum1
1   1    4     7   11
2  NA   NA     8    8
3   3    5     9   14
4   4   NA    NA   NA

in case you don't really care about retaining the column foo , you can get rid of the col_bind fucntion call