Replicate column names and splice

53 views Asked by At

I have a dataframe as follows:

df <- data.frame(A = c(2, 0, 1), B = c(0, 3, 2))

#   A B
# 1 2 0
# 2 0 3
# 3 1 2

The number in each cell indicates the times for which the corresponding column name should repeat. The replicates should be spliced by semicolons(;) to a single string. The expected output turns out to

#      A     B
# 1  A;A  <NA>
# 2 <NA> B;B;B
# 3    A   B;B

I'm searching a efficient way to deal with a much larger dataset:

set.seed(1234)
df <- as.data.frame(matrix(sample(0:5, 1e4*26, replace = TRUE), 1e4, 26))
names(df) <- LETTERS

#    A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
# 1  3 0 3 5 4 0 3 2 2 1 3 3 4 3 2 4 0 1 4 5 2 5 5 2 0 0
# 2  1 5 1 0 3 3 2 0 1 5 5 2 5 0 2 5 1 1 2 4 5 5 0 5 0 0
# 3  5 5 2 0 1 4 5 4 0 5 5 1 1 1 2 2 4 5 4 5 5 5 0 4 0 0
# ...
# [ reached 'max' / getOption("max.print") -- omitted 9997 rows ]

I prefer base or tidyverse solutions. data.table is welcome but I'm unfamiliar to it though.

2

There are 2 answers

0
Darren Tsai On BEST ANSWER

One base option:

res <- df
n <- unlist(res)
res[res > 0] <- sapply(
  split(rep.int(names(df)[col(df)], n), rep.int(seq_along(n), n)),
  paste, collapse = ';'
)
res[res == 0] <- NA

res[1:5, 1:5]
#           A         B       C         D       E
# 1     A;A;A      <NA>   C;C;C D;D;D;D;D E;E;E;E
# 2         A B;B;B;B;B       C      <NA>   E;E;E
# 3 A;A;A;A;A B;B;B;B;B     C;C      <NA>       E
# 4   A;A;A;A   B;B;B;B    <NA>         D E;E;E;E
# 5     A;A;A         B C;C;C;C       D;D   E;E;E
0
Maël On

Here is a base R option:

df[] <- mapply(\(x, y) sapply(x, \(z) ifelse(z == 0, NA, paste(rep(y, z), collapse = ";"))),
       df, names(df))

#      A     B
# 1  A;A  <NA>
# 2 <NA> B;B;B
# 3    A   B;B

And converted to tidyverse:

library(purrr)
library(dplyr)
df %>% 
 imap_dfr(\(x, y) sapply(x, \(z) ifelse(z == 0, NA, paste(rep(y, z), collapse = ";"))))

Another dplyr option (but probably slow):

df %>% 
  rowwise() %>% 
  mutate(across(everything(), ~ ifelse(.x == 0, NA, paste(rep(rep(cur_column(), n()), .x), collapse = ";"))))

Benchmark:

library(microbenchmark)
mb <- microbenchmark(
  darren = {res <- df
  n <- unlist(res)
  res[res > 0] <- sapply(
    split(rep.int(names(df)[col(df)], n), rep.int(seq_along(n), n)),
    paste, collapse = ';'
  )
  res[res == 0] <- NA},
  mael = mapply(\(x, y) sapply(x, \(z) ifelse(z == 0, NA, paste(rep(y, z), collapse = ";"))),
                df, names(df)),
  setup = {
    set.seed(1234)
    df <- as.data.frame(matrix(sample(0:5, 1e4*26, replace = TRUE), 1e4, 26))
    names(df) <- LETTERS
  },
  times = 10
)

#Unit: seconds
#   expr      min       lq     mean   median       uq      max neval
# darren 1.017067 1.042890 1.116436 1.105676 1.203968 1.227238    10
#   mael 1.426412 1.518794 1.576665 1.581085 1.628206 1.713553    10