R string split, to normalized (long) format with running index

851 views Asked by At

I have this data frame

structure(list(rule.id = c(1, 2), rules = structure(1:2, .Label = c("Lamp1.1,Lamp1.2", 
"Lamp2.1,Lamp2.2"), class = "factor")), .Names = c("rule.id", 
"rules"), row.names = c(NA, -2L), class = "data.frame")

#  rule.id           rules
#1       1 Lamp1.1,Lamp1.2
#2       2 Lamp2.1,Lamp2.2

which I need to split on the "rules" column by separator comma (","), multiple commas occur (not only 2 like in example) and then transform this into a normalized format with keeping the relevant rule.id value from the original df. The result should look like this:

structure(list(rule.id = c(1, 1, 2, 2), lhs = c("Lamp1.1", "Lamp1.2", 
"Lamp2.1", "Lamp2.1")), .Names = c("rule.id", "lhs"), row.names = c(NA, 
-4L), class = "data.frame")

#  rule.id     lhs
#1       1 Lamp1.1
#2       1 Lamp1.2
#3       2 Lamp2.1
#4       2 Lamp2.1

I have a code that takes care of the str split and normalize (long) format, but not sure how to take care of the rule.id requirement

lhs.norm <- as.data.frame(
  cbind(
    rules.df$ruleid, 
    unlist(strsplit(
      unlist(lapply(strsplit(unlist(lapply(as.character(rules.df$rules),function(x) substr(x,2,nchar(x)))), "} =>", fixed = T), function(x) x[1]))
      ,","))))

thanks to @acrun solution using

cSplit(rules.df.lhs, "lhs", ",", "long"))

I benchmarked 19 seconds for 1M rows (result was around 2M rows)

2

There are 2 answers

4
akrun On BEST ANSWER

We can use cSplit from splitstackshape

library(splitstackshape)
cSplit(df, "rules", ",", "long")
#   rule.id   rules
#1:       1 Lamp1.1
#2:       1 Lamp1.2
#3:       2 Lamp2.1
#4:       2 Lamp2.2

If this is a huge dataset, we can use stringi to split

library(stringi)
lst <- stri_split_fixed(df$rules, ",")
df2 <- data.frame(rule.id = rep(df$rule.id, lengths(lst)),
                  rules = unlist(lst))
df2
#   rule.id   rules
#1       1 Lamp1.1
#2       1 Lamp1.2
#3       2 Lamp2.1
#4       2 Lamp2.2

Another option is data.table

library(data.table)
setDT(df)[, strsplit(as.character(rules), ","), by = rule.id]
0
jay.sf On

With the new base pipes we can make @akrun's great solution using stringi::stri_split_fixed even faster. This also exploits recycling of the rule.id column.

stringi::stri_split_fixed(d$rules, ",") |>
  unlist() |>
  cbind(d[1])
#   unlist(stringi::stri_split_fixed(d$rules, ",")) rule.id
# 1                                         Lamp1.1       1
# 2                                         Lamp1.2       2
# 3                                         Lamp2.1       1
# 4                                         Lamp2.2       2

Benchmark

sapply(c('splitstackshape', 'stringi', 'data.table', 'reshape2'),
       library, character.only=TRUE)
dl <- data.frame(rule.id=1:1e6, rules=d$rules)
microbenchmark::microbenchmark(
  melt=cbind(dl[1],  do.call(rbind, strsplit(as.character(dl$rules), ',', fixed=T))) |>
    reshape2::melt('rule.id'),
  cbind=stri_split_fixed(dl$rules, ",") |>
    unlist() |>
    cbind(dl[1]),
  dtable=as.data.table(dl)[, strsplit(as.character(rules), ","), by = rule.id],
  cSplit=cSplit(dl, "rules", ",", "long"),
  stringi={lst <- stri_split_fixed(dl$rules, ",")
  data.frame(rule.id = rep(dl$rule.id, lengths(lst)),
             rules = unlist(lst))}, times=3L)
# Unit: milliseconds
#    expr        min         lq       mean     median         uq        max neval  cld
#    melt  1472.5459  1518.5649  1608.8124  1564.5838  1676.9456  1789.3075     3  b  
#   cbind   335.7105   365.9372   380.9120   396.1639   403.5128   410.8617     3 a   
#  dtable 10414.8486 10605.5725 10674.1134 10796.2965 10803.7458 10811.1951     3    d
#  cSplit  3003.0660  3079.3098  3232.6108  3155.5537  3347.3832  3539.2128     3   c 
# stringi   421.1481   469.1054   518.9577   517.0627   567.8626   618.6624     3 a   
# Warning messages:
# 1: In type.convert.default(unlist(x, use.names = FALSE)) :
#   'as.is' should be specified by the caller; using TRUE
# 2: In type.convert.default(unlist(x, use.names = FALSE)) :
#   'as.is' should be specified by the caller; using TRUE
# 3: In type.convert.default(unlist(x, use.names = FALSE)) :
#   'as.is' should be specified by the caller; using TRUE

Note: The warnings stem from cSplit() which code probably wasn't updated for a long time.


Data

d <- structure(list(rule.id = c(1, 2), rules = structure(1:2, .Label = c("Lamp1.1,Lamp1.2",
                                                                         "Lamp2.1,Lamp2.2"), class = "factor")), .Names = c("rule.id",
                                                                                                                            "rules"), row.names = c(NA, -2L), class = "data.frame")