Row wise merge comma separated values in a `data.table`

217 views Asked by At

I have a data.table DT as follows.

DT <- structure(list(ID = c("Bats", "HL", "JL", "Spidey", "Supes", 
"X"), List1 = c("Morrison, Brubaker, Daniel, Loeb", "David, Bryne, Lee", 
"", "Loeb, Lee", "Moore, Siegel, Millar", "Bendis, Whendon"), 
    List2 = c("Rucka, Kane, Morrison", "Lee, Mantlo, Bryne", 
    "Meltzer, Sekowsky, Morrison", "Waid, Yost, Kirby, Lee", 
    "", "Claremont, Whendon, Morrison")), .Names = c("ID", "List1", 
"List2"), row.names = c(NA, -6L), class = c("data.table", "data.frame"
), .internal.selfref = NULL, sorted = "ID")

DT

      ID                            List1                        List2
1:   Bats Morrison, Brubaker, Daniel, Loeb        Rucka, Kane, Morrison
2:     HL                David, Bryne, Lee           Lee, Mantlo, Bryne
3:     JL                                   Meltzer, Sekowsky, Morrison
4: Spidey                        Loeb, Lee       Waid, Yost, Kirby, Lee
5:  Supes            Moore, Siegel, Millar                             
6:      X                  Bendis, Whendon Claremont, Whendon, Morrison

I would like to merge the two lists row-wise in columns DT$List1 and DT$List2 together without duplicates.

I can do this with apply as follows.

DT$merged <- apply(DT,1,function(vec){
  paste(unique(strsplit(paste(vec[2],vec[3],sep=", "),", ")[[1]]),collapse=", ")
})

     DT
       ID                            List1                        List2
1:   Bats Morrison, Brubaker, Daniel, Loeb        Rucka, Kane, Morrison
2:     HL                David, Bryne, Lee           Lee, Mantlo, Bryne
3:     JL                                   Meltzer, Sekowsky, Morrison
4: Spidey                        Loeb, Lee       Waid, Yost, Kirby, Lee
5:  Supes            Moore, Siegel, Millar                             
6:      X                  Bendis, Whendon Claremont, Whendon, Morrison
                                          merged
1: Morrison, Brubaker, Daniel, Loeb, Rucka, Kane
2:                     David, Bryne, Lee, Mantlo
3:                 , Meltzer, Sekowsky, Morrison
4:                  Loeb, Lee, Waid, Yost, Kirby
5:                         Moore, Siegel, Millar
6:          Bendis, Whendon, Claremont, Morrison

How to get the same result efficiently with data.table without the ", " in beginning and end due to empty cells?

4

There are 4 answers

1
eddi On BEST ANSWER
DT[, merged := toString(unique(c(strsplit(List1, split = ", ")[[1]],
                                 strsplit(List2, split = ", ")[[1]]))), by = ID][]
#       ID                            List1                        List2
#1:   Bats Morrison, Brubaker, Daniel, Loeb        Rucka, Kane, Morrison
#2:     HL                David, Bryne, Lee           Lee, Mantlo, Bryne
#3:     JL                                   Meltzer, Sekowsky, Morrison
#4: Spidey                        Loeb, Lee       Waid, Yost, Kirby, Lee
#5:  Supes            Moore, Siegel, Millar                             
#6:      X                  Bendis, Whendon Claremont, Whendon, Morrison
#                                          merged
#1: Morrison, Brubaker, Daniel, Loeb, Rucka, Kane
#2:                     David, Bryne, Lee, Mantlo
#3:                   Meltzer, Sekowsky, Morrison
#4:                  Loeb, Lee, Waid, Yost, Kirby
#5:                         Moore, Siegel, Millar
#6:          Bendis, Whendon, Claremont, Morrison

Or replace the 'by' with 1:nrow(DT) if your ID column is not unique.

0
akrun On

Try

library(data.table)#v1.9.5+
DT[, merged := do.call(paste, c(.SD, sep=", ")), .SDcols= List1:List2
 ][, merged:=unlist(lapply(strsplit(merged, ", "),
          function(x)  toString(unique(x))))]   
#     ID                            List1                        List2
#1:   Bats Morrison, Brubaker, Daniel, Loeb        Rucka, Kane, Morrison
#2:     HL                David, Bryne, Lee           Lee, Mantlo, Bryne
#3:     JL                                   Meltzer, Sekowsky, Morrison
#4: Spidey                        Loeb, Lee       Waid, Yost, Kirby, Lee
#5:  Supes            Moore, Siegel, Millar                             
#6:      X                  Bendis, Whendon Claremont, Whendon, Morrison
#                                          merged
#1: Morrison, Brubaker, Daniel, Loeb, Rucka, Kane
#2:                     David, Bryne, Lee, Mantlo
#3:                 , Meltzer, Sekowsky, Morrison
#4:                  Loeb, Lee, Waid, Yost, Kirby
#5:                         Moore, Siegel, Millar
#6:          Bendis, Whendon, Claremont, Morrison

Or we could use regex after do.call(paste to remove the duplicate elements

 DT[, merged := gsub('^,\\s*|(\\b\\S+\\b)(?=.*\\b\\1\\b.*),\\s*|,\\s*$', '',
     do.call(paste, c(.SD, sep=", ")), perl=TRUE), .SDcols = List1:List2]
#   ID                            List1                        List2
#1:   Bats Morrison, Brubaker, Daniel, Loeb        Rucka, Kane, Morrison
#2:     HL                David, Bryne, Lee           Lee, Mantlo, Bryne
#3:     JL                                   Meltzer, Sekowsky, Morrison
#4: Spidey                        Loeb, Lee       Waid, Yost, Kirby, Lee
#5:  Supes            Moore, Siegel, Millar                             
#6:      X                  Bendis, Whendon Claremont, Whendon, Morrison
 #                                         merged
#1: Brubaker, Daniel, Loeb, Rucka, Kane, Morrison
#2:                     David, Lee, Mantlo, Bryne
#3:                   Meltzer, Sekowsky, Morrison
#4:                  Loeb, Waid, Yost, Kirby, Lee
#5:                         Moore, Siegel, Millar
#6:          Bendis, Claremont, Whendon, Morrison
0
Veerendra Gadekar On

This could be another way

DT[, merged:= paste(union(unlist(strsplit(List1, ', ')),
              unlist(strsplit(List2, ', '))), collapse = ', '), by = ID]

#> DT
#       ID                            List1                        List2
#1:   Bats Morrison, Brubaker, Daniel, Loeb        Rucka, Kane, Morrison
#2:     HL                David, Bryne, Lee           Lee, Mantlo, Bryne
#3:     JL                                   Meltzer, Sekowsky, Morrison
#4: Spidey                        Loeb, Lee       Waid, Yost, Kirby, Lee
#5:  Supes            Moore, Siegel, Millar                             
#6:      X                  Bendis, Whendon Claremont, Whendon, Morrison
#                                          merged
#1: Morrison, Brubaker, Daniel, Loeb, Rucka, Kane
#2:                     David, Bryne, Lee, Mantlo
#3:                   Meltzer, Sekowsky, Morrison
#4:                  Loeb, Lee, Waid, Yost, Kirby
#5:                         Moore, Siegel, Millar
#6:          Bendis, Whendon, Claremont, Morrison
0
Saksham On

A simple example to show another approach. The function trimComma just trims off the the commas at beginning and at the end. It also trims repeating commas and can easily be used with paste

trimComma <-function(x)
{
    gsub(",,",",",gsub("^,+|,+$|","",x))
}