Sum up replicate columns in dataframe, keep rows unique

102 views Asked by At

I thought this would be a relatively easy task to do but I couldn't find examples here that weren't focused on summing up rows based on column conditions. What I'm trying to achieve is to sum up column replicates, but keeping rows unique.

Here's what I mean:

                                 MKC100.1 MKC100.2 MKC100.3 MKC103.1 MKC103.2 MKC103.3 MKC104.2 MKC104.3
299fc0ac11fb4afd0da849a2c45583b3        0        0        0        0        0        0        0        1
9bc2bacdfadf4c1352ffbc991803287c     1183     1666     1318        0        0        0       10       20
38b782d9f01c69c3570fe0edd5864dc0      493      626      543       10        0        0        5        5
6d078397349f7d39c34d237a6ef4cb75    43735    51511    46876        0        0        0        1        0
c22e752b441ee4190f27a3690c5d1206        0        0        0     2795     1128     1956        1        1
f6513affb198fb9845741b61ece8db4b       59       58       82        0        0        0        0        0



structure(list(MKC100.1 = c(0L, 1183L, 493L, 43735L, 0L, 59L), 
    MKC100.2 = c(0L, 1666L, 626L, 51511L, 0L, 58L), MKC100.3 = c(0L, 
    1318L, 543L, 46876L, 0L, 82L), MKC103.1 = c(0L, 0L, 10L, 
    0L, 2795L, 0L), MKC103.2 = c(0L, 0L, 0L, 0L, 1128L, 0L), 
    MKC103.3 = c(0L, 0L, 0L, 0L, 1956L, 0L), MKC104.2 = c(0L, 
    10L, 5L, 1L, 1L, 0L), MKC104.3 = c(1L, 20L, 5L, 0L, 1L, 0L
    )), class = "data.frame", row.names = c("299fc0ac11fb4afd0da849a2c45583b3", 
"9bc2bacdfadf4c1352ffbc991803287c", "38b782d9f01c69c3570fe0edd5864dc0", 
"6d078397349f7d39c34d237a6ef4cb75", "c22e752b441ee4190f27a3690c5d1206", 
"f6513affb198fb9845741b61ece8db4b"))

The replicates typically come in three but will have an ".1" or ".2" or ".3" appended to the end of it. I need to essentially merge these all up so that I end up with a dataframe like below:

                                MKC100 MKC103   MKC104
299fc0ac11fb4afd0da849a2c45583b3     0        0       1
9bc2bacdfadf4c1352ffbc991803287c     4167     0       30
38b782d9f01c69c3570fe0edd5864dc0     1662     10      10
6d078397349f7d39c34d237a6ef4cb75     142122   0       1 
c22e752b441ee4190f27a3690c5d1206     0        5879    2   
f6513affb198fb9845741b61ece8db4b     199      0       0

EDIT Sometimes replicates are removed and I won't have all three replicates per sample in the dataframe. Any replicates should still be merged together. I've updated the sample data to reflect this scenario.

3

There are 3 answers

1
r2evans On BEST ANSWER

base R

out <- lapply(split(names(data), sub("\\..*", "", names(data))),
              function(nms) rowSums(data[,nms])) |>
  data.frame()
out
#                                  MKC100 MKC103
# 299fc0ac11fb4afd0da849a2c45583b3      0      0
# 9bc2bacdfadf4c1352ffbc991803287c   4167      0
# 38b782d9f01c69c3570fe0edd5864dc0   1662     10
# 6d078397349f7d39c34d237a6ef4cb75 142122      0
# c22e752b441ee4190f27a3690c5d1206      0   5879
# f6513affb198fb9845741b61ece8db4b    199      0

dplyr+tidyr

A slightly different approach, we'll pivot/summarize/unpivot:

library(dplyr)
library(tidyr)
data |>
  tibble::rownames_to_column() |>
  pivot_longer(cols = -rowname) |>
  mutate(name = sub("\\..*", "", name)) |>
  summarize(value = sum(value), .by = c(rowname, name)) |>
  pivot_wider(id_cols = rowname)
# # A tibble: 6 × 3
#   rowname                          MKC100 MKC103
#   <chr>                             <int>  <int>
# 1 299fc0ac11fb4afd0da849a2c45583b3      0      0
# 2 9bc2bacdfadf4c1352ffbc991803287c   4167      0
# 3 38b782d9f01c69c3570fe0edd5864dc0   1662     10
# 4 6d078397349f7d39c34d237a6ef4cb75 142122      0
# 5 c22e752b441ee4190f27a3690c5d1206      0   5879
# 6 f6513affb198fb9845741b61ece8db4b    199      0

You can add |> tibble::column_to_rownames() if you need them back instead of a column.

purrr

library(purrr)
out <- split(names(data), sub("\\..*", "", names(data))) |>
  map_dfc(~ rowSums(data[,.x])) |>
  as.data.frame()
rownames(out) <- rownames(data)
out
#                                  MKC100 MKC103
# 299fc0ac11fb4afd0da849a2c45583b3      0      0
# 9bc2bacdfadf4c1352ffbc991803287c   4167      0
# 38b782d9f01c69c3570fe0edd5864dc0   1662     10
# 6d078397349f7d39c34d237a6ef4cb75 142122      0
# c22e752b441ee4190f27a3690c5d1206      0   5879
# f6513affb198fb9845741b61ece8db4b    199      0

The use of as.data.frame is to change it from being a tibble, since tidyverse and tibbles specifically are biased against row names.

You may want to consider either discarding the row names entirely, or bringing them into the frame as a column (as I did with rownames_to-column above, easily done in base R with data$rowname <- rownames(data)).

0
s_baldur On

Another base R solution:

res <- data[character(0)] 
cols <- c("MKC100", "MKC103")
res[cols] <- lapply(cols, \(col) rowSums(data[grep(col, names(data))]))

If you have constant pattern length

res[cols] <- lapply(cols, \(col) rowSums(data[col == substr(names(data), 1, 6)]))

Output

#                                  MKC100 MKC103
# 299fc0ac11fb4afd0da849a2c45583b3      0      0
# 9bc2bacdfadf4c1352ffbc991803287c   4167      0
# 38b782d9f01c69c3570fe0edd5864dc0   1662     10
# 6d078397349f7d39c34d237a6ef4cb75 142122      0
# c22e752b441ee4190f27a3690c5d1206      0   5879
# f6513affb198fb9845741b61ece8db4b    199      0
1
ThomasIsCoding On

Here are some base R options:


- with aggregate + reshape + row.names<-

`row.names<-`(aggregate(
    . ~ id,
    reshape(
        data,
        direction = "long",
        varying = 1:ncol(data)
    ), sum
)[-(1:2)], row.names(data))

gives

                                 MKC100 MKC103
299fc0ac11fb4afd0da849a2c45583b3      0      0
9bc2bacdfadf4c1352ffbc991803287c   4167      0
38b782d9f01c69c3570fe0edd5864dc0   1662     10
6d078397349f7d39c34d237a6ef4cb75 142122      0
c22e752b441ee4190f27a3690c5d1206      0   5879
f6513affb198fb9845741b61ece8db4b    199      0

- with split.default + rowSums

as.data.frame(
    lapply(
        split.default(data, sub("\\.\\d+$", "", names(data))),
        rowSums
    )
)

gives

                                 MKC100 MKC103
299fc0ac11fb4afd0da849a2c45583b3      0      0
9bc2bacdfadf4c1352ffbc991803287c   4167      0
38b782d9f01c69c3570fe0edd5864dc0   1662     10
6d078397349f7d39c34d237a6ef4cb75 142122      0
c22e752b441ee4190f27a3690c5d1206      0   5879
f6513affb198fb9845741b61ece8db4b    199      0

- with xtabs + aggregate + as.matrix

xtabs(
    Freq ~ .,
    aggregate(
        Freq ~ .,
        transform(
            as.data.frame.table(as.matrix(data)),
            Var2 = sub("\\.\\d+$", "", Var2)
        ), sum
    )
)

gives

                                  Var2
Var1                               MKC100 MKC103
  299fc0ac11fb4afd0da849a2c45583b3      0      0
  9bc2bacdfadf4c1352ffbc991803287c   4167      0
  38b782d9f01c69c3570fe0edd5864dc0   1662     10
  6d078397349f7d39c34d237a6ef4cb75 142122      0
  c22e752b441ee4190f27a3690c5d1206      0   5879
  f6513affb198fb9845741b61ece8db4b    199      0

- with reshape + aggregate + as.matrix

reshape(
    aggregate(
        Freq ~ .,
        transform(
            as.data.frame.table(as.matrix(data)),
            Var2 = sub("\\.\\d+$", "", Var2)
        ), sum
    ),
    direction = "wide",
    idvar = "Var1",
    timevar = "Var2"
)

gives

                              Var1 Freq.MKC100 Freq.MKC103
1 299fc0ac11fb4afd0da849a2c45583b3           0           0
2 9bc2bacdfadf4c1352ffbc991803287c        4167           0
3 38b782d9f01c69c3570fe0edd5864dc0        1662          10
4 6d078397349f7d39c34d237a6ef4cb75      142122           0
5 c22e752b441ee4190f27a3690c5d1206           0        5879
6 f6513affb198fb9845741b61ece8db4b         199           0

- with aggregate + stack + xtabs

xtabs(
    values ~ grp + ind,
    aggregate(
        values ~ .,
        transform(
            stack(data),
            ind = sub("\\.\\d+$", "", ind),
            grp = row.names(data)[ave(seq_along(ind), ind, FUN = seq_along)]
        ), sum
    )
)

gives

                                  ind
grp                                MKC100 MKC103
  299fc0ac11fb4afd0da849a2c45583b3      0      0
  38b782d9f01c69c3570fe0edd5864dc0   1662     10
  6d078397349f7d39c34d237a6ef4cb75 142122      0
  9bc2bacdfadf4c1352ffbc991803287c   4167      0
  c22e752b441ee4190f27a3690c5d1206      0   5879
  f6513affb198fb9845741b61ece8db4b    199      0