R - calculate proportions across columns with common prefix

55 views Asked by At

I have a dataframe with 37 columns in it, with a representative sample, df below

 df <- structure(list(irm = 201201:201202, trans11 = c(379L, 433L), 
    trans12 = 4:3, trans13 = 5:4, trans14 = c(13L, 3L), trans15 = c(29L, 
    21L), trans16 = c(0L, 0L), trans21 = c(6L, 4L), trans22 = 2:1, 
    trans23 = c(0L, 0L), trans24 = 0:1, trans25 = c(0L, 0L), 
    trans26 = c(0L, 0L), trans31 = c(2L, 2L), trans32 = c(0L, 
    0L), trans33 = 5:6, trans34 = c(0L, 0L), trans35 = c(7L, 
    2L), trans36 = c(0L, 0L), trans41 = c(4L, 10L), trans42 = c(0L, 
    0L), trans43 = c(0L, 0L), trans44 = c(4L, 10L), trans45 = c(3L, 
    1L), trans46 = c(0L, 0L), trans51 = c(15L, 18L), trans52 = c(0L, 
    0L), trans53 = c(1L, 1L), trans54 = c(4L, 0L), trans55 = c(96L, 
    115L), trans56 = c(0L, 0L), trans61 = c(0L, 0L), trans62 = c(0L, 
    0L), trans63 = c(0L, 0L), trans64 = c(0L, 0L), trans65 = c(0L, 
    0L), trans66 = c(0L, 0L)), row.names = c(NA, -2L), class = c("data.table", 
"data.frame"))

The dataframe has 37 columns: trans11...trans16...trans61...trans66 plus the irm column for month.

What I would like to do is the following:

  1. for each row/column entry in trans11 all the way through trans66, calculate the proportion of the entry relative to the sum of all other columns with the same prefix (e.g. trans1). So for the example here, the first row in entries 2 through 7 would be: (0.8813953, 0.009302326, 0.01162791, 0.03023256, 0.06744186, 0) for columns 2 through 7 (since we need to sum trans11....trans16)

  2. How would I do this for all 36 columns in the larger df?

Is there a way to do this with group_by and starts_with from dplyr? I know a for loop is probably possible but any and all suggestions are welcome.

Thanks

Updated example with larger dataframe example

3

There are 3 answers

1
Onyambu On BEST ANSWER

in base R use prop.table/propotions:

cbind(df, prop = prop.table(as.matrix(df[-1]), 1))

      irm trans11 trans12 trans13 trans14 trans15 trans16 prop.trans11 prop.trans12 prop.trans13 prop.trans14 prop.trans15 prop.trans16
1  201201     379       4       5      13      29       0    0.8813953  0.009302326  0.011627907  0.030232558   0.06744186            0
2  201202     433       3       4       3      21       0    0.9331897  0.006465517  0.008620690  0.006465517   0.04525862            0
3  201203     468       2       9       9      19       0    0.9230769  0.003944773  0.017751479  0.017751479   0.03747535            0
4  201204     514       3       9       8      15       0    0.9362477  0.005464481  0.016393443  0.014571949   0.02732240            0
5  201205     559      11       8       6      12       0    0.9379195  0.018456376  0.013422819  0.010067114   0.02013423            0
6  201206     566       5       9       6      16       0    0.9401993  0.008305648  0.014950166  0.009966777   0.02657807            0
7  201207     577       3      13       4      21       0    0.9336570  0.004854369  0.021035599  0.006472492   0.03398058            0
8  201208     622       7      14      10      15       0    0.9311377  0.010479042  0.020958084  0.014970060   0.02245509            0
9  201209     665       4      12       3      11       0    0.9568345  0.005755396  0.017266187  0.004316547   0.01582734            0
10 201210     692       4      11       7      15       0    0.9492455  0.005486968  0.015089163  0.009602195   0.02057613            0
11 201211     738       6       5       5       9       0    0.9672346  0.007863696  0.006553080  0.006553080   0.01179554            0
12 201212     830       8       8       4      11       0    0.9639954  0.009291521  0.009291521  0.004645761   0.01277584            0

If you need to select using trans name:

cbind(df, prop = prop.table(as.matrix(df[startsWith(names(df), 'trans')]), 1))

Edit

With the edit of the data, use:

library(tidyverse)
df %>%
  pivot_longer(starts_with('trans'), names_to = c('tr', 'grp'), 
                 names_pattern = '(.*)(.)') %>%
  mutate(value = value/sum(value), .by = c(irm, tr)) %>%
  pivot_wider(names_from = c(tr, grp), names_sep = '', names_prefix = 'prop_')
5
Gregor Thomas On
df |>
  mutate(
    row_sum = rowSums(across(starts_with("trans"))),
    across(starts_with("trans"), \(x) x / row_sum, .names = "{.col}_prop")
    )
#        irm trans11 trans12 trans13 trans14 trans15 trans16 row_sum trans11_prop trans12_prop
#  1: 201201     379       4       5      13      29       0     430    0.8813953  0.009302326
#  2: 201202     433       3       4       3      21       0     464    0.9331897  0.006465517
#  3: 201203     468       2       9       9      19       0     507    0.9230769  0.003944773
#  4: 201204     514       3       9       8      15       0     549    0.9362477  0.005464481
#  5: 201205     559      11       8       6      12       0     596    0.9379195  0.018456376
#  6: 201206     566       5       9       6      16       0     602    0.9401993  0.008305648
#  7: 201207     577       3      13       4      21       0     618    0.9336570  0.004854369
#  8: 201208     622       7      14      10      15       0     668    0.9311377  0.010479042
#  9: 201209     665       4      12       3      11       0     695    0.9568345  0.005755396
# 10: 201210     692       4      11       7      15       0     729    0.9492455  0.005486968
# 11: 201211     738       6       5       5       9       0     763    0.9672346  0.007863696
# 12: 201212     830       8       8       4      11       0     861    0.9639954  0.009291521
#     trans13_prop trans14_prop trans15_prop trans16_prop
#  1:  0.011627907  0.030232558   0.06744186            0
#  2:  0.008620690  0.006465517   0.04525862            0
#  3:  0.017751479  0.017751479   0.03747535            0
#  4:  0.016393443  0.014571949   0.02732240            0
#  5:  0.013422819  0.010067114   0.02013423            0
#  6:  0.014950166  0.009966777   0.02657807            0
#  7:  0.021035599  0.006472492   0.03398058            0
#  8:  0.020958084  0.014970060   0.02245509            0
#  9:  0.017266187  0.004316547   0.01582734            0
# 10:  0.015089163  0.009602195   0.02057613            0
# 11:  0.006553080  0.006553080   0.01179554            0
# 12:  0.009291521  0.004645761   0.01277584            0

You can, of course, drop the row_sum column after.

0
TarJae On

Here is a plain tidyverse solution by first pivoting into long format, doing the calculation and then pivoting to wide.

library(tidyr)
library(dplyr)
df %>%
  pivot_longer(-irm, names_to = "trans", values_to = "value") %>%
  mutate(sum_group = sum(value), proportion = value / sum_group, .by=c(irm), .keep = "unused") %>% 
  pivot_wider(names_from = "trans", values_from = "proportion", names_glue = "{trans}_prop") %>% 
  right_join(df, join_by(irm)) 
 irm sum_group trans11_prop trans12_prop trans13_prop trans14_prop trans15_prop trans16_prop trans11 trans12 trans13 trans14 trans15 trans16
    <int>     <int>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>   <int>   <int>   <int>   <int>   <int>   <int>
 1 201201       430        0.881      0.00930      0.0116       0.0302        0.0674            0     379       4       5      13      29       0
 2 201202       464        0.933      0.00647      0.00862      0.00647       0.0453            0     433       3       4       3      21       0
 3 201203       507        0.923      0.00394      0.0178       0.0178        0.0375            0     468       2       9       9      19       0
 4 201204       549        0.936      0.00546      0.0164       0.0146        0.0273            0     514       3       9       8      15       0
 5 201205       596        0.938      0.0185       0.0134       0.0101        0.0201            0     559      11       8       6      12       0
 6 201206       602        0.940      0.00831      0.0150       0.00997       0.0266            0     566       5       9       6      16       0
 7 201207       618        0.934      0.00485      0.0210       0.00647       0.0340            0     577       3      13       4      21       0
 8 201208       668        0.931      0.0105       0.0210       0.0150        0.0225            0     622       7      14      10      15       0
 9 201209       695        0.957      0.00576      0.0173       0.00432       0.0158            0     665       4      12       3      11       0
10 201210       729        0.949      0.00549      0.0151       0.00960       0.0206            0     692       4      11       7      15       0
11 201211       763        0.967      0.00786      0.00655      0.00655       0.0118            0     738       6       5       5       9       0
12 201212       861        0.964      0.00929      0.00929      0.00465       0.0128            0     830       8       8       4      11       0