Making long data wide with nested categories

84 views Asked by At

I'm trying to convert long data to a wide format, however I have multiple categories that need to be nested. My current data looks like this:

YRTR    sub_cou SUBJ    PATH    path_count  pre_drop_count  freq
20173   ACCT 2251   ACCT    1051 -> 2251    1   235 0.40%
20183   ACCT 2251   ACCT    1051 -> 2251    1   217 0.50%
20203   ACCT 2251   ACCT    1051 -> 2251    1   248 0.40%
20213   ACCT 2251   ACCT    1051 -> 2251    1   219 0.50%
20213   ACCT 2251   ACCT    1051 and 2251 -> NA 1   219 0.50%
20173   ACCT 2251   ACCT    1853 -> 2251    2   235 0.90%
20183   ACCT 2251   ACCT    2251 -> 1051    1   217 0.50%
20173   ACCT 2251   ACCT    2251 -> 2251    224 235 95.30%
20183   ACCT 2251   ACCT    2251 -> 2251    210 217 96.80%
20193   ACCT 2251   ACCT    2251 -> 2251    240 258 93%
20203   ACCT 2251   ACCT    2251 -> 2251    223 248 89.90%
20213   ACCT 2251   ACCT    2251 -> 2251    204 219 93.20%
20173   ACCT 2251   ACCT    2251 -> NA  11  235 4.70%
20183   ACCT 2251   ACCT    2251 -> NA  6   217 2.80%
20193   ACCT 2251   ACCT    2251 -> NA  18  258 7.00%
20203   ACCT 2251   ACCT    2251 -> NA  25  248 10.10%
20213   ACCT 2251   ACCT    2251 -> NA  14  219 6.40%
20173   ACCT 2251   ACCT    NA -> 2251  17  235 7.20%
20183   ACCT 2251   ACCT    NA -> 2251  23  217 10.60%
20193   ACCT 2251   ACCT    NA -> 2251  29  258 11%
20203   ACCT 2251   ACCT    NA -> 2251  37  248 14.90%
20213   ACCT 2251   ACCT    NA -> 2251  40  219 18.30%


I'm trying to turn this into a wide format by YRTR, but also have the values of path_count, pre_drop_count and freq. So ideally it would look something like this:

            20173           20183           20193           20203       
sub_cou     SUBJ    PATH    path_count  pre_drop_count  freq    path_count  pre_drop_count  freq    path_count  pre_drop_count  freq    path_count  pre_drop_count  freq
ACCT 2251   ACCT    1853 -> 2251    2   235 0.90%   NA  NA  NA  NA  NA  NA  NA  NA  NA
ACCT 2251   ACCT    NA -> 2251  17  235 7.20%   23  217 10.60%  29  258 11% 37  248 14.90%
ACCT 2251   ACCT    2251 -> NA  11  235 4.70%   6   217 2.80%   18  258 7.00%   25  248 10.10%
ACCT 2251   ACCT    2251 -> 2251    224 235 95.30%  210 217 96.80%  240 258 93% 223 248 89.90%
ACCT 2251   ACCT    1051 -> 2251    1   235 0.40%   1   217 0.50%   NA  NA  NA  1   248 0.40%
ACCT 2251   ACCT    2251 -> 1051    NA  NA  NA  1   217 0.50%   NA  NA  NA  NA  NA  NA

I've tried using dcast but it seems to only want to put YRTR on top.

Edited to add dput() output:

dput(path_agg2)
structure(list(YRTR = c(20173L, 20173L, 20173L, 20173L, 20173L, 
20183L, 20183L, 20183L, 20183L, 20183L, 20193L, 20193L, 20193L, 
20203L, 20203L, 20203L, 20203L, 20213L, 20213L, 20213L, 20213L, 
20213L), sub_cou = c("ACCT 2251", "ACCT 2251", "ACCT 2251", "ACCT 2251", 
"ACCT 2251", "ACCT 2251", "ACCT 2251", "ACCT 2251", "ACCT 2251", 
"ACCT 2251", "ACCT 2251", "ACCT 2251", "ACCT 2251", "ACCT 2251", 
"ACCT 2251", "ACCT 2251", "ACCT 2251", "ACCT 2251", "ACCT 2251", 
"ACCT 2251", "ACCT 2251", "ACCT 2251"), SUBJ = c("ACCT", "ACCT", 
"ACCT", "ACCT", "ACCT", "ACCT", "ACCT", "ACCT", "ACCT", "ACCT", 
"ACCT", "ACCT", "ACCT", "ACCT", "ACCT", "ACCT", "ACCT", "ACCT", 
"ACCT", "ACCT", "ACCT", "ACCT"), PATH = c("1853 -> 2251", "NA -> 2251", 
"2251 -> NA", "2251 -> 2251", "1051 -> 2251", "2251 -> NA", "2251 -> 1051", 
"2251 -> 2251", "1051 -> 2251", "NA -> 2251", "NA -> 2251", "2251 -> 2251", 
"2251 -> NA", "2251 -> 2251", "2251 -> NA", "NA -> 2251", "1051 -> 2251", 
"2251 -> 2251", "1051 and 2251 -> NA", "1051 -> 2251", "2251 -> NA", 
"NA -> 2251"), path_count = c(2L, 17L, 11L, 224L, 1L, 6L, 1L, 
210L, 1L, 23L, 29L, 240L, 18L, 223L, 25L, 37L, 1L, 204L, 1L, 
1L, 14L, 40L), pre_drop_count = c(235L, 235L, 235L, 235L, 235L, 
217L, 217L, 217L, 217L, 217L, 258L, 258L, 258L, 248L, 248L, 248L, 
248L, 219L, 219L, 219L, 219L, 219L), freq = c("0.9%", "7.2%", 
"4.7%", "95.3%", "0.4%", "2.8%", "0.5%", "96.8%", "0.5%", "10.6%", 
"11.2%", "93%", "7%", "89.9%", "10.1%", "14.9%", "0.4%", "93.2%", 
"0.5%", "0.5%", "6.4%", "18.3%")), row.names = c(NA, -22L), class = "data.frame")
1

There are 1 answers

0
Karthik S On BEST ANSWER

Does this answer:

> path_agg2_wider <- path_agg2 %>% pivot_wider(
+   names_from = YRTR, 
+   values_from = c(path_count, pre_drop_count, freq)
+ )
> path_agg2_wider <- path_agg2_wider[c(1:3,4,9,14,5,10,15,6,11,16,7,12,17,8,13,18)]
> path_agg2_wider
# A tibble: 7 x 18
  sub_cou SUBJ  PATH  path_count_20173 pre_drop_count_~ freq_20173 path_count_20183 pre_drop_count_~ freq_20183 path_count_20193 pre_drop_count_~ freq_20193
  <chr>   <chr> <chr>            <int>            <int> <chr>                 <int>            <int> <chr>                 <int>            <int> <chr>     
1 ACCT 2~ ACCT  1853~                2              235 0.9%                     NA               NA NA                       NA               NA NA        
2 ACCT 2~ ACCT  NA -~               17              235 7.2%                     23              217 10.6%                    29              258 11.2%     
3 ACCT 2~ ACCT  2251~               11              235 4.7%                      6              217 2.8%                     18              258 7%        
4 ACCT 2~ ACCT  2251~              224              235 95.3%                   210              217 96.8%                   240              258 93%       
5 ACCT 2~ ACCT  1051~                1              235 0.4%                      1              217 0.5%                     NA               NA NA        
6 ACCT 2~ ACCT  2251~               NA               NA NA                        1              217 0.5%                     NA               NA NA        
7 ACCT 2~ ACCT  1051~               NA               NA NA                       NA               NA NA                       NA               NA NA        
# ... with 6 more variables: path_count_20203 <int>, pre_drop_count_20203 <int>, freq_20203 <chr>, path_count_20213 <int>, pre_drop_count_20213 <int>,
#   freq_20213 <chr>
>