calculate mean for Long Format - n_distinct issue and group_by issue

280 views Asked by At

I am facing an issue with calculating a mean on Long Format Data.

The dataset I am using is very long and quite complex, so I will just reproduce 4 cases.

dta = structure(list(idno = c(12501, 12501, 12501, 12501, 12501, 12502, 
12502, 12502, 12502, 12502, 12502, 12502, 12502, 12502, 12502, 
12502, 12502, 12591, 12591, 12591, 12591, 12591, 12591, 12591, 
12591, 12591, 12591, 12591, 12591, 12591, 12591, 12591, 12592, 
12592, 12592, 12592, 12592), isex = structure(c(1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
2L), .Label = c("FEMALE", "MALE"), class = "factor"), children = c(0, 
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), variable = structure(c(1L, 
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 
144L, 1L, 2L, 3L, 4L, 5L, 135L, 136L, 137L, 138L, 139L, 140L, 
141L, 142L, 143L, 144L, 1L, 2L, 3L, 4L, 5L), .Label = c("act1.001", 
"act1.002", "act1.003", "act1.004", "act1.005", "act1.006", "act1.007", 
"act1.008", "act1.009", "act1.010", "act1.011", "act1.012", "act1.013", 
"act1.014", "act1.015", "act1.016", "act1.017", "act1.018", "act1.019", 
"act1.020", "act1.021", "act1.022", "act1.023", "act1.024", "act1.025", 
"act1.026", "act1.027", "act1.028", "act1.029", "act1.030", "act1.031", 
"act1.032", "act1.033", "act1.034", "act1.035", "act1.036", "act1.037", 
"act1.038", "act1.039", "act1.040", "act1.041", "act1.042", "act1.043", 
"act1.044", "act1.045", "act1.046", "act1.047", "act1.048", "act1.049", 
"act1.050", "act1.051", "act1.052", "act1.053", "act1.054", "act1.055", 
"act1.056", "act1.057", "act1.058", "act1.059", "act1.060", "act1.061", 
"act1.062", "act1.063", "act1.064", "act1.065", "act1.066", "act1.067", 
"act1.068", "act1.069", "act1.070", "act1.071", "act1.072", "act1.073", 
"act1.074", "act1.075", "act1.076", "act1.077", "act1.078", "act1.079", 
"act1.080", "act1.081", "act1.082", "act1.083", "act1.084", "act1.085", 
"act1.086", "act1.087", "act1.088", "act1.089", "act1.090", "act1.091", 
"act1.092", "act1.093", "act1.094", "act1.095", "act1.096", "act1.097", 
"act1.098", "act1.099", "act1.100", "act1.101", "act1.102", "act1.103", 
"act1.104", "act1.105", "act1.106", "act1.107", "act1.108", "act1.109", 
"act1.110", "act1.111", "act1.112", "act1.113", "act1.114", "act1.115", 
"act1.116", "act1.117", "act1.118", "act1.119", "act1.120", "act1.121", 
"act1.122", "act1.123", "act1.124", "act1.125", "act1.126", "act1.127", 
"act1.128", "act1.129", "act1.130", "act1.131", "act1.132", "act1.133", 
"act1.134", "act1.135", "act1.136", "act1.137", "act1.138", "act1.139", 
"act1.140", "act1.141", "act1.142", "act1.143", "act1.144"), class = "factor"), 
   value = structure(c(8L, 9L, 9L, 9L, 9L, 8L, 8L, 8L, 8L, 8L, 
   8L, 8L, 8L, 8L, 8L, 8L, 8L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
   1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("a Sleep", 
   "c Work", "e Travel/Commute", "f Cooking", "g Housework", 
   "h Odd jobs", "i Eating", "j Child care", "k Care for others", 
   "m Leisure", "u TV/Radio", "v Others", "t Visiting/Socialising", 
   "p Telephone/Online Communication", "b Personal care", "d Studies/library", 
   "s Religious", "l Shopping", "q Computing/Internet", "n Highbrow", 
   "o Sport", "r Civic"), class = "factor"), sleep = c(0, 0, 
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10, 
   10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
   10, 10)), .Names = c("idno", "isex", "children", "variable", 
"value", "sleep"), row.names = c(1L, 2L, 3L, 4L, 5L, 145L, 146L, 
 147L, 148L, 149L, 150L, 151L, 152L, 153L, 154L, 155L, 288L, 289L, 
 290L, 291L, 292L, 293L, 423L, 424L, 425L, 426L, 427L, 428L, 429L, 
 430L, 431L, 432L, 433L, 434L, 435L, 436L, 437L), class = "data.frame")

The first 5 rows look like this.

     idno   isex children variable             value sleep
1   12501 FEMALE        0 act1.001      j Child care     0
2   12501 FEMALE        0 act1.002 k Care for others     0
3   12501 FEMALE        0 act1.003 k Care for others     0
4   12501 FEMALE        0 act1.004 k Care for others     0
5   12501 FEMALE        0 act1.005 k Care for others     0

idno is the personal identifier, then the sex, the children, the time stamp, the activity and a last row where I created a dummy for sleep.

If I use aggregate

aggregate(sleep ~ isex, data = dta, FUN = sum)
150 / 2 # 2 women in my dataset
50 / 2 # 2 men in my dataset

I get as mean 75 minutes of sleep for women and 25 for men.

With summarise I can calculate the sum of all activities like this by gender. We can see that the sum of sleep is correct.

dta %>% 
 group_by(isex, value, add = T) %>% 
 summarise(smn = n()*10) 

    isex             value smn
1 FEMALE           a Sleep 150
2 FEMALE      j Child care  10
3 FEMALE k Care for others  40
4   MALE           a Sleep  50
5   MALE      j Child care 120

My problem is that I need to now divide this smn sum by the correct number n of groups (so in this minimal example 2 and 2).

So simply using mean does not work

dta %>% 
 group_by(isex, value, add = T) %>% 
 summarise(smn = n()*10, mean(smn) ) 

Using distinct does not give me the correct sum for activities

dta %>% 
  distinct(idno) %>%  
  group_by(isex, value, add = T) %>% 
  summarise(smn = n()*10) 

So I need to use distinct to get the correct sum to divide my sum of activities not to calculate this sum of activities. So distinct should be used after the first summarise. I am not figuring out how to do it.

Any clue ?

desired output

     isex             value smn mean
1 FEMALE           a Sleep 150   75
2 FEMALE      j Child care  10    5
3 FEMALE k Care for others  40   20
4   MALE           a Sleep  50   25
5   MALE      j Child care 120   60

One thing: my problem is that in my original dataset the number of male/female (or other groups) are not even, so I want to avoid dividing by hand.

1

There are 1 answers

0
Frank On BEST ANSWER

Hm, I guess reusing group_by with add=TRUE is one way:

dta %>% 
  group_by(isex) %>% 
  mutate( ng = n_distinct(idno) ) %>% 
  group_by(value,add=TRUE) %>%
  summarise( smn = 10*n(), mean = 10*n()/ng[1] )
#     isex             value smn mean
# 1 FEMALE           a Sleep 150   75
# 2 FEMALE      j Child care  10    5
# 3 FEMALE k Care for others  40   20
# 4   MALE           a Sleep  50   25
# 5   MALE      j Child care 120   60

There is probably some way with one of the dplyr join functions, too.