I am facing an issue with calculating a mean on Long Format Data.
The dataset I am using is very long and quite complex, so I will just reproduce 4 cases.
dta = structure(list(idno = c(12501, 12501, 12501, 12501, 12501, 12502,
12502, 12502, 12502, 12502, 12502, 12502, 12502, 12502, 12502,
12502, 12502, 12591, 12591, 12591, 12591, 12591, 12591, 12591,
12591, 12591, 12591, 12591, 12591, 12591, 12591, 12591, 12592,
12592, 12592, 12592, 12592), isex = structure(c(1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L), .Label = c("FEMALE", "MALE"), class = "factor"), children = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), variable = structure(c(1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
144L, 1L, 2L, 3L, 4L, 5L, 135L, 136L, 137L, 138L, 139L, 140L,
141L, 142L, 143L, 144L, 1L, 2L, 3L, 4L, 5L), .Label = c("act1.001",
"act1.002", "act1.003", "act1.004", "act1.005", "act1.006", "act1.007",
"act1.008", "act1.009", "act1.010", "act1.011", "act1.012", "act1.013",
"act1.014", "act1.015", "act1.016", "act1.017", "act1.018", "act1.019",
"act1.020", "act1.021", "act1.022", "act1.023", "act1.024", "act1.025",
"act1.026", "act1.027", "act1.028", "act1.029", "act1.030", "act1.031",
"act1.032", "act1.033", "act1.034", "act1.035", "act1.036", "act1.037",
"act1.038", "act1.039", "act1.040", "act1.041", "act1.042", "act1.043",
"act1.044", "act1.045", "act1.046", "act1.047", "act1.048", "act1.049",
"act1.050", "act1.051", "act1.052", "act1.053", "act1.054", "act1.055",
"act1.056", "act1.057", "act1.058", "act1.059", "act1.060", "act1.061",
"act1.062", "act1.063", "act1.064", "act1.065", "act1.066", "act1.067",
"act1.068", "act1.069", "act1.070", "act1.071", "act1.072", "act1.073",
"act1.074", "act1.075", "act1.076", "act1.077", "act1.078", "act1.079",
"act1.080", "act1.081", "act1.082", "act1.083", "act1.084", "act1.085",
"act1.086", "act1.087", "act1.088", "act1.089", "act1.090", "act1.091",
"act1.092", "act1.093", "act1.094", "act1.095", "act1.096", "act1.097",
"act1.098", "act1.099", "act1.100", "act1.101", "act1.102", "act1.103",
"act1.104", "act1.105", "act1.106", "act1.107", "act1.108", "act1.109",
"act1.110", "act1.111", "act1.112", "act1.113", "act1.114", "act1.115",
"act1.116", "act1.117", "act1.118", "act1.119", "act1.120", "act1.121",
"act1.122", "act1.123", "act1.124", "act1.125", "act1.126", "act1.127",
"act1.128", "act1.129", "act1.130", "act1.131", "act1.132", "act1.133",
"act1.134", "act1.135", "act1.136", "act1.137", "act1.138", "act1.139",
"act1.140", "act1.141", "act1.142", "act1.143", "act1.144"), class = "factor"),
value = structure(c(8L, 9L, 9L, 9L, 9L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("a Sleep",
"c Work", "e Travel/Commute", "f Cooking", "g Housework",
"h Odd jobs", "i Eating", "j Child care", "k Care for others",
"m Leisure", "u TV/Radio", "v Others", "t Visiting/Socialising",
"p Telephone/Online Communication", "b Personal care", "d Studies/library",
"s Religious", "l Shopping", "q Computing/Internet", "n Highbrow",
"o Sport", "r Civic"), class = "factor"), sleep = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10)), .Names = c("idno", "isex", "children", "variable",
"value", "sleep"), row.names = c(1L, 2L, 3L, 4L, 5L, 145L, 146L,
147L, 148L, 149L, 150L, 151L, 152L, 153L, 154L, 155L, 288L, 289L,
290L, 291L, 292L, 293L, 423L, 424L, 425L, 426L, 427L, 428L, 429L,
430L, 431L, 432L, 433L, 434L, 435L, 436L, 437L), class = "data.frame")
The first 5 rows look like this.
idno isex children variable value sleep
1 12501 FEMALE 0 act1.001 j Child care 0
2 12501 FEMALE 0 act1.002 k Care for others 0
3 12501 FEMALE 0 act1.003 k Care for others 0
4 12501 FEMALE 0 act1.004 k Care for others 0
5 12501 FEMALE 0 act1.005 k Care for others 0
idno is the personal identifier, then the sex, the children, the time stamp, the activity and a last row where I created a dummy for sleep.
If I use aggregate
aggregate(sleep ~ isex, data = dta, FUN = sum)
150 / 2 # 2 women in my dataset
50 / 2 # 2 men in my dataset
I get as mean
75 minutes of sleep for women and 25 for men.
With summarise
I can calculate the sum
of all activities like this by gender. We can see that the sum of sleep is correct.
dta %>%
group_by(isex, value, add = T) %>%
summarise(smn = n()*10)
isex value smn
1 FEMALE a Sleep 150
2 FEMALE j Child care 10
3 FEMALE k Care for others 40
4 MALE a Sleep 50
5 MALE j Child care 120
My problem is that I need to now divide this smn
sum by the correct number n
of groups (so in this minimal example 2 and 2).
So simply using mean
does not work
dta %>%
group_by(isex, value, add = T) %>%
summarise(smn = n()*10, mean(smn) )
Using distinct
does not give me the correct sum for activities
dta %>%
distinct(idno) %>%
group_by(isex, value, add = T) %>%
summarise(smn = n()*10)
So I need to use distinct
to get the correct sum to divide my sum of activities not to calculate this sum of activities. So distinct
should be used after the first summarise
. I am not figuring out how to do it.
Any clue ?
desired output
isex value smn mean
1 FEMALE a Sleep 150 75
2 FEMALE j Child care 10 5
3 FEMALE k Care for others 40 20
4 MALE a Sleep 50 25
5 MALE j Child care 120 60
One thing: my problem is that in my original dataset the number of male/female (or other groups) are not even, so I want to avoid dividing by hand.
Hm, I guess reusing
group_by
withadd=TRUE
is one way:There is probably some way with one of the dplyr join functions, too.