I'm trying to create a function that will compute ANOVA and the following Tukey test for multiple Ys (taken from a list of columns Ylist), and summarise both tests results of all Ys in one df. to do so, I have this working code:
anova_fun <- function(df,x.var,z.var){
aov_res <- map(df[Ylist], ~tryCatch(aov(formula(englue(".~{{x.var}} * {{z.var}}")), data=df), error = function(e) NULL))
# Apply Tukey's HSD test to the results of each ANOVA test:
tukey_res <- map(aov_res, ~tryCatch(TukeyHSD(., ordered = TRUE), error = function(e) NULL))
# Convert the results of each ANOVA test into a tidy data frame using the broom package:
aov_res_df <- bind_rows(map(aov_res, broom::tidy), .id = "metal") %>%
mutate(metal = sub('\\..*', '', metal)) %>%
dcast(metal~paste0('p.value.',term), value.var='p.value')
print(aov_res_df)
# Combine the results of the Tukey HSD tests into a single data frame:
tukey_res_df <- bind_rows(map(tukey_res, tidy), .id = "metal") %>%
mutate(metal = sub('\\..*', '', metal)) %>%
dcast(metal~paste0('p.value.',contrast), value.var='adj.p.value')
print(tukey_res_df)
# Merge ANOVA and Tukey results
final_stat_summary <- merge(aov_res_df,tukey_res_df, by="metal")
#print all results:
print(final_stat_summary)
print(tukey_res)
}
The problem is that the Tukey function changes the order of the pairs compared for different Ys in the columns list, which results in a final df with two different columns for each comparison pair (an x:y column and a y:x column).
My df:
structure(list(treatment = structure(c(3L, 3L, 4L, 4L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 4L, 4L, 3L,
3L, 4L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 3L, 3L, 3L, 4L, 4L, 4L, 3L,
4L, 4L, 3L, 3L, 3L, 4L, 3L, 4L, 4L, 3L, 3L, 3L, 3L, 4L), levels = c("capture: external reference",
"external reference", "reference", "pollution"), class = "factor"),
year = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L), levels = c("2018",
"2019", "2020", "2021", "2022"), class = "factor"), `51 V ug/g` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 6.94, 4.28, NA, 2.07, 3.17, 8.72, 2.2, 0.66, 0.58, 0.69,
1.58, 1.24, 0.5, 0.25, NA, 0.49, NA, 0.32, 0.49, 0.3, 3.03,
2.71, 2.7, 2.74, 1.69, 7.47, 2.12, 1.92, 5.45, 3.47, 1.64,
3.15, 2.49, 15.16, NA), `55 Mn ug/g` = c(22.77, NA, 3.3,
NA, NA, NA, NA, NA, 17.46444095, NA, 2.233532391, 15.02875187,
15.22884699, 3.150832343, NA, 10.07593931, 4.2494323, 7.2,
6.88, NA, 2.28, 7.56, 6.45, 1.29, 2.83, 10.27, 6, 10.13,
2.89, 5.09, 3.36, NA, 3.4, NA, 1.6, 1.71, 2.67, 1.59, 2.08,
0.71, 13.94, 4.86, 46.52, 9.5, 6.17, 30.5, 19.48, 5.94, 15.63,
14.39, 200.04, NA), `75 As ug/g` = c(NA, NA, NA, NA, NA,
NA, NA, NA, 0.342496594, NA, 0.060462912, 0.421820923, 0.273621642,
0.028194996, NA, 0.313784484, 0.088320976, 0, 0.06, NA, 0,
0.25, 0, 0.05, 6.69, 4.72, 9.19, 9.69, 7.84, 8.04, 9.66,
NA, 7.18, NA, 5.82, 3.53, 4.81, 0.4, 0, 0.17, 0.29, 0.27,
0.75, 0.24, 0.21, 0.44, 0.35, 0.17, 0.48, 0.28, 0.98, NA)), row.names = c(52L,
54L, 56L, 66L, 67L, 68L, 69L, 70L, 100L, 101L, 102L, 103L, 105L,
106L, 107L, 108L, 109L, 253L, 257L, 259L, 264L, 266L, 272L, 273L,
275L, 279L, 280L, 281L, 282L, 283L, 285L, 289L, 291L, 292L, 293L,
294L, 296L, 299L, 301L, 303L, 609L, 612L, 622L, 625L, 629L, 632L,
634L, 635L, 636L, 1023L, 1025L, 1027L), class = "data.frame")
the current output for anova_fun(df,treatment,year) with Ylist <- c("51 V ug/g","55 Mn ug/g","75 As ug/g") is:
), p.value.Residuals = c(NA_real_, NA_real_, NA_real_), p.value.treatment = c(0.111169242030236,
0.184759581382741, 0.787284487247997), `p.value.treatment:year` = c(0.361344304889175,
0.938903024618852, 0.96409565381219), p.value.year = c(0.0209814550433628,
0.00024500226229898, 0.00988351135656023), `p.value.2018-2019` = c(NA,
0.9983781688177, NA), `p.value.2018-2020` = c(NA, 0.986264806623042,
NA), `p.value.2019-2020` = c(NA, 0.997480737457968, NA), `p.value.2020-2019` = c(NA,
NA, 0.0451094846184962), `p.value.2020-2021` = c(NA, NA, 0.031422447488233
), `p.value.2020-2022` = c(NA, NA, 0.500278777416374), `p.value.2021-2018` = c(NA,
0.999968007265246, NA), `p.value.2021-2019` = c(NA, 0.96949318678529,
0.999798201731843), `p.value.2021-2020` = c(0.669820951745168,
0.783021573522353, NA), `p.value.2022-2018` = c(NA, 0.00953298879421094,
NA), `p.value.2022-2019` = c(NA, 0.000396295689156001, 0.996953515620612
), `p.value.2022-2020` = c(0.020095427470792, 7.76976999404821e-05,
NA), `p.value.2022-2021` = c(0.073001843847973, 0.000871521773720296,
0.998620882399395), `p.value.pollution-reference` = c(NA, NA,
0.787284487247999), `p.value.pollution:2018-pollution:2020` = c(NA,
1, NA), `p.value.pollution:2019-pollution:2018` = c(NA, 0.999999989107945,
NA), `p.value.pollution:2019-pollution:2020` = c(NA, 0.999997734276393,
NA), `p.value.pollution:2019-reference:2020` = c(NA, 0.999999996427652,
NA), `p.value.pollution:2020-pollution:2019` = c(NA, NA, 0.82519269122796
), `p.value.pollution:2020-pollution:2021` = c(NA, NA, 0.742025372423251
), `p.value.pollution:2020-reference:2019` = c(NA, NA, 0.510130719610712
), `p.value.pollution:2020-reference:2021` = c(NA, NA, 0.483951673040763
), `p.value.pollution:2020-reference:2022` = c(NA, NA, 0.910103171279935
), `p.value.pollution:2021-pollution:2018` = c(NA, 0.999857162309916,
NA), `p.value.pollution:2021-pollution:2019` = c(NA, 0.999974561040774,
0.99999999944096), `p.value.pollution:2021-pollution:2020` = c(0.780702792073819,
0.985242652525984, NA), `p.value.pollution:2021-reference:2019` = c(NA,
0.999899683576947, 0.99999999997636), `p.value.pollution:2021-reference:2020` = c(0.997113879316256,
0.996646806211096, NA), `p.value.pollution:2021-reference:2021` = c(0.999653630512921,
0.999999909209083, NA), `p.value.pollution:2022-pollution:2018` = c(NA_real_,
NA_real_, NA_real_), `p.value.pollution:2022-pollution:2019` = c(NA_real_,
NA_real_, NA_real_), `p.value.pollution:2022-pollution:2020` = c(NA_real_,
NA_real_, NA_real_), `p.value.pollution:2022-pollution:2021` = c(NA_real_,
NA_real_, NA_real_), `p.value.pollution:2022-reference:2018` = c(NA_real_,
NA_real_, NA_real_), `p.value.pollution:2022-reference:2019` = c(NA_real_,
NA_real_, NA_real_), `p.value.pollution:2022-reference:2020` = c(NA_real_,
NA_real_, NA_real_), `p.value.pollution:2022-reference:2021` = c(NA_real_,
NA_real_, NA_real_), `p.value.pollution:2022-reference:2022` = c(NA_real_,
NA_real_, NA_real_), `p.value.reference-pollution` = c(0.111169242030245,
0.184759581382805, NA), `p.value.reference:2018-pollution:2018` = c(NA,
0.999896472417036, NA), `p.value.reference:2018-pollution:2019` = c(NA,
0.999983334586107, NA), `p.value.reference:2018-pollution:2020` = c(NA,
0.998364300827686, NA), `p.value.reference:2018-pollution:2021` = c(NA,
0.999999999962931, NA), `p.value.reference:2018-reference:2019` = c(NA,
0.999971056558036, NA), `p.value.reference:2018-reference:2020` = c(NA,
0.999621426820569, NA), `p.value.reference:2018-reference:2021` = c(NA,
0.999999787278022, NA), `p.value.reference:2019-pollution:2018` = c(NA,
0.999999928415538, NA), `p.value.reference:2019-pollution:2019` = c(NA,
1, 0.999999999992951), `p.value.reference:2019-pollution:2020` = c(NA,
0.999914906147447, NA), `p.value.reference:2019-reference:2020` = c(NA,
0.999999610065901, NA), `p.value.reference:2020-pollution:2018` = c(NA,
0.999999999967094, NA), `p.value.reference:2020-pollution:2019` = c(NA,
NA, 0.658659950217536), `p.value.reference:2020-pollution:2020` = c(0.800197990112071,
0.999998962635691, 0.999762421864156), `p.value.reference:2020-pollution:2021` = c(NA,
NA, 0.521543410831375), `p.value.reference:2020-reference:2019` = c(NA,
NA, 0.262715545162835), `p.value.reference:2020-reference:2021` = c(NA,
NA, 0.230073516904054), `p.value.reference:2020-reference:2022` = c(NA,
NA, 0.781042590686855), `p.value.reference:2021-pollution:2018` = c(NA,
0.999980347970849, NA), `p.value.reference:2021-pollution:2019` = c(NA,
0.999999262667568, 0.9999999976967), `p.value.reference:2021-pollution:2020` = c(0.807962768154783,
0.989373660078433, NA), `p.value.reference:2021-pollution:2021` = c(NA,
NA, 1), `p.value.reference:2021-reference:2019` = c(NA, 0.999995936114535,
0.999999999708513), `p.value.reference:2021-reference:2020` = c(0.999972117720937,
0.998678507959776, NA), `p.value.reference:2022-pollution:2018` = c(NA,
0.0478260734990693, NA), `p.value.reference:2022-pollution:2019` = c(NA,
0.0130027870898417, 0.999999828018741), `p.value.reference:2022-pollution:2020` = c(0.0190147041746224,
0.000249668223134436, NA), `p.value.reference:2022-pollution:2021` = c(0.347260235453882,
0.0164495943288261, 0.999999990943905), `p.value.reference:2022-reference:2018` = c(NA,
0.187046550589803, NA), `p.value.reference:2022-reference:2019` = c(NA,
0.00181547271725346, 0.99999987095751), `p.value.reference:2022-reference:2020` = c(0.0866039340188859,
0.000328231469116091, NA), `p.value.reference:2022-reference:2021` = c(0.149313701197782,
0.00284645715600984, 0.999999987560445)), row.names = c(NA, -3L
), class = "data.frame")
the desired ouput will be the same df but with one column for each comparison, for example, one column combining the two: p.value.2019-2020 and p.value.2020-2019.
What am I doing wrong? tnx!