Can I control the order of each compression in TukeyHSD?

37 views Asked by At

I'm trying to create a function that will compute ANOVA and the following Tukey test for multiple Ys (taken from a list of columns Ylist), and summarise both tests results of all Ys in one df. to do so, I have this working code:

anova_fun <- function(df,x.var,z.var){
  aov_res <- map(df[Ylist], ~tryCatch(aov(formula(englue(".~{{x.var}} * {{z.var}}")), data=df), error = function(e) NULL))
  # Apply Tukey's HSD test to the results of each ANOVA test:
   tukey_res <- map(aov_res, ~tryCatch(TukeyHSD(., ordered = TRUE), error = function(e) NULL))
  
  # Convert the results of each ANOVA test into a tidy data frame using the broom package:
    aov_res_df <- bind_rows(map(aov_res, broom::tidy), .id = "metal") %>%
    mutate(metal = sub('\\..*', '', metal)) %>%
    dcast(metal~paste0('p.value.',term), value.var='p.value')
  print(aov_res_df)
  
  # Combine the results of the Tukey HSD tests into a single data frame:
    tukey_res_df <- bind_rows(map(tukey_res, tidy), .id = "metal") %>%
    mutate(metal = sub('\\..*', '', metal)) %>%
    dcast(metal~paste0('p.value.',contrast), value.var='adj.p.value')
  print(tukey_res_df)
  
  # Merge ANOVA and Tukey results
  final_stat_summary <- merge(aov_res_df,tukey_res_df, by="metal")
  #print all results:
  print(final_stat_summary)
  
  print(tukey_res)
}

The problem is that the Tukey function changes the order of the pairs compared for different Ys in the columns list, which results in a final df with two different columns for each comparison pair (an x:y column and a y:x column).

My df:

structure(list(treatment = structure(c(3L, 3L, 4L, 4L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 4L, 4L, 3L, 
3L, 4L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 3L, 3L, 3L, 4L, 4L, 4L, 3L, 
4L, 4L, 3L, 3L, 3L, 4L, 3L, 4L, 4L, 3L, 3L, 3L, 3L, 4L), levels = c("capture: external reference", 
"external reference", "reference", "pollution"), class = "factor"), 
    year = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L), levels = c("2018", 
    "2019", "2020", "2021", "2022"), class = "factor"), `51 V ug/g` = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, 6.94, 4.28, NA, 2.07, 3.17, 8.72, 2.2, 0.66, 0.58, 0.69, 
    1.58, 1.24, 0.5, 0.25, NA, 0.49, NA, 0.32, 0.49, 0.3, 3.03, 
    2.71, 2.7, 2.74, 1.69, 7.47, 2.12, 1.92, 5.45, 3.47, 1.64, 
    3.15, 2.49, 15.16, NA), `55 Mn ug/g` = c(22.77, NA, 3.3, 
    NA, NA, NA, NA, NA, 17.46444095, NA, 2.233532391, 15.02875187, 
    15.22884699, 3.150832343, NA, 10.07593931, 4.2494323, 7.2, 
    6.88, NA, 2.28, 7.56, 6.45, 1.29, 2.83, 10.27, 6, 10.13, 
    2.89, 5.09, 3.36, NA, 3.4, NA, 1.6, 1.71, 2.67, 1.59, 2.08, 
    0.71, 13.94, 4.86, 46.52, 9.5, 6.17, 30.5, 19.48, 5.94, 15.63, 
    14.39, 200.04, NA), `75 As ug/g` = c(NA, NA, NA, NA, NA, 
    NA, NA, NA, 0.342496594, NA, 0.060462912, 0.421820923, 0.273621642, 
    0.028194996, NA, 0.313784484, 0.088320976, 0, 0.06, NA, 0, 
    0.25, 0, 0.05, 6.69, 4.72, 9.19, 9.69, 7.84, 8.04, 9.66, 
    NA, 7.18, NA, 5.82, 3.53, 4.81, 0.4, 0, 0.17, 0.29, 0.27, 
    0.75, 0.24, 0.21, 0.44, 0.35, 0.17, 0.48, 0.28, 0.98, NA)), row.names = c(52L, 
54L, 56L, 66L, 67L, 68L, 69L, 70L, 100L, 101L, 102L, 103L, 105L, 
106L, 107L, 108L, 109L, 253L, 257L, 259L, 264L, 266L, 272L, 273L, 
275L, 279L, 280L, 281L, 282L, 283L, 285L, 289L, 291L, 292L, 293L, 
294L, 296L, 299L, 301L, 303L, 609L, 612L, 622L, 625L, 629L, 632L, 
634L, 635L, 636L, 1023L, 1025L, 1027L), class = "data.frame")

the current output for anova_fun(df,treatment,year) with Ylist <- c("51 V ug/g","55 Mn ug/g","75 As ug/g") is:

), p.value.Residuals = c(NA_real_, NA_real_, NA_real_), p.value.treatment = c(0.111169242030236, 
0.184759581382741, 0.787284487247997), `p.value.treatment:year` = c(0.361344304889175, 
0.938903024618852, 0.96409565381219), p.value.year = c(0.0209814550433628, 
0.00024500226229898, 0.00988351135656023), `p.value.2018-2019` = c(NA, 
0.9983781688177, NA), `p.value.2018-2020` = c(NA, 0.986264806623042, 
NA), `p.value.2019-2020` = c(NA, 0.997480737457968, NA), `p.value.2020-2019` = c(NA, 
NA, 0.0451094846184962), `p.value.2020-2021` = c(NA, NA, 0.031422447488233
), `p.value.2020-2022` = c(NA, NA, 0.500278777416374), `p.value.2021-2018` = c(NA, 
0.999968007265246, NA), `p.value.2021-2019` = c(NA, 0.96949318678529, 
0.999798201731843), `p.value.2021-2020` = c(0.669820951745168, 
0.783021573522353, NA), `p.value.2022-2018` = c(NA, 0.00953298879421094, 
NA), `p.value.2022-2019` = c(NA, 0.000396295689156001, 0.996953515620612
), `p.value.2022-2020` = c(0.020095427470792, 7.76976999404821e-05, 
NA), `p.value.2022-2021` = c(0.073001843847973, 0.000871521773720296, 
0.998620882399395), `p.value.pollution-reference` = c(NA, NA, 
0.787284487247999), `p.value.pollution:2018-pollution:2020` = c(NA, 
1, NA), `p.value.pollution:2019-pollution:2018` = c(NA, 0.999999989107945, 
NA), `p.value.pollution:2019-pollution:2020` = c(NA, 0.999997734276393, 
NA), `p.value.pollution:2019-reference:2020` = c(NA, 0.999999996427652, 
NA), `p.value.pollution:2020-pollution:2019` = c(NA, NA, 0.82519269122796
), `p.value.pollution:2020-pollution:2021` = c(NA, NA, 0.742025372423251
), `p.value.pollution:2020-reference:2019` = c(NA, NA, 0.510130719610712
), `p.value.pollution:2020-reference:2021` = c(NA, NA, 0.483951673040763
), `p.value.pollution:2020-reference:2022` = c(NA, NA, 0.910103171279935
), `p.value.pollution:2021-pollution:2018` = c(NA, 0.999857162309916, 
NA), `p.value.pollution:2021-pollution:2019` = c(NA, 0.999974561040774, 
0.99999999944096), `p.value.pollution:2021-pollution:2020` = c(0.780702792073819, 
0.985242652525984, NA), `p.value.pollution:2021-reference:2019` = c(NA, 
0.999899683576947, 0.99999999997636), `p.value.pollution:2021-reference:2020` = c(0.997113879316256, 
0.996646806211096, NA), `p.value.pollution:2021-reference:2021` = c(0.999653630512921, 
0.999999909209083, NA), `p.value.pollution:2022-pollution:2018` = c(NA_real_, 
NA_real_, NA_real_), `p.value.pollution:2022-pollution:2019` = c(NA_real_, 
NA_real_, NA_real_), `p.value.pollution:2022-pollution:2020` = c(NA_real_, 
NA_real_, NA_real_), `p.value.pollution:2022-pollution:2021` = c(NA_real_, 
NA_real_, NA_real_), `p.value.pollution:2022-reference:2018` = c(NA_real_, 
NA_real_, NA_real_), `p.value.pollution:2022-reference:2019` = c(NA_real_, 
NA_real_, NA_real_), `p.value.pollution:2022-reference:2020` = c(NA_real_, 
NA_real_, NA_real_), `p.value.pollution:2022-reference:2021` = c(NA_real_, 
NA_real_, NA_real_), `p.value.pollution:2022-reference:2022` = c(NA_real_, 
NA_real_, NA_real_), `p.value.reference-pollution` = c(0.111169242030245, 
0.184759581382805, NA), `p.value.reference:2018-pollution:2018` = c(NA, 
0.999896472417036, NA), `p.value.reference:2018-pollution:2019` = c(NA, 
0.999983334586107, NA), `p.value.reference:2018-pollution:2020` = c(NA, 
0.998364300827686, NA), `p.value.reference:2018-pollution:2021` = c(NA, 
0.999999999962931, NA), `p.value.reference:2018-reference:2019` = c(NA, 
0.999971056558036, NA), `p.value.reference:2018-reference:2020` = c(NA, 
0.999621426820569, NA), `p.value.reference:2018-reference:2021` = c(NA, 
0.999999787278022, NA), `p.value.reference:2019-pollution:2018` = c(NA, 
0.999999928415538, NA), `p.value.reference:2019-pollution:2019` = c(NA, 
1, 0.999999999992951), `p.value.reference:2019-pollution:2020` = c(NA, 
0.999914906147447, NA), `p.value.reference:2019-reference:2020` = c(NA, 
0.999999610065901, NA), `p.value.reference:2020-pollution:2018` = c(NA, 
0.999999999967094, NA), `p.value.reference:2020-pollution:2019` = c(NA, 
NA, 0.658659950217536), `p.value.reference:2020-pollution:2020` = c(0.800197990112071, 
0.999998962635691, 0.999762421864156), `p.value.reference:2020-pollution:2021` = c(NA, 
NA, 0.521543410831375), `p.value.reference:2020-reference:2019` = c(NA, 
NA, 0.262715545162835), `p.value.reference:2020-reference:2021` = c(NA, 
NA, 0.230073516904054), `p.value.reference:2020-reference:2022` = c(NA, 
NA, 0.781042590686855), `p.value.reference:2021-pollution:2018` = c(NA, 
0.999980347970849, NA), `p.value.reference:2021-pollution:2019` = c(NA, 
0.999999262667568, 0.9999999976967), `p.value.reference:2021-pollution:2020` = c(0.807962768154783, 
0.989373660078433, NA), `p.value.reference:2021-pollution:2021` = c(NA, 
NA, 1), `p.value.reference:2021-reference:2019` = c(NA, 0.999995936114535, 
0.999999999708513), `p.value.reference:2021-reference:2020` = c(0.999972117720937, 
0.998678507959776, NA), `p.value.reference:2022-pollution:2018` = c(NA, 
0.0478260734990693, NA), `p.value.reference:2022-pollution:2019` = c(NA, 
0.0130027870898417, 0.999999828018741), `p.value.reference:2022-pollution:2020` = c(0.0190147041746224, 
0.000249668223134436, NA), `p.value.reference:2022-pollution:2021` = c(0.347260235453882, 
0.0164495943288261, 0.999999990943905), `p.value.reference:2022-reference:2018` = c(NA, 
0.187046550589803, NA), `p.value.reference:2022-reference:2019` = c(NA, 
0.00181547271725346, 0.99999987095751), `p.value.reference:2022-reference:2020` = c(0.0866039340188859, 
0.000328231469116091, NA), `p.value.reference:2022-reference:2021` = c(0.149313701197782, 
0.00284645715600984, 0.999999987560445)), row.names = c(NA, -3L
), class = "data.frame")

the desired ouput will be the same df but with one column for each comparison, for example, one column combining the two: p.value.2019-2020 and p.value.2020-2019.

What am I doing wrong? tnx!

0

There are 0 answers