Using summarise in wide vs long format

56 views Asked by At

I have a dataframe with the format below. Status has two levels (PRE,POST).

SI_mean TU_mean ED_mean MT_mean DT_mean SK_mean ATT_mean Status
2.6 2.75 2.6 2.8 3.4 2.5 3.8 PRE
3 3 2.4 2.4 3 3 4 PRE
2.4 2.75 2.4 2.2 2.6 2.25 2.8 PRE

I want to compare the values of level of status per column using wilcox.test. So I immediately tried,

df |>
  summarise(across(contains("mean"),~wilcox.test(.x~Status)$p.value))

but I am greeted by

Error in `summarise()`:
ℹ In argument: `across(contains("mean"), ~wilcox.test(.x ~ Status)$p.value)`.
ℹ In row 1.
Caused by error in `across()`:
! Can't compute column `SI_mean`.
Caused by error in `wilcox.test.formula()`:
! grouping factor must have exactly 2 levels

so I worked with a long-format instead and it works as expected,

df |> pivot_longer(contains("mean"),names_to = "Variable",values_to = "Mean") |>
  group_by(Variable) |>
  summarise(
    wilcox_p_value = wilcox.test(Mean~Status)$p.value
    )

but why did the summarise fail in wide-format?

I'm just interested on what I have misunderstood in summarise functionality and how will I get it to work on a wide-format.

Data

df=structure(list(SI_mean = c(2.6, 3, 2.4, 3, 3, 3.2, 2.2, 4, 3.8, 
2.8, 3.6, 2, 3.6, 3.6, 3.8, 3.2, 3, 4, 4, 3, 3.2, 4, 4, 3.2, 
3.2, 3, 3.2, 3.8, 4, 4, 4, 3), TU_mean = c(2.75, 3, 2.75, 3, 
3, 2.75, 3, 3.5, 3.75, 2.5, 3.25, 2, 3.5, 4, 3, 3.25, 3, 4, 4, 
3, 4, 4, 4, 3.25, 3.25, 3, 3, 3.25, 4, 4, 4, 3), ED_mean = c(2.6, 
2.4, 2.4, 3, 2.8, 4, 2, 3.8, 2.6, 2, 2.8, 2, 3, 3.4, 3, 1, 3, 
4, 3.8, 3, 3, 4, 4, 3.2, 4, 2.6, 4, 4, 3.8, 3.6, 4, 3), MT_mean = c(2.8, 
2.4, 2.2, 3, 2.8, 3.4, 2.2, 3.6, 3.4, 3, 2.6, 1.8, 3.4, 3, 4, 
2, 3, 3.4, 3.4, 3, 4, 4, 4, 3.2, 4, 2.8, 4, 4, 3.8, 3.6, 4, 3
), DT_mean = c(3.4, 3, 2.6, 3, 3, 3.8, 2.4, 3.6, 3, 3, 2.8, 2.4, 
3.6, 3.6, 3, 2.2, 3, 4, 4, 4, 3.6, 4, 4, 3.6, 3.8, 2.8, 4, 4, 
4, 3.8, 4, 3), SK_mean = c(2.5, 3, 2.25, 3, 3, 3.5, 2.25, 4, 
3.25, 2.25, 2.5, 2.5, 3.75, 3.75, 4, 1, 2, 4, 3.25, 3, 3.75, 
4, 4, 2.75, 4, 3, 4, 4, 4, 4, 4, 3), ATT_mean = c(3.8, 4, 2.8, 
3, 3, 3.8, 3, 3.6, 3, 4, 4, 3, 3.8, 3.6, 4, 3.8, 4, 4, 4, 4, 
4, 4, 4, 3.6, 3.8, 3, 4, 4, 4, 4, 4, 4), Status = c("PRE", "PRE", 
"PRE", "PRE", "PRE", "PRE", "PRE", "PRE", "PRE", "PRE", "PRE", 
"PRE", "PRE", "PRE", "PRE", "PRE", "PRE", "POST", "POST", "POST", 
"POST", "POST", "POST", "POST", "POST", "POST", "POST", "POST", 
"POST", "POST", "POST", "POST")), class = c("rowwise_df", "tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -32L), groups = structure(list(
    .rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
        10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 
        21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 
        32L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -32L), class = c("tbl_df", 
"tbl", "data.frame")))
1

There are 1 answers

2
Rui Barradas On BEST ANSWER

The problem is in the data.
Your tibble is grouped by rows, which can be seen in the second printed line of

df %>% print(n = 1L)
#> # A tibble: 32 × 8
#> # Rowwise: 

and is therefore processed row by row. Then, each Status value is just the one in that row. But wilcox.test needs a grouping variable with two levels and gives an error.

The solution is to ungroup the data first, then run the tests.

suppressPackageStartupMessages(
  library(dplyr)
)

df %>% print(n = 1L)
#> # A tibble: 32 × 8
#> # Rowwise: 
#>   SI_mean TU_mean ED_mean MT_mean DT_mean SK_mean ATT_mean Status
#>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>    <dbl> <chr> 
#> 1     2.6    2.75     2.6     2.8     3.4     2.5      3.8 PRE   
#> # ℹ 31 more rows

attributes(df)
#> $names
#> [1] "SI_mean"  "TU_mean"  "ED_mean"  "MT_mean"  "DT_mean"  "SK_mean"  "ATT_mean"
#> [8] "Status"  
#> 
#> $class
#> [1] "rowwise_df" "tbl_df"     "tbl"        "data.frame"
#> 
#> $row.names
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
#> [26] 26 27 28 29 30 31 32
#> 
#> $groups
#> # A tibble: 32 × 1
#>          .rows
#>    <list<int>>
#>  1         [1]
#>  2         [1]
#>  3         [1]
#>  4         [1]
#>  5         [1]
#>  6         [1]
#>  7         [1]
#>  8         [1]
#>  9         [1]
#> 10         [1]
#> # ℹ 22 more rows

df %>%
  ungroup() %>%
  summarise(across(contains("mean"), ~wilcox.test(.x ~ Status)$p.value))
#> Warning: There were 7 warnings in `summarise()`.
#> The first warning was:
#> ℹ In argument: `across(contains("mean"), ~wilcox.test(.x ~ Status)$p.value)`.
#> Caused by warning in `wilcox.test.default()`:
#> ! cannot compute exact p-value with ties
#> ℹ Run `dplyr::last_dplyr_warnings()` to see the 6 remaining warnings.
#> # A tibble: 1 × 7
#>   SI_mean TU_mean  ED_mean MT_mean  DT_mean SK_mean ATT_mean
#>     <dbl>   <dbl>    <dbl>   <dbl>    <dbl>   <dbl>    <dbl>
#> 1  0.0147 0.00512 0.000527 0.00114 0.000114 0.00252  0.00613

Created on 2023-11-09 with reprex v2.0.2