How to perform multiple t.tests with R package srvyr?

568 views Asked by At

As a follow up on a recent SO question (see here) I am wondering how to perform multiple t.tests in R with weighted data (package srvyr). I cant make it run and would be happy if anyone could help me here. I added a random sample in the code below. Many thanks!

      #create data
      surveydata <- as.data.frame(replicate(1,sample(1:5,1000,rep=TRUE)))
      colnames(surveydata)[1] <- "q1"
      surveydata$q2 <- sample(6, size = nrow(surveydata), replace = TRUE)
      surveydata$q3 <- sample(6, size = nrow(surveydata), replace = TRUE)
      surveydata$q4 <- sample(6, size = nrow(surveydata), replace = TRUE)
      surveydata$group <- c(1,2)
      
      #replace all value "6" wir NA
      surveydata[surveydata == 6] <- NA
      
      #add NAs to group 1 in q1
      surveydata$q1[which(surveydata$q1==1 & surveydata$group==1)] = NA
      surveydata$q1[which(surveydata$q1==2 & surveydata$group==1)] = NA
      surveydata$q1[which(surveydata$q1==3 & surveydata$group==1)] = NA
      surveydata$q1[which(surveydata$q1==4 & surveydata$group==1)] = NA
      surveydata$q1[which(surveydata$q1==5 & surveydata$group==1)] = NA
      
      #add weights
      surveydata$weights <- round(runif(nrow(surveydata), min=0.2, max=1.5), 3)

      #create vector for relevant questions
                rquest <- names(surveydata)[1:4]


              
      # create survey design
                library(srvyr)
                surveydesign <- surveydata %>%
                          as_survey_design(strata = group, weights = weights, variables = c("group", all_of(rquest)))
      


      # perform multiple t.test (doesn't work yet)
                outcome <- do.call(rbind, lapply(names(surveydesign$variables)[-1], function(i) {
                          tryCatch({
                                    test <- t.test(as.formula(paste(i, "~ survey")), data = surveydesign)
                                    data.frame(question = i, 
                                               group1 = test$estimate[1], 
                                               group2 = test$estimate[2], 
                                               difference = diff(test$estimate),
                                               p_value = test$p.value, row.names = 1)
                          }, error = function(e) {
                                    data.frame(question = i, 
                                               group1 = NA,
                                               group2 = NA, 
                                               difference = NA,
                                               p_value = NA, row.names = 1)
                          })
                }))
                
                                  

                     
                                  
1

There are 1 answers

5
Chuck P On BEST ANSWER

As I understand it you have a series of question columns in the example q1 to q4. You've used srvyr to generate a weights column. It is possible in our data that for a particular question one entire group maybe all NA and you'd like to generate results into a df even when that is true. You want a weighted Student's t-test making use of the weights column not a simple t-test. The only function I found that provides that is weights::wtd.t.test which doesn't offer a formula interface but wants to be fed vectors.

In order of steps taken:

  1. Load requisite libraries
library(srvyr)
library(dplyr)
library(rlang)
library(weights)
  1. Build a custom function that removes the NAs by variable, pulls the vectors for x, y, weightx, weighty, runs the test, and extracts the info you want into a df row.
multiple_wt_ttest <- function(i) {
   i <- ensym(i)
   xxx <- surveydata %>% 
      filter(!is.na(!!i)) %>% 
      split(.$group)
   newx <- pull(xxx[[1]], i)
   newy <- pull(xxx[[2]], i)
   wtx <- pull(xxx[[1]], weights)
   wty <- pull(xxx[[2]], weights)
   test <- wtd.t.test(x = newx, 
                      y = newy, 
                      weight = wtx, 
                      weighty = wty, 
                      samedata = FALSE)
   data.frame(question = rlang::as_name(i), 
              group1 = test$additional[[2]],
              group2 = test$additional[[3]], 
              difference = test$additional[[1]],
              p.value = test$coefficients[[3]])
}
  1. Once we have the function we can use lapply to apply it column by column (notice it handles the case in q2 where group == 1 is all NA.
lapply(names(surveydata)[1:4], multiple_wt_ttest)
#> [[1]]
#>   question group1   group2 difference p.value
#> 1       q1    NaN 3.010457        NaN      NA
#> 
#> [[2]]
#>   question   group1   group2  difference  p.value
#> 1       q2 3.009003 3.071842 -0.06283922 0.515789
#> 
#> [[3]]
#>   question   group1   group2 difference   p.value
#> 1       q3 2.985096 2.968867  0.0162288 0.8734034
#> 
#> [[4]]
#>   question   group1   group2 difference    p.value
#> 1       q4 2.856255 3.047787 -0.1915322 0.04290471
  1. Finally, wrap it in a do.call and rbind to make the df you desire
do.call(rbind, lapply(names(surveydata)[1:4], multiple_wt_ttest))
#>   question   group1   group2  difference    p.value
#> 1       q1      NaN 3.010457         NaN         NA
#> 2       q2 3.009003 3.071842 -0.06283922 0.51578905
#> 3       q3 2.985096 2.968867  0.01622880 0.87340343
#> 4       q4 2.856255 3.047787 -0.19153218 0.04290471

Your data (without showing all the gyrations to create it and heading the first 200 rows)

surveydata <- 
structure(list(q1 = c(NA, 1L, NA, 4L, NA, 5L, NA, 3L, NA, 5L, 
NA, 5L, NA, 1L, NA, 5L, NA, 3L, NA, 4L, NA, 5L, NA, 4L, NA, 2L, 
NA, 5L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 5L, 
NA, 4L, NA, 4L, NA, 3L, NA, 4L, NA, 2L, NA, 4L, NA, 3L, NA, 1L, 
NA, 1L, NA, 3L, NA, 5L, NA, 3L, NA, 5L, NA, 5L, NA, 4L, NA, 2L, 
NA, 5L, NA, 1L, NA, 3L, NA, 2L, NA, 5L, NA, 4L, NA, 1L, NA, 5L, 
NA, 2L, NA, 2L, NA, 4L, NA, 1L, NA, 3L, NA, 4L, NA, 5L, NA, 3L, 
NA, 5L, NA, 1L, NA, 1L, NA, 3L, NA, 2L, NA, 4L, NA, 4L, NA, 1L, 
NA, 4L, NA, 3L, NA, 2L, NA, 3L, NA, 5L, NA, 2L, NA, 5L, NA, 2L, 
NA, 1L, NA, 5L, NA, 2L, NA, 1L, NA, 2L, NA, 3L, NA, 2L, NA, 3L, 
NA, 4L, NA, 4L, NA, 3L, NA, 1L, NA, 3L, NA, 1L, NA, 5L, NA, 3L, 
NA, 5L, NA, 4L, NA, 1L, NA, 4L, NA, 1L, NA, 3L, NA, 1L, NA, 4L, 
NA, 5L, NA, 4L, NA, 4L, NA, 3L, NA, 3L, NA, 2L, NA, 1L), q2 = c(NA, 
2L, 2L, 1L, 5L, 4L, 3L, 2L, 4L, 4L, 5L, 1L, 4L, 5L, 1L, 4L, NA, 
2L, 2L, 5L, 5L, 4L, 5L, 4L, NA, 1L, 3L, 4L, 5L, 2L, NA, 5L, 2L, 
NA, 4L, 4L, 5L, 4L, 1L, NA, 5L, 1L, 4L, 2L, 1L, NA, 5L, 1L, 3L, 
2L, 4L, NA, 2L, NA, 1L, 4L, NA, 2L, 3L, NA, 3L, 1L, 1L, 1L, 1L, 
1L, 4L, 5L, 1L, 4L, 5L, 4L, NA, 2L, 3L, 2L, 2L, 2L, 4L, 2L, 3L, 
5L, NA, 2L, NA, NA, 5L, 2L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 2L, NA, 
1L, 3L, 5L, 5L, 1L, 1L, NA, 3L, 3L, 1L, 2L, 3L, 3L, 2L, 4L, 2L, 
5L, 4L, 3L, 1L, NA, 4L, 3L, 1L, 5L, 5L, 5L, 2L, 2L, 4L, 5L, 4L, 
1L, 3L, NA, 1L, 3L, 5L, 2L, 1L, 3L, 3L, NA, NA, 5L, NA, 5L, 2L, 
5L, 2L, NA, NA, NA, 1L, 4L, 3L, 2L, 3L, 1L, 3L, 5L, 1L, 2L, 3L, 
5L, 4L, 4L, NA, NA, 5L, 2L, 3L, 3L, 2L, 2L, 1L, 3L, 1L, 4L, 5L, 
2L, 5L, 3L, 1L, 5L, NA, 4L, 3L, 5L, 1L, 1L, 3L, 4L, 4L, 1L, 4L, 
3L, 3L, NA, 2L, 3L, 5L, 5L), q3 = c(4L, 4L, 1L, NA, 4L, 5L, 1L, 
3L, 4L, 4L, 1L, 3L, 2L, 1L, 2L, 4L, 2L, 3L, 4L, 4L, 1L, 3L, 4L, 
5L, 5L, 1L, 3L, 5L, 1L, 2L, 1L, 5L, 5L, 3L, 1L, 3L, 1L, 5L, 1L, 
3L, NA, NA, 3L, 5L, NA, 2L, 2L, 1L, 1L, 3L, 5L, 5L, 2L, NA, 5L, 
2L, 3L, NA, NA, 3L, 2L, 5L, 2L, 1L, NA, NA, 4L, 2L, NA, 1L, NA, 
NA, 5L, 3L, 5L, 4L, 2L, 4L, NA, 2L, 4L, 5L, NA, 2L, 1L, 3L, NA, 
5L, 5L, 4L, 5L, 1L, 5L, 4L, 5L, 3L, 2L, 2L, 2L, 1L, 2L, 1L, NA, 
NA, 5L, 1L, 2L, 5L, 5L, 5L, 3L, 3L, 3L, 2L, 4L, NA, 3L, NA, 3L, 
4L, 2L, 2L, 5L, 1L, NA, 1L, NA, 2L, 2L, 3L, 2L, 5L, 1L, 4L, 4L, 
3L, 5L, 5L, NA, NA, 4L, NA, 5L, 1L, 1L, 2L, 5L, 4L, 5L, 4L, 1L, 
1L, NA, 4L, 4L, 4L, 5L, 1L, NA, 2L, 3L, NA, 1L, NA, NA, NA, 4L, 
2L, 4L, 2L, 1L, 1L, 2L, 1L, 5L, 1L, 3L, 3L, 4L, NA, 1L, 1L, 1L, 
3L, 5L, 1L, NA, 3L, 5L, 5L, 4L, NA, 1L, 4L, 5L, 3L, 5L, NA, 1L, 
4L), q4 = c(NA, 3L, 1L, 1L, 2L, NA, 1L, 5L, 1L, 3L, 3L, 1L, 3L, 
5L, 1L, 3L, 2L, 1L, 1L, 3L, 5L, 5L, NA, 5L, 5L, 5L, 4L, 4L, 4L, 
3L, 3L, 2L, 1L, 3L, 5L, 3L, 1L, 5L, NA, 3L, 2L, 5L, 4L, 4L, 4L, 
2L, 1L, 1L, 2L, 5L, 2L, 1L, 3L, 4L, 3L, 1L, 1L, 4L, 4L, 1L, 2L, 
3L, 3L, 4L, NA, 3L, 3L, 2L, 2L, NA, 3L, 5L, 4L, 4L, 3L, 3L, 4L, 
NA, NA, 3L, NA, 1L, NA, 3L, 3L, 3L, 2L, 3L, 3L, 4L, 1L, 1L, 2L, 
5L, 1L, 1L, 5L, 2L, 2L, 2L, 3L, 4L, 5L, 3L, NA, NA, 2L, 2L, 3L, 
2L, 3L, 2L, 3L, 1L, 3L, 3L, 4L, 5L, NA, 4L, 4L, 3L, 1L, 4L, 5L, 
4L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 5L, 1L, 5L, 2L, NA, 4L, 2L, 
1L, 3L, 3L, 4L, 3L, 2L, 4L, 5L, 4L, 2L, 3L, 5L, 1L, NA, 3L, 2L, 
5L, NA, 1L, 2L, 4L, 5L, 2L, NA, 1L, 3L, NA, 3L, 3L, 3L, 5L, 4L, 
5L, 3L, 3L, NA, 4L, 2L, 3L, 2L, 5L, 4L, 4L, 5L, 5L, 3L, 2L, NA, 
4L, 1L, 5L, 2L, 4L, 3L, 4L, NA, 3L, 1L, 3L), group = structure(c(1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("1", "2"), class = "factor"), 
    weights = c(1.445, 0.408, 0.621, 0.961, 1.492, 0.625, 1.131, 
    0.246, 0.612, 0.621, 1.311, 0.649, 1.282, 0.898, 1.268, 0.641, 
    0.764, 0.759, 0.306, 0.707, 0.899, 0.785, 1.279, 0.458, 0.882, 
    0.384, 1.492, 0.468, 0.785, 0.707, 0.489, 1.113, 0.692, 0.293, 
    0.642, 1.327, 0.362, 1.405, 1.173, 0.732, 0.661, 0.522, 1.001, 
    0.374, 1.181, 0.819, 1.389, 0.43, 0.477, 0.879, 0.634, 0.417, 
    0.359, 1.007, 0.866, 0.203, 1.469, 0.294, 1.326, 1.391, 0.871, 
    1.036, 1.251, 0.417, 1.074, 1.268, 0.963, 0.469, 0.215, 1.074, 
    0.644, 1.054, 0.787, 0.714, 0.568, 0.397, 1.421, 0.692, 0.262, 
    0.644, 0.793, 0.808, 0.25, 0.842, 1.039, 0.359, 0.987, 1.257, 
    0.301, 0.203, 0.823, 1.328, 1.192, 0.256, 1.099, 0.668, 1.129, 
    0.413, 0.266, 1.121, 0.893, 1.484, 0.568, 1.255, 0.531, 0.461, 
    0.773, 0.298, 0.233, 0.676, 0.478, 0.806, 0.556, 0.201, 0.801, 
    0.348, 1.396, 0.552, 0.384, 0.615, 0.499, 0.819, 0.954, 0.943, 
    0.956, 0.323, 0.706, 0.699, 0.9, 1.156, 1.436, 1.115, 0.762, 
    0.258, 1.421, 0.644, 1.349, 0.251, 0.735, 0.479, 1.055, 1.395, 
    1.062, 1.155, 0.869, 0.436, 0.415, 0.745, 1.247, 0.21, 0.879, 
    0.776, 0.747, 0.835, 0.609, 0.733, 0.563, 1.067, 1.436, 0.679, 
    1.497, 1.385, 1.087, 1.286, 0.503, 0.738, 0.504, 0.665, 1.421, 
    1.288, 0.691, 0.972, 0.467, 0.425, 0.406, 0.862, 0.749, 0.935, 
    0.291, 0.444, 1.118, 1.048, 0.886, 0.982, 0.578, 1.402, 0.778, 
    1.139, 0.804, 0.618, 1.147, 0.594, 0.984, 0.986, 0.941, 0.794, 
    0.323, 1.41, 0.902, 0.417)), row.names = c(NA, 200L), class = "data.frame")