Coding a multiple-response question using RStudio

451 views Asked by At

Let's say we have this question: "Why are you not happy?", and it has 5 possible answers (1, 2, 3, 4, 5).

# Example data: one row per subject, each holding the single option (1-5)
# that the subject picked for the question "Why are you not happy?".
s <- data.frame(
  subjects = 1:12,
  Why_are_you_not_happy = c(1, 2, 4, 5, 1, 2, 4, 3, 2, 1, 3, 4)
)

In the previous example every subject picked only one option. But let's say that subjects 3, 7 and 10 each picked more than one option:

  • subject 3 : options 1,2,5
  • subject 7 : options 3,4
  • subject 10 : options 1,5

I want to code the options of this question considering these multiple options for these 3 subjects, while preserving the shape of the dataframe.

The next case is if the dataframe includes 2 questions as follows :

# Two multiple-response questions. Every answer is stored as text so that a
# single cell can hold several comma-separated options (e.g. "1,2,5").
df <- data.frame(
  subjects = 1:12,
  Why_are_you_not_happy = c(
    "1", "2", "1,2,5", "5", "1", "2",
    "3,4", "3", "2", "1,5", "3", "4"
  ),
  why_are_you_sad = c(
    "1,2,3", "1", "2", "3", "4,5,3", "2",
    "1", "4", "3", "1", "1", "1"
  )
)

How can we code the data properly in the first and second scenarios? The objective is to apply multiple correspondence analysis (MCA).

Thank you

1

There is 1 answer

5
jared_mamrot On BEST ANSWER

Edit 1:

With your updated example data you have (at least) two options: you can separate each column, or you can pivot_longer() the data and group the "scores" together. E.g.

library(tidyr)

# Example data: answers are kept as text so a cell can list several
# comma-separated options for one subject.
df <- data.frame(
  subjects = 1:12,
  Why_are_you_not_happy = c(
    "1", "2", "1,2,5", "5", "1", "2",
    "3,4", "3", "2", "1,5", "3", "4"
  ),
  why_are_you_sad = c(
    "1,2,3", "1", "2", "3", "4,5,3", "2",
    "1", "4", "3", "1", "1", "1"
  )
)
df
#>    subjects Why_are_you_not_happy why_are_you_sad
#> 1         1                     1           1,2,3
#> 2         2                     2               1
#> 3         3                 1,2,5               2
#> 4         4                     5               3
#> 5         5                     1           4,5,3
#> 6         6                     2               2
#> 7         7                   3,4               1
#> 8         8                     3               4
#> 9         9                     2               3
#> 10       10                   1,5               1
#> 11       11                     3               1
#> 12       12                     4               1

# Split each comma-separated answer column into three positional columns
# (<question>_1 .. <question>_3). Subjects with fewer than three answers get
# NA in the unused slots, which is what the warnings below report.
df1 <- df %>%
  separate(
    col = Why_are_you_not_happy,
    into = paste0("Why_are_you_not_happy_", 1:3),
    sep = ","
  ) %>%
  separate(
    col = why_are_you_sad,
    into = paste0("why_are_you_sad_", 1:3),
    sep = ","
  )
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 11 rows [1, 2, 4,
#> 5, 6, 7, 8, 9, 10, 11, 12].
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 10 rows [2, 3, 4,
#> 6, 7, 8, 9, 10, 11, 12].
df1
#>    subjects Why_are_you_not_happy_1 Why_are_you_not_happy_2
#> 1         1                       1                    <NA>
#> 2         2                       2                    <NA>
#> 3         3                       1                       2
#> 4         4                       5                    <NA>
#> 5         5                       1                    <NA>
#> 6         6                       2                    <NA>
#> 7         7                       3                       4
#> 8         8                       3                    <NA>
#> 9         9                       2                    <NA>
#> 10       10                       1                       5
#> 11       11                       3                    <NA>
#> 12       12                       4                    <NA>
#>    Why_are_you_not_happy_3 why_are_you_sad_1 why_are_you_sad_2
#> 1                     <NA>                 1                 2
#> 2                     <NA>                 1              <NA>
#> 3                        5                 2              <NA>
#> 4                     <NA>                 3              <NA>
#> 5                     <NA>                 4                 5
#> 6                     <NA>                 2              <NA>
#> 7                     <NA>                 1              <NA>
#> 8                     <NA>                 4              <NA>
#> 9                     <NA>                 3              <NA>
#> 10                    <NA>                 1              <NA>
#> 11                    <NA>                 1              <NA>
#> 12                    <NA>                 1              <NA>
#>    why_are_you_sad_3
#> 1                  3
#> 2               <NA>
#> 3               <NA>
#> 4               <NA>
#> 5                  3
#> 6               <NA>
#> 7               <NA>
#> 8               <NA>
#> 9               <NA>
#> 10              <NA>
#> 11              <NA>
#> 12              <NA>

This is what I think you should use for MCA, e.g.

library(FactoMineR)
library(factoextra)
#> Loading required package: ggplot2

# Run the MCA on columns 2-7 of df1, i.e. the six split answer columns
# (column 1, `subjects`, is only an identifier and is left out).
results <- MCA(df1[,2:7])


# Check eigenvalues to see %var for each dimension
fviz_eig(results)


Second approach for handling the data that 'works better' for plotting with e.g. ggplot:

# Long layout: one row per subject and question (`Category`), with that
# question's comma-separated answers split into Score_1 .. Score_3.
df2 <- df %>%
  pivot_longer(
    cols = -subjects,
    names_to = "Category",
    values_to = "Score"
  ) %>%
  separate(
    col = Score,
    into = paste0("Score_", 1:3),
    sep = ","
  )
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 21 rows [1, 3, 4,
#> 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, ...].
df2
#> # A tibble: 24 × 5
#>    subjects Category              Score_1 Score_2 Score_3
#>       <int> <chr>                 <chr>   <chr>   <chr>  
#>  1        1 Why_are_you_not_happy 1       <NA>    <NA>   
#>  2        1 why_are_you_sad       1       2       3      
#>  3        2 Why_are_you_not_happy 2       <NA>    <NA>   
#>  4        2 why_are_you_sad       1       <NA>    <NA>   
#>  5        3 Why_are_you_not_happy 1       2       5      
#>  6        3 why_are_you_sad       2       <NA>    <NA>   
#>  7        4 Why_are_you_not_happy 5       <NA>    <NA>   
#>  8        4 why_are_you_sad       3       <NA>    <NA>   
#>  9        5 Why_are_you_not_happy 1       <NA>    <NA>   
#> 10        5 why_are_you_sad       4       5       3      
#> # … with 14 more rows

library(ggplot2)
# Convert subjects from an integer to a factor so each subject gets its own
# discrete position on the x axis.
df2$subjects <- factor(df2$subjects)

# Human-readable labels, keyed by the values stored in `Category`.
group_labels <- c("Why_are_you_not_happy" = "Why are you not happy?",
                  "why_are_you_sad" = "Why are you sad?")

df2 %>%
  # Stack Score_1 .. Score_3 into a single `Answer` column, giving one row per
  # (subject, question, answer slot).
  pivot_longer(-c(subjects, Category),
               names_to = "Score_number",
               values_to = "Answer") %>%
  # Drop the NA padding left in answer slots the subject did not use.
  na.omit() %>%
  ggplot(aes(x = subjects, y = Answer,
             fill = Category)) +
  geom_tile(color = "white") +
  # Draw cell borders at the half-way points between discrete positions.
  geom_vline(xintercept = seq(0.5, 11.5, 1),
             color = "black") +
  geom_hline(yintercept = seq(0.5, 4.5, 1),
             color = "black") +
  scale_fill_discrete(labels = group_labels,
                      name = "") +
  theme_bw(base_size = 16) +
  theme(legend.position = "none",
        panel.grid = element_blank()) +
  # `expand` is a logical flag: FALSE removes the padding around the tiles.
  coord_cartesian(expand = FALSE) +
  # One panel per question, titled with the readable label.
  facet_wrap(~Category, nrow = 2,
             labeller = labeller(Category = group_labels))

Created on 2022-10-06 by the reprex package (v2.0.1)


Original answer:

It sounds like you want the separate() function from the tidyr package, e.g.

library(tidyr)

# Single-question example: answers are text so a cell can hold several
# comma-separated options.
df <- data.frame(
  subjects = 1:12,
  Why_are_you_not_happy = c(
    "1", "2", "1,2,5", "5", "1", "2",
    "3,4", "3", "2", "1,5", "3", "4"
  )
)
df
#>    subjects Why_are_you_not_happy
#> 1         1                     1
#> 2         2                     2
#> 3         3                 1,2,5
#> 4         4                     5
#> 5         5                     1
#> 6         6                     2
#> 7         7                   3,4
#> 8         8                     3
#> 9         9                     2
#> 10       10                   1,5
#> 11       11                     3
#> 12       12                     4

# Wide layout: split the comma-separated answers into Answer_1 .. Answer_3;
# unused slots are filled with NA.
df %>%
  separate(
    col = Why_are_you_not_happy,
    into = paste0("Answer_", 1:3),
    sep = ","
  )
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 11 rows [1, 2, 4,
#> 5, 6, 7, 8, 9, 10, 11, 12].
#>    subjects Answer_1 Answer_2 Answer_3
#> 1         1        1     <NA>     <NA>
#> 2         2        2     <NA>     <NA>
#> 3         3        1        2        5
#> 4         4        5     <NA>     <NA>
#> 5         5        1     <NA>     <NA>
#> 6         6        2     <NA>     <NA>
#> 7         7        3        4     <NA>
#> 8         8        3     <NA>     <NA>
#> 9         9        2     <NA>     <NA>
#> 10       10        1        5     <NA>
#> 11       11        3     <NA>     <NA>
#> 12       12        4     <NA>     <NA>

Or, perhaps in long format? E.g.

# Long layout: split the answers as before, then stack the Answer_* columns
# into name/value pairs and drop the NA padding, leaving one row per answer
# actually given.
df %>%
  separate(
    col = Why_are_you_not_happy,
    into = paste0("Answer_", 1:3),
    sep = ","
  ) %>%
  pivot_longer(
    cols = -subjects,
    names_to = "name",
    values_to = "value"
  ) %>%
  na.omit()
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 11 rows [1, 2, 4,
#> 5, 6, 7, 8, 9, 10, 11, 12].
#> # A tibble: 16 × 3
#>    subjects name     value
#>       <int> <chr>    <chr>
#>  1        1 Answer_1 1    
#>  2        2 Answer_1 2    
#>  3        3 Answer_1 1    
#>  4        3 Answer_2 2    
#>  5        3 Answer_3 5    
#>  6        4 Answer_1 5    
#>  7        5 Answer_1 1    
#>  8        6 Answer_1 2    
#>  9        7 Answer_1 3    
#> 10        7 Answer_2 4    
#> 11        8 Answer_1 3    
#> 12        9 Answer_1 2    
#> 13       10 Answer_1 1    
#> 14       10 Answer_2 5    
#> 15       11 Answer_1 3    
#> 16       12 Answer_1 4

Created on 2022-10-05 by the reprex package (v2.0.1)