filter() function not working within the for loop

76 views Asked by At

I wrote the code below

# Names of the miRNA columns to summarise; the loop adds one row per name.
miRNA.list <- c("let-7a-5p", "let-7a-1-3p", "let-7b-5p")
# Accumulator for the per-miRNA summary rows.
summary.df <- data.frame()
for (miRNA in miRNA.list) {
  
  temp.name <- miRNA
  
  # NOTE: the backticks only quote the symbol temp.name; they do not substitute
  # the column name stored in that variable, so this filter never tests the
  # let-* column itself.
  # After the filter, each count sums the matches of the pattern in `status`,
  # and `all` is the number of rows that passed the filter.
  temp.df <- df.mirna.pv %>%
              filter(`temp.name` == "yes") %>%
              summarise(downregulated = sum(str_count(status, "downregulated")),
                        upregulated = sum(str_count(status, "upregulated")),
                        all = n())
  
  # Append this miRNA's one-row summary to the accumulator.
  summary.df <- rbind(summary.df, temp.df)
  
}

to filter the following dataframe based on the "let-xxx" columns and then count number of up or downregulated genes;

print(df.mirna.pv)

          let-7a-5p    let-7a-1-3p   let-7b-5p            status
Xkr4          no            yes          no               upregulated
Mrpl15        yes           yes          no               downregulated
Lypla1        yes           yes          yes              downregulated 
Tcea1         no            yes          no               not significant  

However, for some reason, it cannot match the names in miRNA list with the column names, or at least I think this is the problem, as this is my output:

downregulated upregulated all
1             0           0   0
2             0           0   0
3             0           0   0
4             0           0   0
5             0           0   0
6             0           0   0

Any ideas what could be happening and how can I fix it?

3

There are 3 answers

4
r2evans On

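You are mixing normal and programmatic use of dplyr. Namely, filter(`temp.name` == "yes") refers to the literal name temp.name, not to the column whose name is stored in the local variable temp.name. Because df.mirna.pv has no column called temp.name, dplyr falls back to the local variable itself and compares its string value (e.g. "let-7a-5p") with "yes"; that is never TRUE, so every row is dropped and all the counts are 0.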

I think this may be what you want?

library(dplyr)
library(tidyr)
# Reshape to long form: one row per (gene, miRNA) pair, keeping `status`.
# `name` holds the miRNA column name and `value` its "yes"/"no" entry.
tmp <- pivot_longer(quux, cols = -status)
# Keep only the "yes" pairs. Every count below, including `all`, is taken from
# these rows, so `all` is the number of genes flagged "yes" for each miRNA
# (the same meaning as `all = n()` after the filter in the question).
yes_rows <- filter(tmp, value == "yes")
yes_rows |>
  count(name, status) |>
  pivot_wider(id_cols = name, names_from = status, values_from = n, values_fill = 0) |>
  left_join(count(yes_rows, name, name = "all"), by = "name")
# # A tibble: 3 × 5
#   name        downregulated `not significant` upregulated   all
#   <chr>               <int>             <int>       <int> <int>
# 1 let-7a-1-3p             2                 1           1     4
# 2 let-7a-5p               2                 0           0     2
# 3 let-7b-5p               1                 0           0     1

You can remove `not significant` if you don't need it.


Data

# Example data: one "yes"/"no" column per miRNA plus the regulation status of
# each gene; the row names are the gene symbols.
quux <- structure(
  list(
    "let-7a-5p" = c("no", "yes", "yes", "no"),
    "let-7a-1-3p" = c("yes", "yes", "yes", "yes"),
    "let-7b-5p" = c("no", "no", "yes", "no"),
    status = c("upregulated", "downregulated", "downregulated", "not significant")
  ),
  row.names = c("Xkr4", "Mrpl15", "Lypla1", "Tcea1"),
  class = "data.frame"
)
0
M-- On

If you want to fix your for-loop, you need to pass !!sym(temp.name) to dplyr::filter():

library(dplyr)

# Example data: one "yes"/"no" column per miRNA plus the regulation status of
# each gene; the row names are the gene symbols.
df.mirna.pv <- structure(list("let-7a-5p" = c("no", "yes", "yes", "no"), 
                       "let-7a-1-3p" = c("yes", "yes", "yes", "yes"), 
                       "let-7b-5p" = c("no", "no", "yes", "no"), 
                       status = c("upregulated", "downregulated",
                                  "downregulated", "not significant")), 
                  row.names = c("Xkr4", "Mrpl15", "Lypla1", "Tcea1"), 
                  class = "data.frame")

miRNA.list <- c("let-7a-5p", "let-7a-1-3p", "let-7b-5p")

# Collect one summary row per miRNA in a preallocated list and bind once at the
# end, instead of growing a data frame with rbind() on every iteration.
summary.list <- vector("list", length(miRNA.list))
for (i in seq_along(miRNA.list)) {

  temp.name <- miRNA.list[[i]]

  summary.list[[i]] <- df.mirna.pv %>%
    # !!sym() turns the string held in temp.name into a column reference.
    filter(!!sym(temp.name) == "yes") %>%
    # Exact comparisons need only dplyr; str_count() would additionally
    # require stringr, which is not attached in this snippet.
    summarise(downregulated = sum(status == "downregulated"),
              upregulated = sum(status == "upregulated"),
              all = n())

}
summary.df <- bind_rows(summary.list)

# Label each summary row with the miRNA it belongs to (same order as the loop).
summary.df %>% mutate(name = miRNA.list, .before = 1)

#>          name downregulated upregulated all
#> 1   let-7a-5p             2           0   2
#> 2 let-7a-1-3p             2           1   4
#> 3   let-7b-5p             1           0   1

but we can do this easily with tidyr::pivot_longer():

# Long form gives one row per (gene, miRNA) pair; keep the "yes" pairs and
# summarise them per miRNA.
df.mirna.pv %>% 
  tidyr::pivot_longer(-status) %>% 
  filter(value == "yes") %>% 
  # `.by = name` already returns `name` as the first output column, so the
  # grouping column is not recomputed inside summarise().
  summarise(downregulated = sum(status == "downregulated"),
            upregulated = sum(status == "upregulated"),
            all = n(),
            .by = name)

or if we want to absolutely avoid hard-coding:

# Turn every "yes"/"no" entry into TRUE/FALSE while reshaping to long form,
# then spread the statuses into columns, summing the flags per miRNA, and let
# janitor append the row-wise total as an extra column.
df.mirna.pv %>% 
  tidyr::pivot_longer(
    cols = -status,
    values_transform = list(value = \(x) x == "yes")
  ) %>% 
  tidyr::pivot_wider(
    id_cols = name,
    names_from = status,
    values_from = value,
    values_fn = sum
  ) %>% 
  janitor::adorn_totals(where = "col") # %>% select(-`not significant`)
1
jay.sf On

You could make a factor out of the "status" column. Then loop over the "miRNA.list" columns with sapply: first test which entries equal "yes", then tabulate the statuses of those rows and append their count.

> # Make status a factor so that table() reports every level, zero counts included.
> df.mirna.pv$status <- as.factor(df.mirna.pv$status)
> # Build one named count vector per miRNA column; t() puts the miRNAs in rows.
> t(sapply(df.mirna.pv[miRNA.list], \(x) {
+   y <- x == 'yes'
+   # Tabulate status over the "yes" rows, append their count as `all`, then
+   # drop the 2nd element ("not significant", given the factor levels shown
+   # in the dput output under Data).
+   c(table(df.mirna.pv$status[y]), all=sum(y))[-2]
+ }))
            downregulated upregulated all
let-7a-5p               2           0   2
let-7a-1-3p             2           1   4
let-7b-5p               1           0   1

BTW, if you just need a logical vector for miRNA.list, you could do

> # Logical mask over the column names: TRUE where the name starts with "let".
> (miRNA.list <- startsWith(names(df.mirna.pv), 'let'))
[1]  TRUE  TRUE  TRUE FALSE
> # The same mask via a regular expression anchored at the start of the name.
> (miRNA.list <- grepl('^let', names(df.mirna.pv)))
[1]  TRUE  TRUE  TRUE FALSE

or for the values

> # value=TRUE returns the matching column names themselves instead of a mask.
> (miRNA.list <- grep('^let', names(df.mirna.pv), value=TRUE))
[1] "let-7a-5p"   "let-7a-1-3p" "let-7b-5p" 

Any of these forms works for selecting the columns in the code above, and might save some typing.


Data:

> dput(df.mirna.pv)
structure(list(`let-7a-5p` = c("no", "yes", "yes", "no"), `let-7a-1-3p` = c("yes", 
"yes", "yes", "yes"), `let-7b-5p` = c("no", "no", "yes", "no"
), status = structure(c(3L, 1L, 1L, 2L), levels = c("downregulated", 
"not significant", "upregulated"), class = "factor")), row.names = c("Xkr4", 
"Mrpl15", "Lypla1", "Tcea1"), class = "data.frame")
> dput(miRNA.list)
c("let-7a-5p", "let-7a-1-3p", "let-7b-5p")