Complete and fill missing data in groups using another data frame

85 views Asked by At

How can I complete missing values by group?

I have a df with recomendations and ranks, and I need to insert default recomendations when I dont have at least 4 of it.

Input example:

library(tidyverse)

fixed_recomendations <- data.frame(recomendation_id = 50:54, name = paste("recomendation", 50:54, sep = "_"), stringsAsFactors = FALSE)

content_id <- c(1,1,2,rep(3, 6))
rank <- c(1, 2, 1, 1:6)
recomendation_id <- c(1:9)
name <- paste("recomendation", recomendation_id, sep = "_")
df <- data.frame(content_id, rank, recomendation_id, name, stringsAsFactors = FALSE)

# content_id rank recomendation_id  name
# 1         1     1                 recomendation_1
# 1         2     2                 recomendation_2
# 2         1     3                 recomendation_3
# 3         1     4                 recomendation_4
# 3         2     5                 recomendation_5
# 3         3     6                 recomendation_6
# 3         4     7                 recomendation_7
# 3         5     8                 recomendation_8
# 3         6     9                 recomendation_9

I have tried to do it with complete/fill but it does not respect the groups and it also cut the values outside of rank range.

df %>% 
  complete(content_id, rank = 1:4, 
           fill = list(
            recomendation_id = fixed_recomendations$recomendation_id,
            name = fixed_recomendations$name
            ))

# content_id  rank recomendation_id name
# 1           1      1               recomendation_1
# 1           2      2               recomendation_2
# 1           3     50              recomendation_50
# 1           4     51              recomendation_51
# 2           1      3               recomendation_3
# 2           2     52              recomendation_52
# 2           3     53              recomendation_53
# 2           4     54              recomendation_54
# 3           1      4               recomendation_4
# 3           2      5               recomendation_5
# 3           3      6               recomendation_6
# 3           4      7               recomendation_7

Desired output:

# content_id  rank recomendation_id name
# 1           1      1               recomendation_1
# 1           2      2               recomendation_2
# 1           3     50              recomendation_50
# 1           4     51              recomendation_51
# 2           1      3               recomendation_3
# 2           2     50              recomendation_50
# 2           3     51              recomendation_51
# 2           4     52              recomendation_52
# 3           1      4               recomendation_4
# 3           2      5               recomendation_5
# 3           3      6               recomendation_6
# 3           4      7               recomendation_7
# 3           5      8               recomendation_8
# 3           6      9               recomendation_9
2

There are 2 answers

0
Icaro Bombonato On BEST ANSWER

I dont know if this was the best approach, but I ended solving it in two steps, first I create a tibble with NA values using complete, then I filtered the NA values and update it group using split/map_df:

add_missing <- function(x){
  for(i in 1:nrow(x)){
    x[i,]$recomendation_id = fixed_recomendations[i,]$recomendation_id
    x[i,]$name = fixed_recomendations[i,]$name     
  }
  x
}

df_missing <- df %>% 
  complete(content_id, rank = 1:4) %>% 
  filter(is.na(recomendation_id)) %>% 
  split(.$content_id) %>% 
  map_df(add_missing)

rbind(df, df_missing) %>% arrange(content_id, rank)

>   content_id rank recomendation_id             name
1           1    1                1  recomendation_1
2           1    2                2  recomendation_2
3           1    3               50 recomendation_50
4           1    4               51 recomendation_51
5           2    1                3  recomendation_3
6           2    2               50 recomendation_50
7           2    3               51 recomendation_51
8           2    4               52 recomendation_52
9           3    1                4  recomendation_4
10          3    2                5  recomendation_5
11          3    3                6  recomendation_6
12          3    4                7  recomendation_7
13          3    5                8  recomendation_8
14          3    6                9  recomendation_9
0
generic_user On

I don't use the same suite of packages you do, but here is a solution using foreach that I whipped up in a couple of minutes. I could have done it using basic for loops, but the .combine functionality in foreach is useful, plus it is amenable to parallelization in case your real problem is big.

fixed_recomendations <- data.frame(recomendation_id = 50:54, name = paste("recomendation", 50:54, sep = "_"), stringsAsFactors = FALSE)

content_id <- c(1,1,2,rep(3, 6))
rank <- c(1, 2, 1, 1:6)
recomendation_id <- c(1:9)
name <- paste("recomendation", recomendation_id, sep = "_")
df <- data.frame(content_id, rank, recomendation_id, name, stringsAsFactors = FALSE)


tab <- table(content_id)
library(foreach)
defaults <- foreach (i = 1:length(tab), .combine = rbind) %do%{
  if (tab[i]<4){
    foreach( j = 1:(4-tab[i]), .combine = rbind) %do% {
      data.frame(names(tab)[i], j + tab[i], fixed_recomendations$recomendation_id[j], fixed_recomendations$name[j])
    }
  }
}
colnames(defaults) <- colnames(df)
df.new <- rbind(df, defaults)
df.new <- df.new[with(df.new, order(content_id, rank)),]
df.new

   content_id rank recomendation_id             name
1           1    1                1  recomendation_1
2           1    2                2  recomendation_2
12          1    3               50 recomendation_50
11          1    4               51 recomendation_51
3           2    1                3  recomendation_3
23          2    2               50 recomendation_50
21          2    3               51 recomendation_51
22          2    4               52 recomendation_52
4           3    1                4  recomendation_4
5           3    2                5  recomendation_5
6           3    3                6  recomendation_6
7           3    4                7  recomendation_7
8           3    5                8  recomendation_8
9           3    6                9  recomendation_9