Complete and fill missing data in groups using another data frame

Question

Complete and fill missing data in groups using another data frame

91 views Asked by Icaro Bombonato At 16 December 2016 at 16:05

How can I complete missing values by group?

I have a df with recomendations and ranks, and I need to insert default recomendations when I dont have at least 4 of it.

Input example:

library(tidyverse)

fixed_recomendations <- data.frame(recomendation_id = 50:54, name = paste("recomendation", 50:54, sep = "_"), stringsAsFactors = FALSE)

content_id <- c(1,1,2,rep(3, 6))
rank <- c(1, 2, 1, 1:6)
recomendation_id <- c(1:9)
name <- paste("recomendation", recomendation_id, sep = "_")
df <- data.frame(content_id, rank, recomendation_id, name, stringsAsFactors = FALSE)

# content_id rank recomendation_id  name
# 1         1     1                 recomendation_1
# 1         2     2                 recomendation_2
# 2         1     3                 recomendation_3
# 3         1     4                 recomendation_4
# 3         2     5                 recomendation_5
# 3         3     6                 recomendation_6
# 3         4     7                 recomendation_7
# 3         5     8                 recomendation_8
# 3         6     9                 recomendation_9

I have tried to do it with complete/fill but it does not respect the groups and it also cut the values outside of rank range.

df %>% 
  complete(content_id, rank = 1:4, 
           fill = list(
            recomendation_id = fixed_recomendations$recomendation_id,
            name = fixed_recomendations$name
            ))

# content_id  rank recomendation_id name
# 1           1      1               recomendation_1
# 1           2      2               recomendation_2
# 1           3     50              recomendation_50
# 1           4     51              recomendation_51
# 2           1      3               recomendation_3
# 2           2     52              recomendation_52
# 2           3     53              recomendation_53
# 2           4     54              recomendation_54
# 3           1      4               recomendation_4
# 3           2      5               recomendation_5
# 3           3      6               recomendation_6
# 3           4      7               recomendation_7

Desired output:

# content_id  rank recomendation_id name
# 1           1      1               recomendation_1
# 1           2      2               recomendation_2
# 1           3     50              recomendation_50
# 1           4     51              recomendation_51
# 2           1      3               recomendation_3
# 2           2     50              recomendation_50
# 2           3     51              recomendation_51
# 2           4     52              recomendation_52
# 3           1      4               recomendation_4
# 3           2      5               recomendation_5
# 3           3      6               recomendation_6
# 3           4      7               recomendation_7
# 3           5      8               recomendation_8
# 3           6      9               recomendation_9

Original Q&A

There are 2 answers

generic_user On 16 December 2016 at 18:08

I don't use the same suite of packages you do, but here is a solution using foreach that I whipped up in a couple of minutes. I could have done it using basic for loops, but the .combine functionality in foreach is useful, plus it is amenable to parallelization in case your real problem is big.

fixed_recomendations <- data.frame(recomendation_id = 50:54, name = paste("recomendation", 50:54, sep = "_"), stringsAsFactors = FALSE)

content_id <- c(1,1,2,rep(3, 6))
rank <- c(1, 2, 1, 1:6)
recomendation_id <- c(1:9)
name <- paste("recomendation", recomendation_id, sep = "_")
df <- data.frame(content_id, rank, recomendation_id, name, stringsAsFactors = FALSE)


tab <- table(content_id)
library(foreach)
defaults <- foreach (i = 1:length(tab), .combine = rbind) %do%{
  if (tab[i]<4){
    foreach( j = 1:(4-tab[i]), .combine = rbind) %do% {
      data.frame(names(tab)[i], j + tab[i], fixed_recomendations$recomendation_id[j], fixed_recomendations$name[j])
    }
  }
}
colnames(defaults) <- colnames(df)
df.new <- rbind(df, defaults)
df.new <- df.new[with(df.new, order(content_id, rank)),]
df.new

   content_id rank recomendation_id             name
1           1    1                1  recomendation_1
2           1    2                2  recomendation_2
12          1    3               50 recomendation_50
11          1    4               51 recomendation_51
3           2    1                3  recomendation_3
23          2    2               50 recomendation_50
21          2    3               51 recomendation_51
22          2    4               52 recomendation_52
4           3    1                4  recomendation_4
5           3    2                5  recomendation_5
6           3    3                6  recomendation_6
7           3    4                7  recomendation_7
8           3    5                8  recomendation_8
9           3    6                9  recomendation_9

**Icaro Bombonato** · Accepted Answer · 2016-12-16T18:26:05+00:00

I dont know if this was the best approach, but I ended solving it in two steps, first I create a tibble with NA values using complete, then I filtered the NA values and update it group using split/map_df:

add_missing <- function(x){
  for(i in 1:nrow(x)){
    x[i,]$recomendation_id = fixed_recomendations[i,]$recomendation_id
    x[i,]$name = fixed_recomendations[i,]$name     
  }
  x
}

df_missing <- df %>% 
  complete(content_id, rank = 1:4) %>% 
  filter(is.na(recomendation_id)) %>% 
  split(.$content_id) %>% 
  map_df(add_missing)

rbind(df, df_missing) %>% arrange(content_id, rank)

>   content_id rank recomendation_id             name
1           1    1                1  recomendation_1
2           1    2                2  recomendation_2
3           1    3               50 recomendation_50
4           1    4               51 recomendation_51
5           2    1                3  recomendation_3
6           2    2               50 recomendation_50
7           2    3               51 recomendation_51
8           2    4               52 recomendation_52
9           3    1                4  recomendation_4
10          3    2                5  recomendation_5
11          3    3                6  recomendation_6
12          3    4                7  recomendation_7
13          3    5                8  recomendation_8
14          3    6                9  recomendation_9

TechQA.

Complete and fill missing data in groups using another data frame

There are 2 answers

Related Questions in R

Related Questions in DPLYR

Related Questions in TIDYR

Related Questions in TIDYVERSE

Popular Questions

Popular Tags

Trending Questions