Add/match rows with NA to matrix based on missing unique IDs

107 views Asked by At

I am using a panel data set and intent to model this as a dynamic affiliation network using SAOMs. The data is unfortunately very messy and a pain to deal with.

I have managed to create adjacency matrices for each panel wave. However, over time the panel grew in size / people left. I need the number of rows in each matrix to be the same and in the same order according to the unique IDs, which are present when inspecting the objects in R. All "added IDs" should show 10s across the whole row.

Here is a reproducible example that should make the issue clear and also shows what I aim for. I assume this can be solved by smart use of the merge() function, but I could not get it to work:

wave1 <- matrix(c(0,0,1,1,0,1,1,0,1,1), nrow = 5, ncol = 2, dimnames = list(c("1","2","4","5","9"), c("group1","group2")))
wave2 <- matrix(c(0,1,1,0,1,0,1,1), nrow = 4, ncol = 2, dimnames = list(c("1","4","8","9"), c("group1","group2")))

wave1_c <- matrix(c(0,0,1,1,10,0,1,1,0,0,10,1), nrow = 6, ncol = 2, dimnames = list(c("1","2","4","5","8","9"), c("group1","group2")))
wave2_c <- matrix(c(0,10,1,10,1,0,1,10,0,10,1,1), nrow = 6, ncol = 2, dimnames = list(c("1","2","4","5","8","9"), c("group1","group2")))

Thanks in advance. Numbers in the matrices are arbitrary except for the 10s.

3

There are 3 answers

0
clp On BEST ANSWER

Solution in base R using dataframes and merge.

Merge and outer join.

dwave1_c <- merge(wave1, wave2, by = 'row.names', all = TRUE, suffixes="")[2:3]
dwave2_c <- merge(wave2, wave1, by = 'row.names', all = TRUE, suffixes="")[2:3]
dwave1_c[is.na(dwave1_c)] <- 10
dwave2_c[is.na(dwave2_c)] <- 10

as.matrix(dwave1_c)
as.matrix(dwave2_c)

Update.

both <- merge(wave1, wave2, by = 'row.names', all = TRUE)

Output.

   Row.names group1.x group2.x group1.y group2.y
 1         1        0        1        0        1
 2         2        0        1       NA       NA
 3         4        1        0        1        0
 4         5        1        1       NA       NA
 5         8       NA       NA        1        1
 6         9        0        1        0        1

dwave1_c <- both[,2:3]; colnames(dwave1_c) <- colnames(wave1)
dwave2_c <- both[,4:5]; colnames(dwave2_c) <- colnames(wave2)
dwave1_c[is.na(dwave1_c)] <- 10
dwave2_c[is.na(dwave2_c)] <- 10

Show result.

as.matrix(dwave1_c)
as.matrix(dwave2_c)

First try.

## Convert matrix to dataframe.
df1 <- as.data.frame(wave1)
df2 <- as.data.frame(wave2)

## Merge df1 and df2 by row name.
m_df1_df2 <- merge(df1, df2, by = 'row.names', all = TRUE)
rownames(m_df1_df2) <- m_df1_df2$Row.names

# Rows not in df1, but in df2,
# rows not in df2, but in df1
not1_2 <- m_df1_df2[is.na(m_df1_df2$group1.x),][c("group1.x", "group2.x")] # not in df1, in df2
not2_1 <- m_df1_df2[is.na(m_df1_df2$group1.y),][c("group1.y", "group2.y")] # not in df2, in df1

## Same column names.   
colnames(not1_2) <- colnames(df1)
colnames(not2_1) <- colnames(df2)

## append
df1_c <- rbind(df1, not1_2)
df2_c <- rbind(df2, not2_1)

## order by row name
df1_c <- df1_c[order(row.names(df1_c)), ]
df2_c <- df2_c[order(row.names(df2_c)), ]

## replace NA by 10
df1_c[is.na(df1_c)] <- 10
df2_c[is.na(df2_c)] <- 10
as.matrix(df1_c)
as.matrix(df2_c)
0
clp On

The conversion of wave1,2 to data frames in my first attempt is redundant and can be omitted. However at the expense of implicit coercions.

## merge wave1 and wave2 by row name.
m_df1_df2 <- merge(wave1, wave2, by = 0, all = TRUE)
rownames(m_df1_df2) <- m_df1_df2$Row.names

# rows not in set 1, but in set 2,
# rows not in set 2, but in set 1.
not1_2 <- m_df1_df2[is.na(m_df1_df2$group1.x),][c("group1.x", "group2.x")]
not2_1 <- m_df1_df2[is.na(m_df1_df2$group1.y),][c("group1.y", "group2.y")]

## Same column names. 
colnames(not1_2) <- colnames(wave1)
colnames(not2_1) <- colnames(wave2)

## append.
wave1_c <- rbind(wave1, not1_2)
wave2_c <- rbind(wave2, not2_1)

## order by row name.
wave1_c <- wave1_c[order(row.names(wave1_c)), ]
wave2_c <- wave2_c[order(row.names(wave2_c)), ]

## replace NA by 10.
wave1_c[is.na(wave1_c)] <- 10
wave2_c[is.na(wave2_c)] <- 10

## show result.
wave1_c
wave2_c
0
clp On

Solution using setdiff.

## rownames not in set 1, but in set 2,
## rownames not in set 2, but in set 1.
rn_not2_1 <- setdiff(rownames(wave1), rownames(wave2))
rn_not1_2 <- setdiff(rownames(wave2), rownames(wave1))

## missing rows to add.
add_to_1 <- wave2[rn_not1_2,,drop=FALSE]
add_to_2 <- wave1[rn_not2_1,,drop=FALSE]
add_to_1[,] <- 10
add_to_2[,] <- 10

## append.
wave1_c <- rbind(wave1, add_to_1)
wave2_c <- rbind(wave2, add_to_2)

## order by row name.
wave1_c <- wave1_c[order(row.names(wave1_c)), ]
wave2_c <- wave2_c[order(row.names(wave2_c)), ]

## show result.
wave1_c
wave2_c