Network D3 Sankey Diagrams link dataframe creation: bypassing nodes with NA, and automatic correction of values?

26 views Asked by At

I am trying to create a Sankey Diagram using NetworkD3 to map out patient flow through an A&E department, with example dataframe:

`First_Contact <- c("UTC", "UTC", "111", "111")Second_Contact <- c(NA, "ED - ED RV", "UTC", "UTC")Third_Contact <- c(NA, NA, "ED - ED RV", "ED - ED RV")Final_Pathway_Outcome <- c("Discharged", "Discharged", "Discharged", "Discharged")

df <- data.frame(First_Contact, Second_Contact, Third_Contact, Final_Pathway_Outcome)

Where NAs mean the patient did not need to go through further patient contacts before discharge, i.e. row 1 patient went to UTC and then was discharged)

I am following a codethrough on https://rpubs.com/droach/CPP526-codethrough to good effect with the code below, but am encountering two issues:

  1. Where patients reach their Final Pathway Outcome without needing to go through second-third contacts, my original dataframe has their journey as UTC -> NA -> NA -> Discharged. filter(!is.na(target)) is filtering out rows where NA is the target, but in my pivotted table, I am currently manually replacing NAs in the source column with the appropriate target from the previous row. Is there a way to do this more elegantly?

  2. My dataset will be much larger than the above example, and this code currently treats each row separately, even the duplicate row. Is there a way to aggregate these automatically and adjust the value accordingly?

##Adding row numbers and pivoting data
links.df <- df %>%
    mutate(row = row_number()) %>%
    pivot_longer(cols= -row, names_to="column", values_to="source")
##Creating target column and specifying link order
links.df <- links.df %>%
    mutate(column= match(column, names(trial))) %>%
    group_by(row) %>%
    mutate(target= lead(source, order_by= column)) %>%
    filter(!is.na(target)) %>%
    ungroup()
##Differentiating between areas in each contact
links.df <- links.df %>%
    mutate(source = paste0(source, "", column)) %>%
    mutate(target= paste0(target, "", column+1)) %>%
    select(row, column, source, target)
##Extra modification to swap the value of "NAs" with the target from the previous row. Currently doing this manually
links.df[2,3] <- "UTC_2"links.df[5,3] <- "ED - ED RV_3"
##Creating data frame for nodes
nodes.df <- data.frame(name=unique(c(links.df$source, links.df$target)))nodes.df$label <- sub('_[0-9]*$', '', nodes.df$name)
##Providing instructions for Sankey Diagram (source and target ids)
links.df$source_id <- match(links.df$source, nodes.df$name) -1
links.df$target_id <- match(links.df$target, nodes.df$name) -1
links.df$value <- 1
##Plotting Sankey
sankeyNetwork(Links= links.df,Nodes = nodes.df,Source= 'source_id',Target= 'target_id',Value= 'value',NodeID= 'label',fontSize= 16,iterations=0)`

reproducible code:

library(dplyr)
library(tidyr)
library(networkD3)

First_Contact <- c("UTC", "UTC", "111", "111")
Second_Contact <- c(NA, "ED - ED RV", "UTC", "UTC")
Third_Contact <- c(NA, NA, "ED - ED RV", "ED - ED RV")
Final_Pathway_Outcome <- c("Discharged", "Discharged", "Discharged", "Discharged")

df <- data.frame(First_Contact, Second_Contact, Third_Contact, Final_Pathway_Outcome)

##Adding row numbers and pivoting data
links.df <- df %>%
  mutate(row = row_number()) %>%
  pivot_longer(cols= -row, names_to="column", values_to="source")

##Creating target column and specifying link order
links.df <- links.df %>%
  mutate(column= match(column, names(df))) %>%
  group_by(row) %>%
  mutate(target= lead(source, order_by= column)) %>%
  filter(!is.na(target)) %>%
  ungroup()

##Differentiating between areas in each contact
links.df <- links.df %>%
  mutate(source = paste0(source, "", column)) %>%
  mutate(target= paste0(target, "", column+1)) %>%
  select(row, column, source, target)

##Extra modification to swap the value of "NAs" with the target from the previous row. Currently doing this manually
links.df[2,3] <- "UTC_2"
links.df[5,3] <- "ED - ED RV_3"

nodes.df <- 
  data.frame(
    name = unique(c(links.df$source, links.df$target)),
    label = unique(c(links.df$source, links.df$target))
  )

##Providing instructions for Sankey Diagram (source and target ids)
links.df$source_id <- match(links.df$source, nodes.df$name) -1
links.df$target_id <- match(links.df$target, nodes.df$name) -1
links.df$value <- 1

##Plotting Sankey
sankeyNetwork(Links= links.df,Nodes = nodes.df,Source= 'source_id',Target= 'target_id',Value= 'value',NodeID= 'label',fontSize= 16,iterations=0)

1

There are 1 answers

0
CJ Yetman On

tidy::fill() is a convenient way to fill in NA values in a data.frame column with previous value (up or down).

dplyr::summarise() can be used to aggregate duplicate links and set the value with dplyr::n()

I also added the column number to the source name before filling in the target and source columns to maintain the order/position of nodes.

library(dplyr)
library(tidyr)
library(networkD3)

First_Contact <- c("UTC", "UTC", "111", "111")
Second_Contact <- c(NA, "ED - ED RV", "UTC", "UTC")
Third_Contact <- c(NA, NA, "ED - ED RV", "ED - ED RV")
Final_Pathway_Outcome <- c("Discharged", "Discharged", "Discharged", "Discharged")

df <- data.frame(First_Contact, Second_Contact, Third_Contact, Final_Pathway_Outcome)

links.df <-
  df %>%
  mutate(row = row_number()) %>%
  pivot_longer(cols = -row, names_to = "column", values_to = "source") %>% 
  mutate(column = match(column, names(df))) %>%
  mutate(source = ifelse(!is.na(source), paste0(source, "-", column), NA)) %>%
  group_by(row) %>%
  mutate(target = lead(source, order_by = column)) %>% 
  fill(source, .direction = "down") %>% 
  filter(!is.na(target)) %>%
  ungroup() %>% 
  summarise(value = n(), .by = c("source", "target"))

nodes.df <- data.frame(name = unique(c(links.df$source, links.df$target)))
nodes.df <- mutate(nodes.df, label = sub("-[0-9]*$", "", name))

links.df$source_id <- match(links.df$source, nodes.df$name) -1
links.df$target_id <- match(links.df$target, nodes.df$name) -1

df
#>   First_Contact Second_Contact Third_Contact Final_Pathway_Outcome
#> 1           UTC           <NA>          <NA>            Discharged
#> 2           UTC     ED - ED RV          <NA>            Discharged
#> 3           111            UTC    ED - ED RV            Discharged
#> 4           111            UTC    ED - ED RV            Discharged
links.df
#> # A tibble: 6 × 5
#>   source       target       value source_id target_id
#>   <chr>        <chr>        <int>     <dbl>     <dbl>
#> 1 UTC-1        Discharged-4     1         0         5
#> 2 UTC-1        ED - ED RV-2     1         0         1
#> 3 ED - ED RV-2 Discharged-4     1         1         5
#> 4 111-1        UTC-2            2         2         3
#> 5 UTC-2        ED - ED RV-3     2         3         4
#> 6 ED - ED RV-3 Discharged-4     2         4         5
nodes.df
#>           name      label
#> 1        UTC-1        UTC
#> 2 ED - ED RV-2 ED - ED RV
#> 3        111-1        111
#> 4        UTC-2        UTC
#> 5 ED - ED RV-3 ED - ED RV
#> 6 Discharged-4 Discharged

sankeyNetwork(
  Links = links.df,
  Nodes = nodes.df,
  Source = 'source_id',
  Target = 'target_id',
  Value = 'value',
  NodeID = 'label',
  fontSize = 16,
  iterations = 0,
  sinksRight = FALSE
)