Create multiple columns with same prefix based on conditions from multiple columns

67 views Asked by At

I have data that look like this, with the dates of Birth and Death for patients.

ID      DOB       Birth_Year    DOD      Death_Year
1    2016-10-01     2016     2019-10-15     2019
2    2017-07-01     2019     2022-01-10     2022
3    2017-04-35     2017     2020-08-15     2020

And I need to create follow up time for each year from 2015 to 2020. If the follow-up time for the birth year is the difference between the DOB and the last date of the Birth Year. Similarly, the follow up time for the death year is the time between the first date of the death year to the date of death. Otherwise, the follow-up time is zero or one year. I am expecting an output dataframe with new columns Year_2015 to Year 2020 as below.

ID      DOB       Birth_Year    DOD      Death_Year Year_2015 Year_2016 Year_2017 Year_2018 Year_2019 Year_2020
1    2016-10-01     2016     2019-06-30     2019        0        0.25       1         1        0.5        0
2    2017-07-01     2019     2022-01-10     2022        0         0        0.5        1         1         1
3    2017-04-15     2017     2020-08-15     2020        0         0        0.3        1         1        0.8

I tried to used case_when in dplyr package, or loop function with if else statements. First I successfully created columns with same prefix Year_2015 to Year_2020, but failed to write loops that iterates through the columns based on multiple conditions. I had trouble referencing varying column names within loops. Or maybe there is a way to use apply function in R. Any help is appreciated!

for (i in 2015:2020) {
  FoUp_Year <- paste0('Year_', i) 
  df[, Birth_Year_end] <- make_date(year = Birth_Year, month = 12, day = 31)
  df[, Death_Year_start] <- make_date(year = Death_Year, month = 1, day = 1)
  if (i<df[, Birth_Year] | i>df[, Death_Year]) { 
    df[,FoUp_Year] <- 0
  }
  else if(i==df[, Birth_Year] && i<df[, Death_Year]) {
    df[,FoUp_Year] <- df[, Birth_Year_end]-df[,DOB]
  }
  else if(i==df[, Death_Year] && i>df[, Birth_Year]) {
    df[,FoUp_Year] <- df[, DOD]-df[,Death_Year_start]
  }
  else if(i==df[, Birth_Year] && i==df[, Death_Year]) {
    df[,FoUp_Year] <- df[, DOD]-df[,DOB]
  }
  else if(i>df[, Birth_Year] && i<df[, Death_Year]) {
    df[,FoUp_Year] <- 1
  }
}
1

There are 1 answers

0
marcguery On

The package lubridate will calculate the time difference between dates. Then the number of months (or days if you want more precision) between the dates can be divided by the total number of months in a year (or days, also taking into account leap years).

library(lubridate) #To calculate time intervals between dates

## Convert to Date format
dob_dod$DOB <- as.Date(dob_dod$DOB)
dob_dod$DOD <- as.Date(dob_dod$DOD)
## Years to process
mindate <- min(dob_dod$DOB)
maxdate <- max(dob_dod$DOD)
all_years <- as.Date(paste0(c(year(mindate):year(maxdate)),"-01-01"))

## Fraction of the years between 'DOB' and 'DOD', precision at the month level
frac_years_month <- data.frame(t(apply(dob_dod[,c("DOB","DOD")],1,function(x){
  time_period <- rep(0,length.out = length(all_years))
  yob <- year(x[1])
  yod <- year(x[2])
  yob_rank <- which(year(all_years)==yob)
  yod_rank <- which(year(all_years)==yod)
  time_period[yob_rank] <- (12-(interval(all_years[yob_rank], x[1]) %/% months(1)))/12
  time_period[yod_rank] <- (interval(all_years[yod_rank], x[2]) %/% months(1))/12
  time_period[c((yob_rank+1):(yod_rank-1))] <- 1
  return(time_period)
})))
colnames(frac_years_month) <- paste0("Year_",year(all_years))

## Fraction of the years between 'DOB' and 'DOD', precision at the day level
frac_years_day <- data.frame(t(apply(dob_dod[,c("DOB","DOD")],1,function(x){
  time_period <- rep(0, length.out = length(all_years))
  yob <- year(x[1])
  yod <- year(x[2])
  yob_rank <- which(year(all_years)==yob)
  yod_rank <- which(year(all_years)==yod)
  yob_days <- ifelse(leap_year(yob), 366, 365)
  yod_days <- ifelse(leap_year(yod), 366, 365)
  time_period[yob_rank] <- (yob_days-(interval(all_years[yob_rank], x[1]) %/% days(1)))/yob_days
  time_period[yod_rank] <- (interval(all_years[yod_rank], x[2]) %/% days(1))/yod_days
  time_period[c((yob_rank+1):(yod_rank-1))] <- 1
  return(time_period)
})))
colnames(frac_years_day) <- paste0("Year_",year(all_years))

## Use frac_years_month or frac_years_day depending on the level of precision desired
dob_dod <- cbind(dob_dod,frac_years_month)

Precision at the month level:

ID  DOB         Birth_Year  DOD         Death_Year  Year_2016  Year_2017  Year_2018  Year_2019  Year_2020  Year_2021  Year_2022
 1  2016-10-01        2016  2019-10-15        2019       0.25       1             1       0.75     0               0          0
 2  2017-07-01        2019  2022-01-10        2022       0          0.5           1       1        1               1          0
 3  2017-04-15        2017  2020-08-15        2020       0          0.75          1       1        0.5833          0          0

Precision at the day level:

ID  DOB         Birth_Year  DOD         Death_Year  Year_2016  Year_2017  Year_2018  Year_2019  Year_2020  Year_2021  Year_2022
 1  2016-10-01        2016  2019-10-15        2019     0.2514     1               1     0.7863     0               0     0
 2  2017-07-01        2019  2022-01-10        2022     0          0.5041          1     1          1               1     0.0247
 3  2017-04-15        2017  2020-08-15        2020     0          0.7151          1     1          0.6202          0     0

Raw data

dob_dod <- read.table(textConnection("ID      DOB       Birth_Year    DOD      Death_Year
1    2016-10-01     2016     2019-10-15     2019
2    2017-07-01     2019     2022-01-10     2022
3    2017-04-15     2017     2020-08-15     2020"),
                   h = T)