Creating multiple new columns from using vectors of input

53 views Asked by At

I have huge datasets that I need to calculate lagged numbers on. The resulting dataset needs to have to for each day, columns for the number of 'look backs' requested. In my example, I look back in 5 day slices for 10 previous periods. I am trying to figure out how to generalize this to cover any number of requested time slice lengths and number of lookbacks (e.g., 365 days, for 5 previous years).

enter image description here

I have included a single station for 60 days and 3 code examples. The first example shows doing it by hard coding. If I have a hundred lookback periods, this gets to be tedious.

The second example shows how I could start to use parameters, for what it is worth.

The third (which doesn't work) shows how I am hoping to write some kind of statement that mutates across a vector of measures to create, using a second vector for the amount of lag. Maybe it is better to use {col} instead of specifying the names up front in a vector? I just can't figure out how to refer to a 'fixed' variable (the first roll-up, but creat a dynamic number of new variables by successively longer periods.

thanks !!!

pacman::p_load(tidyverse, RcppRoll)

# The input dataset
Daily <- structure(list(stnID = c(165638621686, 165638621686, 165638621686, 
165638621686, 165638621686, 165638621686, 165638621686, 165638621686, 
165638621686, 165638621686, 165638621686, 165638621686, 165638621686, 
165638621686, 165638621686, 165638621686, 165638621686, 165638621686, 
165638621686, 165638621686, 165638621686, 165638621686, 165638621686, 
165638621686, 165638621686, 165638621686, 165638621686, 165638621686, 
165638621686, 165638621686, 165638621686, 165638621686, 165638621686, 
165638621686, 165638621686, 165638621686, 165638621686, 165638621686, 
165638621686, 165638621686, 165638621686, 165638621686, 165638621686, 
165638621686, 165638621686, 165638621686, 165638621686, 165638621686, 
165638621686, 165638621686, 165638621686, 165638621686, 165638621686, 
165638621686, 165638621686, 165638621686, 165638621686, 165638621686, 
165638621686, 165638621686), date = structure(c(10957, 10958, 
10959, 10960, 10961, 10962, 10963, 10964, 10965, 10966, 10967, 
10968, 10969, 10970, 10971, 10972, 10973, 10974, 10975, 10976, 
10977, 10978, 10979, 10980, 10981, 10982, 10983, 10984, 10985, 
10986, 10987, 10988, 10989, 10990, 10991, 10992, 10993, 10994, 
10995, 10996, 10997, 10998, 10999, 11000, 11001, 11002, 11003, 
11004, 11005, 11006, 11007, 11008, 11009, 11010, 11011, 11012, 
11013, 11014, 11015, 11016), class = "Date"), measure = c(15.3333333333333, 
15.9791666666667, 16.65, 12.675, 9.32916666666667, 9.71041666666667, 
11.8916666666667, 11.9958333333333, 11.025, 10.94375, 11.3791666666667, 
9.04166666666667, 10.5604166666667, 10.8583333333333, 11.4083333333333, 
10.1979166666667, 10.19375, 13.1645833333333, 13.7604166666667, 
13.21875, 11.16875, 10.43125, 11.0604166666667, 13.4041666666667, 
14.0979166666667, 10.8521739130435, 8.54375, 5.44375, 8.06666666666667, 
9.77291666666667, 10.2676470588235, 11.5979166666667, 12.1375, 
11.7958333333333, 12.3916666666667, 12.7875, 13.4604166666667, 
10.7541666666667, 10.1979166666667, 10.9145833333333, 11.76875, 
13.6291666666667, 12.5, 10.9416666666667, 12.16875, 12.2229166666667, 
12.0541666666667, 11.69375, 11.05, 12.3229166666667, 12.1208333333333, 
11.5020833333333, 13.1770833333333, 11.3833333333333, 9.88, 10.9520833333333, 
11.275, 11.4208333333333, 11.3270833333333, 11.0104166666667)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -60L))

# The hand-code approach

    rolled1 <- Daily %>% group_by(stnID) %>% 
        mutate(measure_01 := roll_meanr(measure, 5, na.rm=TRUE)) %>% 
    
        mutate(measure_02 := lag(measure_01, 10)) %>% 
        mutate(measure_03 := lag(measure_01, 15)) %>% 
        mutate(measure_04 := lag(measure_01, 20)) %>% 
        mutate(measure_05 := lag(measure_01, 25)) %>% 
        mutate(measure_06 := lag(measure_01, 30)) %>% 
        mutate(measure_07 := lag(measure_01, 35)) %>% 
        mutate(measure_08 := lag(measure_01, 40)) %>% 
        mutate(measure_09 := lag(measure_01, 45)) %>% 
        mutate(measure_10 := lag(measure_01, 50)) 
    
# The parameter approach
    
    
periodN    <- 10 # number of lagged periods we want to create
periodSize <- 5  # of days back to look

varList <- paste0("measure_",sprintf("%02d", seq_along(1:periodN)))
    
    rolled2 <- Daily %>% group_by(stnID) %>% 
        mutate(!!varList[1] := roll_meanr(measure, periodSize, na.rm=TRUE)) %>% 
    
        mutate(!!varList[2] := lag(measure_01, periodSize * 2)) %>% 
        mutate(!!varList[3] := lag(measure_01, periodSize * 3)) %>% 
        mutate(!!varList[4] := lag(measure_01, periodSize * 4)) %>% 
        mutate(!!varList[5] := lag(measure_01, periodSize * 5)) %>% 
        mutate(!!varList[6] := lag(measure_01, periodSize * 6)) %>% 
        mutate(!!varList[7] := lag(measure_01, periodSize * 7)) %>% 
        mutate(!!varList[8] := lag(measure_01, periodSize * 8)) %>% 
        mutate(!!varList[9] := lag(measure_01, periodSize * 9)) %>% 
        mutate(!!varList[10]:= lag(measure_01, periodSize * 10))
    
# the vector approach (I have no idea what I am doing here)

varList <- paste0("measure_",sprintf("%02d", seq(2, periodN, 1)))
numList <- periodSize * seq(2, periodN, 1) 

    # I want to loop over the vectors which start from measure_02 to measure_10, and 10 to 50

    rolled3 <- Daily %>% group_by_stnID) %>% 
                mutate(!!varList[1] := roll_meanr(measure, periodSize, na.rm=TRUE)) %>% 
        mutate(across(varList) ~ lag(measure01, numList)) 

I am successful with the first two methods. I have looked up examples, but can't seem to find one that does this kind of thing.

Would I "create" the result columns as a prior step, putting the lag amount in each one, and then mutate across making each become be lag[target date, .x] ???

This hurts my head.

1

There are 1 answers

2
Tung On

Taken from this answer. Is it what you want?

library(data.table)
library(rlang)
library(tidyverse)

n_lags <- 50
step <- 5

lags <- function(var, n = 50, step = 5) {
  var <- enquo(var)
  indices <- seq(5, n, by = 5)
  
  # create a list of quosures by looping over `indices`
  # then give them names for `mutate` to use later
  map(indices, ~ quo(lag(!!var, !!.x))) %>%
    set_names(sprintf("measure_%02d", indices))
}

# unquote the list of quosures so that they are evaluated by `mutate`
Daily %>% 
  group_by(stnID) %>%
  mutate(measure_01 = frollmean(measure, n = 5, na.rm=TRUE)) %>% 
  mutate_at(vars(measure_01), funs(!!!lags(measure_01, n_lags, step)))
#> 
#> # A tibble: 60 × 14
#>           stnID date       measure measure_01 measure_05 measure_10 measure_15
#>           <dbl> <date>       <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
#>  1 165638621686 2000-01-01   15.3        NA         NA           NA         NA
#>  2 165638621686 2000-01-02   16.0        NA         NA           NA         NA
#>  3 165638621686 2000-01-03   16.6        NA         NA           NA         NA
#>  4 165638621686 2000-01-04   12.7        NA         NA           NA         NA
#>  5 165638621686 2000-01-05    9.33       14.0       NA           NA         NA
#>  6 165638621686 2000-01-06    9.71       12.9       NA           NA         NA
#>  7 165638621686 2000-01-07   11.9        12.1       NA           NA         NA
#>  8 165638621686 2000-01-08   12.0        11.1       NA           NA         NA
#>  9 165638621686 2000-01-09   11.0        10.8       NA           NA         NA
#> 10 165638621686 2000-01-10   10.9        11.1       14.0         NA         NA
#> # ℹ 50 more rows
#> # ℹ 7 more variables: measure_20 <dbl>, measure_25 <dbl>, measure_30 <dbl>,
#> #   measure_35 <dbl>, measure_40 <dbl>, measure_45 <dbl>, measure_50 <dbl>

Created on 2023-10-28 with reprex v2.0.2