Cumulative sum ignoring breaks

265 views Asked by At

Modified data:

structure(list(hour = c(0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L), cs = c(0L, 0L, 0L, 0L, 
0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L
), cs_acum = c(0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 1L, 2L, 0L, 0L), cs_wanted = c(0L, 0L, 0L, 0L, 
0L, 1L, 2L, 3L, 0L, 0L, 4L, 5L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 
3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 0L, 0L
), cs_acum2 = c(0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 0L, 4L, 5L, 
0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 2L, 3L, 0L, 4L, 5L, 0L, 0L)), .Names = c("hour", "cs", "cs_acum", 
"cs_wanted", "cs_acum2"), class = c("data.table", "data.frame"
), row.names = c(NA, -36L), .internal.selfref = <pointer: 0x00000000001f0788>)

cs_acum is cumulative sum of cs with restart at 0.

df1$cs_acum <- with(df1, ave(df1$cs, cumsum(df1$cs == 0), FUN = cumsum))

I need this accumulation to continue if there is value of 1 in 5 rows of hour after the accumulation of 1's from cs has stopped.
Desired output is in col cs_wanted.

Further explanation: çs_acum is accumulation of hours (rows f cs) that meet certain criteria. After this, it has nothing to do with cs any more, it is then related to col: hour. The accumulation should continue if there is a value of 1 in 5 hour window after it has stopped.

Probably a new function checking five lines in hour from the position in cs_acum turns to 0, would be in order, continuing accumulation from where it has stopped in cs_acum.
Possible steps:
find position where accumulation stops
look at next five rows in hour
if there are values of 1, continue accumulation for that line,
look again in next five hours,
if there is no values of 1, do nothing.


New data:

df3 <- structure(list(hour = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 
                      cs = c(0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 
                      cs_acum = c(0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13), 
                      cs_acum2 = c(0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 8, 9, 10, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28)), 
                 .Names = c("hour", "cs", "cs_acum", "cs_acum2"), class = "data.frame", row.names = c(NA, -68L))
2

There are 2 answers

7
Jaap On

Using:

library(data.table)

rl <- rle(df1$hour)

setDT(df1)[, grp := rleid(rep(rl$lengths >5 & rl$values == 0, rl$lengths))
           ][hour == 1, cs_acum2 := cumsum(hour), grp
             ][is.na(cs_acum2), cs_acum2 := 0][]

gives:

    hour cs cs_acum cs_wanted grp cs_acum2
 1:    1  1       1         1   1        1
 2:    1  1       2         2   1        2
 3:    1  1       3         3   1        3
 4:    0  0       0         0   1        0
 5:    0  0       0         0   1        0
 6:    1  0       0         4   1        4
 7:    1  0       0         5   1        5
 8:    0  0       0         0   2        0
 9:    0  0       0         0   2        0
10:    0  0       0         0   2        0
11:    0  0       0         0   2        0
12:    0  0       0         0   2        0
13:    0  0       0         0   2        0
14:    1  1       1         1   3        1
15:    1  1       2         2   3        2
16:    1  1       3         3   3        3
17:    0  0       0         0   3        0

Explanation:

  • Convert the dataframe to a datatable with setDT(df1).
  • With rl <- rle(d1$hour) and grp := rleid(rep(rl$lengths >5 & rl$values == 0, rl$lengths)) you create a grouping variable that only changes when there are more than 5 zero's.
  • Next you filter by hour == 1 and create a get the cumulative sum with cumsum(hour). If your the values in hour are only 1's and 0's, you could also create a counter with seq_along or 1:.N which will give the same result.
  • Finally, with is.na(cs_acum2), cs_acum2 := 0 you change the NA's to zero's.

Update 1: For the new example data (df2):

rl2 <- rle(df2$hour)

setDT(df2)[, `:=` (rn = .I, grp = rleid(rep(rl2$lengths >5 & rl2$values == 0, rl2$lengths)))
           ][hour == 1 & rn >= df2[, .I[cs == 1]][1], cs_acum2 := cumsum(hour), grp
             ][is.na(cs_acum2), cs_acum2 := 0][, c('rn','grp') := NULL][]

which gives:

    hour cs cs_acum cs_wanted cs_acum2
 1:    0  0       0         0        0
 2:    1  0       0         0        0
 3:    1  0       0         0        0
 4:    1  0       0         0        0
 5:    0  0       0         0        0
 6:    1  1       1         1        1
 7:    1  1       2         2        2
 8:    1  1       3         3        3
 9:    0  0       0         0        0
10:    0  0       0         0        0
11:    1  0       0         4        4
12:    1  0       0         5        5
13:    0  0       0         0        0
14:    0  0       0         0        0
15:    0  0       0         0        0
16:    0  0       0         0        0
17:    0  0       0         0        0
18:    0  0       0         0        0
19:    1  1       1         1        1
20:    1  1       2         2        2
21:    1  1       3         3        3
22:    0  0       0         0        0

The way I understood it is that the cumsum of hour is only allowed to start after the first appearance of cs == 1.

Additional explanation:

  • With rn = .I you creat a rowindexnumber.
  • df2[, .I[cs == 1]][1] give you the rownumber where cs == 1 for the first time.
  • With rn >= df2[, .I[cs == 1]][1] you select only the rows from that point onward.

Update 2: With regard to the latest (fourth) dataset, you could do:

rl4 <- rle(df4$hour)

setDT(df4)[, grp := rleid(rep(rl4$lengths >5 & rl4$values == 0, rl4$lengths))]

i1 <- df4[, .I[cs == 1][1], grp][!is.na(V1)]$V1
i2 <- df4[, .I[1:.N==5], rleid(cs)]$V1[-1] + 1

df4[i1, cs.inc := 1
    ][i2, cs.inc := -1
      ][is.na(cs.inc), cs.inc := 0
        ][, cs.inc := cumsum(cs.inc)
          ][hour == 1 & cs.inc == 1, cs_acum3 := cumsum(hour), grp
            ][is.na(cs_acum3), cs_acum3 := 0][, c('grp','cs.inc') := NULL][]

which gives:

    hour cs cs_acum cs_wanted cs_acum2 cs_acum3
 1:    0  0       0         0        0        0
 2:    1  0       0         0        0        0
 3:    1  0       0         0        0        0
 4:    1  0       0         0        0        0
 5:    0  0       0         0        0        0
 6:    1  1       1         1        1        1
 7:    1  1       2         2        2        2
 8:    1  1       3         3        3        3
 9:    0  0       0         0        0        0
10:    0  0       0         0        0        0
11:    1  0       0         4        4        4
12:    1  0       0         5        5        5
13:    0  0       0         0        0        0
14:    0  0       0         0        0        0
15:    0  0       0         0        0        0
16:    0  0       0         0        0        0
17:    0  0       0         0        0        0
18:    0  0       0         0        0        0
19:    1  1       1         1        1        1
20:    1  1       2         2        2        2
21:    1  1       3         3        3        3
22:    0  0       0         0        0        0
23:    0  0       0         0        0        0
24:    0  0       0         0        0        0
25:    0  0       0         0        0        0
26:    0  0       0         0        0        0
27:    0  0       0         0        0        0
28:    0  0       0         0        0        0
29:    1  0       0         0        1        0
30:    1  0       0         0        2        0
31:    1  0       0         0        3        0
32:    0  0       0         0        0        0
33:    1  1       1         1        4        1
34:    1  1       2         2        5        2
35:    0  0       0         0        0        0
36:    0  0       0         0        0        0

Used data

First example dataset:

df1 <- structure(list(hour = c(1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L), 
                      cs = c(1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L), 
                      cs_acum = c(1L, 2L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L), 
                      cs_wanted = c(1L, 2L, 3L, 0L, 0L, 4L, 5L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L)),
                 .Names = c("hour", "cs", "cs_acum", "cs_wanted"), class = "data.frame", row.names = c(NA, -17L))

Second dataset:

df2 <- structure(list(hour = c(0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L),
                      cs = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L),
                      cs_acum = c(0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L),
                      cs_wanted = c(0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 0L, 4L, 5L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L)),
                 .Names = c("hour", "cs", "cs_acum", "cs_wanted"), class = "data.frame", row.names = c(NA, -22L))

Fourth dataset:

df4 <- structure(list(hour = c(0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L), 
                      cs = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L), 
                      cs_acum = c(0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 0L, 0L), 
                      cs_wanted = c(0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 0L, 4L, 5L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 0L, 0L), 
                      cs_acum2 = c(0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 0L, 4L, 5L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 0L, 4L, 5L, 0L, 0L)), 
                 .Names = c("hour", "cs", "cs_acum", "cs_wanted", "cs_acum2"), class = "data.frame", row.names = c(NA, -36L))
4
akrun On

We can try this using only data.table methods

library(data.table)
setDT(df1)[,  grp := shift(cumsum(hour == 1 & (Reduce(`+`, 
   shift(hour, 1:5, fill = 1, type = "lead"))==0)), fill=0)
     ][hour ==1, cs_acum1 := cumsum(hour) , grp
      ][is.na(cs_acum1), cs_acum1 := 0][, grp := NULL][]
#     hour cs cs_acum cs_wanted cs_acum1
# 1:    1  1       1         1        1
# 2:    1  1       2         2        2
# 3:    1  1       3         3        3
# 4:    0  0       0         0        0
# 5:    0  0       0         0        0
# 6:    1  0       0         4        4
# 7:    1  0       0         5        5
# 8:    0  0       0         0        0
# 9:    0  0       0         0        0
#10:    0  0       0         0        0
#11:    0  0       0         0        0
#12:    0  0       0         0        0
#13:    0  0       0         0        0
#14:    1  1       1         1        1
#15:    1  1       2         2        2
#16:    1  1       3         3        3
#17:    0  0       0         0        0

Explanation

We convert the 'data.frame' to 'data.table' (setDT(df1)), create a grouping variable using the lead values of 'hour' to create the condition in the OP's post, specify the 'i' (hour==1) grouped by 'grp' and assign (:=) the cumsum of 'hour' as 'cs_acum1', change the NA elements ito 0 and lastly remove the 'grp' by assigning it to NULL