Ordering data based on two variables but with a cut-off point

65 views Asked by At

I am able to construct a new data frame based on the data below where each row comprises the expectation values of each categorical variable in the ID column, taken in ascending order of time. But how can I do this up until a cut off point in time. For example, if I only want values to be taken in chronological order until time = 5.

library('dplyr')
library('purrr')
df <- read.csv("data.csv", header = TRUE)
# df
      ID Time Expectation
1  NJL.1    3         0.1
2  NJL.1    1         0.1
3  NJL.1    2         0.1
4  NJL.1    4         0.1
5  NJL.1    6         0.1
6  NJL.1    5       100.0
7  NJL.1   10         0.1
8  NJL.1    8         0.1
9  NJL.1    9         0.1
10 NJL.1    7         0.1
11 NJL.2   10         0.1
12 NJL.2    1         0.1
13 NJL.2    3         0.1
14 NJL.2    6         0.1
15 NJL.2    4         0.1
16 NJL.2    2         6.0
17 NJL.2    5         0.1
18 NJL.2    8         7.0
19 NJL.2    9         8.0
20 NJL.2    7         0.1
21 NJL.3    3         0.1
22 NJL.3    1         0.1
23 NJL.3    2         0.1
24 NJL.3    4         0.1
25 NJL.3    6         0.1
26 NJL.3    5        10.0
27 NJL.3   10         0.1
28 NJL.3    8         0.1
29 NJL.3    9         0.1
30 NJL.3    7         0.1

df <- df %>%
  group_by(ID) %>%
  summarise(var = list(Expectation[order(Time)]), 
            var_ts = purrr::map(var, ts))

So for example, for NJL.1, values would be (0.1, 0.1, 0.1, 0.1. 100) and all other expectation values are ignored.

Many thanks!

3

There are 3 answers

2
Wimpel On BEST ANSWER

a data.table approach

sample data

library(data.table)

setDT(df)
#or
df <- fread("row ID Time Expectation
1  NJL.1    3         0.1
2  NJL.1    1         0.1
3  NJL.1    2         0.1
4  NJL.1    4         0.1
5  NJL.1    6         0.1
6  NJL.1    5       100.0
7  NJL.1   10         0.1
8  NJL.1    8         0.1
9  NJL.1    9         0.1
10 NJL.1    7         0.1
11 NJL.2   10         0.1
12 NJL.2    1         0.1
13 NJL.2    3         0.1
14 NJL.2    6         0.1
15 NJL.2    4         0.1
16 NJL.2    2         6.0
17 NJL.2    5         0.1
18 NJL.2    8         7.0
19 NJL.2    9         8.0
20 NJL.2    7         0.1
21 NJL.3    3         0.1
22 NJL.3    1         0.1
23 NJL.3    2         0.1
24 NJL.3    4         0.1
25 NJL.3    6         0.1
26 NJL.3    5        10.0
27 NJL.3   10         0.1
28 NJL.3    8         0.1
29 NJL.3    9         0.1
30 NJL.3    7         0.1")

code

#set keys for sorting
setkey( df, ID, Time )

#filter values by group
ans <- df[ df[, .I[Time <= 5], by = ID]$V1 ]
#    row    ID Time Expectation
# 1:   2 NJL.1    1         0.1
# 2:   3 NJL.1    2         0.1
# 3:   1 NJL.1    3         0.1
# 4:   4 NJL.1    4         0.1
# 5:   6 NJL.1    5       100.0
# 6:  12 NJL.2    1         0.1
# 7:  16 NJL.2    2         6.0
# 8:  13 NJL.2    3         0.1
# 9:  15 NJL.2    4         0.1
# 10: 17 NJL.2    5         0.1
# 11: 22 NJL.3    1         0.1
# 12: 23 NJL.3    2         0.1
# 13: 21 NJL.3    3         0.1
# 14: 24 NJL.3    4         0.1
# 15: 26 NJL.3    5        10.0

Now you can easily summarise, paste+collapse, dcast, etc.. to get desired output.

Examples:

ans[, .(values = paste0( Expectation, collapse = "," ) ), by = ID ]
#       ID              values
# 1: NJL.1 0.1,0.1,0.1,0.1,100
# 2: NJL.2   0.1,6,0.1,0.1,0.1
# 3: NJL.3  0.1,0.1,0.1,0.1,10

or

dcast(ans, ID ~ Time, value.var = "Expectation")
#       ID   1   2   3   4     5
# 1: NJL.1 0.1 0.1 0.1 0.1 100.0
# 2: NJL.2 0.1 6.0 0.1 0.1   0.1
# 3: NJL.3 0.1 0.1 0.1 0.1  10.0
2
Chris Ruehlemann On

This subsets your dfto the desired values:

df[which(df$Time <= 5),]    

        row    ID Time Expectation
 1:   1 NJL.1    3         0.1
 2:   2 NJL.1    1         0.1
 3:   3 NJL.1    2         0.1
 4:   4 NJL.1    4         0.1
 5:   6 NJL.1    5       100.0
 6:  12 NJL.2    1         0.1
 7:  13 NJL.2    3         0.1
 8:  15 NJL.2    4         0.1
 9:  16 NJL.2    2         6.0
10:  17 NJL.2    5         0.1
11:  21 NJL.3    3         0.1
12:  22 NJL.3    1         0.1
13:  23 NJL.3    2         0.1
14:  24 NJL.3    4         0.1
15:  26 NJL.3    5        10.0

To order the dataframe, first save it, say, as dfnew:

dfnew <- df[df$Time <= 3 & df$Time <= 5,]

Then simply use order thus:

dfnew[order(ID, Time), ]

   row    ID Time Expectation
1:   2 NJL.1    1         0.1
2:   3 NJL.1    2         0.1
3:   1 NJL.1    3         0.1
4:  12 NJL.2    1         0.1
5:  16 NJL.2    2         6.0
6:  13 NJL.2    3         0.1
7:  22 NJL.3    1         0.1
8:  23 NJL.3    2         0.1
9:  21 NJL.3    3         0.1
0
jay.sf On

Assuming you indeed want the Times ordered as asked, in base R you can do

dat <- with(dat, {dat <- dat[Time <= 5, ];dat[order(ID, Time), ]})
dat
#       ID Time Expectation
# 2  NJL.1    1         0.1
# 3  NJL.1    2         0.1
# 1  NJL.1    3         0.1
# 4  NJL.1    4         0.1
# 6  NJL.1    5       100.0
# 12 NJL.2    1         0.1
# 16 NJL.2    2         6.0
# 13 NJL.2    3         0.1
# 15 NJL.2    4         0.1
# 17 NJL.2    5         0.1
# 22 NJL.3    1         0.1
# 23 NJL.3    2         0.1
# 21 NJL.3    3         0.1
# 24 NJL.3    4         0.1
# 26 NJL.3    5        10.0

Data

dat <- structure(list(ID = c("NJL.1", "NJL.1", "NJL.1", "NJL.1", "NJL.1", 
"NJL.1", "NJL.1", "NJL.1", "NJL.1", "NJL.1", "NJL.2", "NJL.2", 
"NJL.2", "NJL.2", "NJL.2", "NJL.2", "NJL.2", "NJL.2", "NJL.2", 
"NJL.2", "NJL.3", "NJL.3", "NJL.3", "NJL.3", "NJL.3", "NJL.3", 
"NJL.3", "NJL.3", "NJL.3", "NJL.3"), Time = c(3L, 1L, 2L, 4L, 
6L, 5L, 10L, 8L, 9L, 7L, 10L, 1L, 3L, 6L, 4L, 2L, 5L, 8L, 9L, 
7L, 3L, 1L, 2L, 4L, 6L, 5L, 10L, 8L, 9L, 7L), Expectation = c(0.1, 
0.1, 0.1, 0.1, 0.1, 100, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 
0.1, 6, 0.1, 7, 8, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 10, 0.1, 0.1, 
0.1, 0.1)), row.names = c(NA, -30L), class = "data.frame")