Reshuffling a column by group many times and put the results in new columns

97 views Asked by At

I have the following hypothetical data:

district <- c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,3,3)                                       
village <- c(1,2,3,4,1,2,3,4,5,1,2,3,4,5,6,7)                              
status <- c(1,0,1,0,1,1,1,0,0,1,1,1,1,0,0,0)
datei <- data.table(district, village, status) 

I want to reshuffle status based on district and put the results in new columns. I know how to do it once using the following codes:

datei[, randomstat := sample(status), district]

Now, I want to reshuffle status 1000 times and put the results in new columns. I tried the following codes:

n <- 1000
datei[, paste0("randomstat", 1:n) := replicate(n, list(sample(status), district))]

but failed. Can someone help me with this? Thank you.

3

There are 3 answers

0
ThomasIsCoding On BEST ANSWER

You had a typo in your code: district is put inside list, i.e., list(sample(status), district)), which is not correct.


You have the follow options:

  • Enable simplify = FALSE when using replicate, e.g.,
datei[, paste0("randomstat", 1:n) := replicate(n, sample(status), simplify = FALSE), district]
  • Or, wrap sample(status) with list()
datei[, paste0("randomstat", 1:n) := replicate(n, list(sample(status))), district]
0
jblood94 On

A few vectorized options using the Rfast package if performance is a concern:

library(Rfast)

n <- 1e3

microbenchmark::microbenchmark(
  replicate = dt[,paste0("randomstat", 1:n) := replicate(n, sample(status), FALSE), district],
  reshuffle = dt[,paste0("randomstat", 1:n) := lapply(1:n, \(i) reshuffle(status, district))],
  colShuffle = dt[,paste0("randomstat", 1:n) := as.data.frame(colShuffle(matrix(rep(status, n), .N, n))), district],
  colRanks = dt[,paste0("randomstat", 1:n) := as.data.frame(matrix(status[colRanks(matrix(runif(.N*n), .N, n))], .N, n)), district],
  colRanksAll = dt[,paste0("randomstat", 1:n) := as.data.frame(matrix(status[colRanks(matrix(runif(.N*n), .N, n) + match(district, unique(district)))], .N, n))],
  setup = {dt <- copy(datei)}
)
#> Unit: milliseconds
#>         expr      min        lq       mean    median        uq      max neval
#>    replicate  15.1496  17.55985  21.687518  20.56755  24.95855  37.4373   100
#>    reshuffle 120.1781 135.82605 153.436604 146.41200 160.21505 242.8556   100
#>   colShuffle   4.5537   4.94215   6.118132   5.11265   5.82665  19.3036   100
#>     colRanks   5.7100   6.08095   7.730242   6.40530   8.20650  19.7137   100
#>  colRanksAll   5.0109   5.35780   7.171746   5.59560   8.67885  17.7488   100

Timing a larger dataset:

set.seed(2037213561)

datei <- data.table(district = rep(1:1e3, sample(10, 1e3, 1)))[
  ,`:=`(village = rowid(district), status = runif(.N)%/%0.5)
]

microbenchmark::microbenchmark(
  replicate = dt[,paste0("randomstat", 1:n) := replicate(n, sample(status), FALSE), district],
  reshuffle = dt[,paste0("randomstat", 1:n) := lapply(1:n, \(i) reshuffle(status, district))],
  colShuffle = dt[,paste0("randomstat", 1:n) := as.data.frame(colShuffle(matrix(rep(status, n), .N, n))), district],
  colRanks = dt[,paste0("randomstat", 1:n) := as.data.frame(matrix(status[colRanks(matrix(runif(.N*n), .N, n))], .N, n)), district],
  colRanksAll = dt[,paste0("randomstat", 1:n) := as.data.frame(matrix(status[colRanks(matrix(runif(.N*n), .N, n) + match(district, unique(district)))], .N, n))],
  setup = {dt <- copy(datei)},
  times = 10
)
#> Unit: milliseconds
#>         expr       min        lq      mean     median         uq        max neval
#>    replicate 5602.0342 6649.0791 7237.3350  7318.1749  7641.8868  8434.8158    10
#>    reshuffle 7348.9950 8237.6315 9946.4001 10728.3559 11320.5539 11680.5648    10
#>   colShuffle 1521.4851 1578.3194 2024.9663  2181.7678  2312.6861  2368.4342    10
#>     colRanks 2063.0544 2633.7768 2784.5394  2868.1073  3097.8065  3232.0499    10
#>  colRanksAll  374.6916  403.4064  467.1745   478.0322   521.1787   580.5176    10
0
TarJae On

Or you could create your own shuffle function, here is an example for 10 columns:

n <- 10

reshuffle <- function(status, district) {
  unlist(mapply(function(s, d) sample(s), 
                split(status, district), 
                split(district, district), 
                SIMPLIFY = FALSE))
}

datei[, (paste0("randomstat", 1:n)) := lapply(1:n, \(i) shuffle(status, district))]

datei
   district village status randomstat1 randomstat2 randomstat3 randomstat4 randomstat5 randomstat6 randomstat7 randomstat8 randomstat9 randomstat10
 1:        1       1      1           1           1           1           1           1           0           1           1           1            1
 2:        1       2      0           0           0           0           0           0           0           0           0           0            0
 3:        1       3      1           1           0           0           1           1           1           0           0           0            1
 4:        1       4      0           0           1           1           0           0           1           1           1           1            0
 5:        2       1      1           1           1           1           0           0           1           1           1           1            1
 6:        2       2      1           0           0           1           0           1           0           0           1           0            0
 7:        2       3      1           1           1           0           1           1           0           1           1           1            1
 8:        2       4      0           0           1           0           1           1           1           1           0           1            0
 9:        2       5      0           1           0           1           1           0           1           0           0           0            1
10:        3       1      1           0           0           0           1           1           0           1           1           1            1
11:        3       2      1           1           0           0           1           1           1           1           0           1            1
12:        3       3      1           0           0           0           0           1           1           1           1           1            0
13:        3       4      1           1           1           1           0           0           1           1           0           0            0
14:        3       5      0           0           1           1           1           0           0           0           1           0            1
15:        3       6      0           1           1           1           0           1           1           0           1           1            1
16:        3       7      0           1           1           1           1           0           0           0           0           0            0