Quickly binning values within R

68 views Asked by At

I have a massive data frame in R that looks something like this:

Dataframe 1

ID start stop
+ 3 5
- 9 13

I also have another dataframe that looks something like this:

Dataframe 2

ID name position
+ hello 4
+ there 7
- my 11
+ name 12

What I want to do is for every row in dataframe 1, I want to determine it's "distance" within a given range or print NA if it doesn't fall into a range or the "ID" (+/-) doesn't match. So the output would be something like:

ID name position relative_position
- hello 4 2
+ there 7 NA
- my 12 3
+ name 12 NA

I currently have a relatively long loop that uses apply() and a few layers of if functions to run through this process. The problem I'm having is that I need to do this for a multiple REALLY big dataframes (gigabytes in size), so efficiency is key. What's the most efficient way of going through this process?

I've also played around with the cut() function but I can't figure out how to print the results I want or how to make intervals discontinuous. Even if it works I don't know if it would be more efficient.

2

There are 2 answers

0
Sophie On BEST ANSWER

Update

I have now the full code. It works even if there is overlap of intervals. In this case, the algorithm finds the closest interval by default. If the option closestOnly = FALSE, then it looks farther away if id does not match.

library(data.table)
library(microbenchmark)

findDistance <- function(intervals, values, closestOnly = TRUE, verbose = FALSE)
{
    ## The trick is to sort by position everything,
    ## `type` is 0 if it is an interval start position,
    ##           1 = interval end position, and
    ##           2 = is a position in `values`
    ## `id` is the index (row number) either in `intervals` or `values`
    n.interval <- nrow(intervals)
    dt <- rbind(data.table(id = 1:n.interval, pos = intervals$start, type = 0),
                data.table(id = 1:nrow(values), pos = values$pos, type = 2),
                data.table(id = 1:n.interval, pos = intervals$end, type = 1)
                )
    ## The sorting algorithm needs to be stable (original order is preserved for ties)
    setorder(dt, pos)
    len <- nrow(dt)

    values[, relative_position := as.integer(NA)]
    ## A stack (behaving like a set on pop(end)) that will contain the interval row number
    depth <- 0
    stack <- c()

     ## loop over the rows of `dt`
    for (i in 1:len)
    {
        
        if ( 0 == dt$type[i] ) ## start
        {
            depth <- depth + 1
            ## add on stack the row numbre of the interval
            stack[depth] <- dt$id[i]
        }
        else if ( 1 == dt$type[i] && 0 < depth ) ## end
        {
            ## pop from the stack
            depth <- depth - 1
            ## Remove the interval from the stack
            ## setdiff preserve element position in the vector
            ## exactly what we want for a "stack" behaving like a set on removal
            stack <- setdiff(stack, dt$id[i])
        }
        else if ( 2 == dt$type[i]) ## Value
        {
            if (verbose) cat("=====\nThe value:\n")
            value.id <- dt$id[i]
            if (verbose) print(values[value.id,])
            if ( 0 < depth )
            {
                
                ## the intervals on the `stack` all contain the current value
                n.depth <- if (closestOnly) depth else 1
                for (i.depth in depth:n.depth)
                {
                    index <- stack[i.depth]
                    if (values[value.id,id] != intervals[index,id])
                    {
                        if (verbose) cat(paste("\nstack depth", depth - i.depth, "is not matching\n"))
                    } else
                    {
                        distance <- values[value.id, pos] - intervals[index, start] + 1
                        if (verbose) cat("\nis inside this interval (most imbricated):\n")
                        if (verbose) print(intervals[index,])
                        if (verbose) print(paste("Distance =", distance))
                        values[value.id, relative_position := distance]
                        break
                    }
                }
            } else {
                if (verbose) cat("\nis outside any interval\n")
            }
        } else {
            warning("should not be here")
            break
        }
        ## print(stack)
    }
    
    return(values)
}

Wimpel is really fast but it does not necessary find the closest distance. This is illustrated by this example:

findDistanceWimpel <- function (intervals, values)
{
    values[intervals, relative_position := x.pos - i.start + 1, on = .(id, pos >= start, pos <= end)]
}

## DF1
intervals <- data.table(id = c('-', '-'),
                        start = c(2, 1),
                        end = c(5, 16))
## DF2
values <- data.table(id = c('-'),
                     name = c("test"),
                     pos = c(4))


res <- copy(findDistance(intervals, values, closestOnly = FALSE))
resW <- findDistanceWimpel(intervals, values)[]
merge(res, resW, by= c("id", "name", "pos"))[]
#>   id name pos relative_position.x relative_position.y
#>1:  - test   4                   3                   4

If overlapping intervals is fine as for Wimpel method, go for it as performance is a lot better. But if you need better interval overlapping solution, this algorithm is for you.

For speed comparison,

## DF1
n.intervals <- 1000
n.range <- n.intervals %/% 4
start <- sample.int(n.range, n.intervals, replace = TRUE)
end <- start + sample.int(10, n.intervals, replace = TRUE) - 1
intervals <- data.table(id = c('+', '-')[sample.int(2, n.intervals, replace = TRUE)],
                        start = start,
                        end = end)

## DF2
n.values <- 5000
values <- data.table(id =  c('+', '-')[sample.int(2, n.values, replace = TRUE)],
                     name = paste0("name", 1:n.values),
                     pos = sample.int(n.range, n.values, replace = TRUE))


microbenchmark({ findDistance(intervals, values, closestOnly = FALSE) },
               { findDistanceWimpel(intervals, values) },
               times = 3)
#> Unit: milliseconds
#>                                                         expr         min           lq        mean      median          uq         max neval cld
#> {     findDistance(intervals, values, closestOnly = FALSE) }  8844.04990 98920.261426 8960.048019 8996.472944 9018.047074 9039.621204     3  a
#>                {     findDistanceWimpel(intervals, values) }    5.341248     5.413585    6.388424    5.485922    6.912013    8.338103     3   b

A reimplementation of my algorithm in C (what data.table does under the hood), should reach similar performance (or better).

Old post

It is mainly a problem of finding if many positions are in intervals. This is an algorithmic issue.

Here is an efficient algorithm to find for all position the containing intervals. I leave in exercise the computation of distance and the proper formatting of the result ;-) (it should be easy from this canvas).

library(data.table)

## DF1
intervals <- data.table(id = c('+', '-'),
                        start = c(3, 9),
                        end = c(5, 13))

## DF2
values <- data.table(id = c('-', '+', '-', '+'),
                     name = c("hello", "there", "my", "name"),
                     pos = c(4, 7, 12, 12))

## The trick is to sort by position everything,
## `type` is 0 if it is an interval start position,
##           1 = interval end position, and
##           2 = is a position in `values`
## `id` is the index (row number) either in `intervals` or `values`
n.interval <- nrow(intervals)
dt <- rbind(data.table(id = 1:n.interval, pos = intervals$start, type = 0),
            data.table(id = 1:n.interval, pos = intervals$end, type = 1),
            data.table(id = 1:nrow(values), pos = values$pos, type = 2))
setorder(dt, pos)
len <- nrow(dt)

## A stack that will contain the interval row number
depth <- 0
stack <- numeric(16384)

## loop over the rows of `dt`                     
for (i in 1:len)
{
    if ( 0 == dt$type[i] ) ## start
    {
        depth <- depth + 1
        ## add on stack the row numbre of the interval
        stack[depth] <- dt$id[i]
    }
    else if ( 1 == dt$type[i] && 0 < depth ) ## end
    {
        ## pop from the stack
        depth <- depth - 1
    }
    else if ( 2 == dt$type[i]) ## Value
    {
        cat("=====\nThe value:\n")
        value.id <- dt$id[i]
        print(values[value.id,])
        if ( 0 < depth )
        {
            ## the intervals on the `stack` all contain the current value
            index <- stack[depth]
            cat("\nis inside this interval (most imbricated):\n")
            print(intervals[index,])
        } else {
            cat("\nis outside any interval\n\n")
        }
    } else {
        warning("should not be here")
        break
    }
}

EDIT: Careful, the algorithm only works properly if start and end are well imbricated. (1, 5) and (3, 7) are not properly imbricated (3 is in (1, 5) and 7 outside). The pop on 5 from the stack will remove the interval (3, 7). If the interval are not properly imbricated, the stack should become a set and the pop on an interval end should remove the matching start.

0
Wimpel On

here is a simple data.table approach...

also, I suspect your desired output is wrong, since - hello 4 does not fall in an interval (the +- ID does not match)

library(data.table)
# use setDT() if df1 and df2 are not already data.table
# setDT(df);setDT(df2)

df2[df1, rel_pos := x.position - i.start + 1, on = .(ID, position >= start, position <= stop)][]
#    ID  name position rel_pos
# 1:  - hello        4      NA
# 2:  + there        7      NA
# 3:  -    my       11       3
# 4:  +  name       12      NA

sample data

# sample data
df1 <- fread(text = "ID     start   stop
+   3   5
-   9   13")

df2 <- fread(text = "ID     name    position
-   hello   4
+   there   7
-   my  11
+   name    12")