Find rows with overlapping ranges

387 views Asked by At

I have 3 large data frames that look like this:

library(tibble)

# Sample peaks, written row-wise: one row per peak with its start (coord1)
# and end (coord2) coordinate.
df1 <- tribble(
  ~peak,   ~coord1, ~coord2,
  "peak1",     100,     250,
  "peak2",     500,     700,
  "peak3",    1000,    1250
)

df2 <- tribble(
  ~peak,   ~coord1, ~coord2,
  "peak5",     120,     300,
  "peak6",     280,     400,
  "peak7",     900,    1850
)

df3 <- tribble(
  ~peak,    ~coord1, ~coord2,
  "peak8",      900,    2000,
  "peak9",     3000,    3400,
  "peak10",    5600,    5850
)

df1
#> # A tibble: 3 × 3
#>   peak  coord1 coord2
#>   <chr>  <dbl>  <dbl>
#> 1 peak1    100    250
#> 2 peak2    500    700
#> 3 peak3   1000   1250
df2
#> # A tibble: 3 × 3
#>   peak  coord1 coord2
#>   <chr>  <dbl>  <dbl>
#> 1 peak5    120    300
#> 2 peak6    280    400
#> 3 peak7    900   1850
df3
#> # A tibble: 3 × 3
#>   peak   coord1 coord2
#>   <chr>   <dbl>  <dbl>
#> 1 peak8     900   2000
#> 2 peak9    3000   3400
#> 3 peak10   5600   5850

I am relative new to R and I am trying to find the overlapping area within coordinates (coord1, coord2) that are unique to each data frame, overlap between two data frames, and overlap within all data frames.

I want these data frames as an ouptut. At the moment Its hard for me to find how to specify in R, dplyr that I want to filter based on the overlapping ranges. There is a command that I am missing

unique the ranges of these peaks do not overlap with the ranges of peaks of other data frames

> unique

peak    coord1  coord2
peak6    280     400
peak9    3000    3400
peak10   5600    5850

common between df1-df2

>df1df2 
peak       coord1  coord2
peak1       100     250
peak5       120     300
peak3      1000    1250
peak7       900    1850

common between df1-df3

peak       coord1  coord2
peak3       1000    1250
peak8       900   2000

and then common between df1-df2-df3

1

There are 1 answers

0
Scipione Sarlo On

To be honest, I don't understand which is the final goal of your search. At any rate, there is a solution that uses tidyverse approach and functions from ivs package in order to check vectors' intervals. It is not an elegant solution, and it does not consider overlapping vectors within the same data frame.

# load packages
library(tidyverse)
library(ivs)

Your data

# Each data frame carries an extra `range` column: an ivs interval vector
# built from coord1 (start) and coord2 (end). The overlap checks and the
# pivot_wider() calls further down all refer to this column, so it has to
# exist before they run.
# Note: iv() intervals are half-open, [coord1, coord2), so two peaks that
# merely touch at an endpoint are not reported as overlapping.
df1 <- tibble(
  peak = c("peak1", "peak2", "peak3"),
  coord1 = c(100, 500, 1000),
  coord2 = c(250, 700, 1250),
  range = iv(coord1, coord2)
)

df2 <- tibble(
  peak = c("peak5", "peak6", "peak7"),
  coord1 = c(120, 280, 900),
  coord2 = c(300, 400, 1850),
  range = iv(coord1, coord2)
)

df3 <- tibble(
  peak = c("peak8", "peak9", "peak10"),
  coord1 = c(900, 3000, 5600),
  coord2 = c(2000, 3400, 5850),
  range = iv(coord1, coord2)
)

Use of function iv_overlaps in order to create intervals

# Flag the rows of `query` whose `range` overlaps at least one `range` in
# `reference`, and tag every row with the name of the comparison.
flag_overlaps <- function(query, reference, label) {
  mutate(
    query,
    any_overlap = iv_overlaps(range, reference$range),
    check = label
  )
}

# One table per ordered pair of data frames ("A-B" = rows of A checked
# against the intervals of B).
check_df1_df2 <- flag_overlaps(df1, df2, "df1-df2")
check_df1_df3 <- flag_overlaps(df1, df3, "df1-df3")

check_df2_df1 <- flag_overlaps(df2, df1, "df2-df1")
check_df2_df3 <- flag_overlaps(df2, df3, "df2-df3")

check_df3_df1 <- flag_overlaps(df3, df1, "df3-df1")
check_df3_df2 <- flag_overlaps(df3, df2, "df3-df2")

Bind dataframes

# Stack the six pairwise checks into one long table. `df_check` numbers the
# inputs in the order listed here; `overlapping_sum` is, per peak, the number
# of comparisons in which that peak overlapped something.
# The result stays grouped by `peak` (the printed results show `# Groups`).
pairwise_checks <- list(
  check_df1_df2, check_df1_df3,
  check_df2_df1, check_df2_df3,
  check_df3_df1, check_df3_df2
)

final_conclusion <- pairwise_checks %>%
  bind_rows(.id = "df_check") %>%
  group_by(peak) %>%
  mutate(overlapping_sum = sum(any_overlap))

Check overlapping intervals between dataframes

# Peaks that overlap in at least one comparison, reshaped to one row per peak
# and one column per comparison; each cell holds the peak's own interval.
# The filter is per peak (overlapping_sum), so a kept peak also fills the
# columns of comparisons in which it did not overlap.
peaks_with_overlap <- filter(final_conclusion, overlapping_sum > 0)

overlapping <- pivot_wider(
  peaks_with_overlap,
  id_cols = peak,
  names_from = check,
  values_from = range
)

> overlapping

# A tibble: 5 × 7
# Groups:   peak [5]
  peak     `df1-df2`    `df1-df3`   `df2-df1`   `df2-df3`   `df3-df1`   `df3-df2`
  <chr>    <iv<dbl>>    <iv<dbl>>   <iv<dbl>>   <iv<dbl>>   <iv<dbl>>   <iv<dbl>>
1 peak1   [100, 250)   [100, 250)    [NA, NA)    [NA, NA)    [NA, NA)    [NA, NA)
2 peak3 [1000, 1250) [1000, 1250)    [NA, NA)    [NA, NA)    [NA, NA)    [NA, NA)
3 peak5     [NA, NA)     [NA, NA)  [120, 300)  [120, 300)    [NA, NA)    [NA, NA)
4 peak7     [NA, NA)     [NA, NA) [900, 1850) [900, 1850)    [NA, NA)    [NA, NA)
5 peak8     [NA, NA)     [NA, NA)    [NA, NA)    [NA, NA) [900, 2000) [900, 2000)

Check non overlapping interval between dataframes

# Peaks that overlap nothing in any comparison, reshaped to one row per peak
# and one column per comparison; each cell holds the peak's own interval.
peaks_without_overlap <- filter(final_conclusion, overlapping_sum == 0)

not_overlapping <- pivot_wider(
  peaks_without_overlap,
  id_cols = peak,
  names_from = check,
  values_from = range
)

> not_overlapping
# A tibble: 4 × 7
# Groups:   peak [4]
  peak    `df1-df2`  `df1-df3`  `df2-df1`  `df2-df3`    `df3-df1`    `df3-df2`
  <chr>   <iv<dbl>>  <iv<dbl>>  <iv<dbl>>  <iv<dbl>>    <iv<dbl>>    <iv<dbl>>
1 peak2  [500, 700) [500, 700)   [NA, NA)   [NA, NA)     [NA, NA)     [NA, NA)
2 peak6    [NA, NA)   [NA, NA) [280, 400) [280, 400)     [NA, NA)     [NA, NA)
3 peak9    [NA, NA)   [NA, NA)   [NA, NA)   [NA, NA) [3000, 3400) [3000, 3400)
4 peak10   [NA, NA)   [NA, NA)   [NA, NA)   [NA, NA) [5600, 5850) [5600, 5850)