How to extract all perfect matches > 10 nucleotides in a pairwise alignement made with Biostrings in R?

63 views Asked by At

I have aligned 2 DNAStringSet, and use the Biostring package. I would like to extract all the portions of the sequences that match perfectly and are at least 10 nucleotides (nt) long, as well as the position of the match in both the pattern and the query sequence.

Reproducible example

library('Biostrings')
set.seed(123)
dna_alphabet <- DNA_ALPHABET[1:4]
seq1_random <- sample(dna_alphabet, 100, replace = TRUE)
seq2_random <- sample(dna_alphabet, 100, replace = TRUE)

match_12 <- sample(dna_alphabet, 12, replace = TRUE)
match_15 <- sample(dna_alphabet, 15, replace = TRUE)
match_23 <- sample(dna_alphabet, 23, replace = TRUE)

seq1_random[15:(15+11)] <- match_12
seq2_random[17:(17+11)] <- match_12 
seq1_random[41:(41+14)] <- match_15
seq2_random[40:(40+14)] <- match_15
seq1_random[70:(70+22)] <- match_23
seq2_random[73:(73+22)] <- match_23

seq1 <- DNAString(paste(seq1_random, collapse = ""))
seq2 <- DNAString(paste(seq2_random, collapse = ""))

alignment <- pairwiseAlignment(pattern = seq1, subject = seq2, type = "global")

alignment
Global PairwiseAlignmentsSingleSubject (1 of 1)
pattern: GGGCGCCCGATCCA--CCTATTCGTTTGTCGCACGTCAGGATATTTCACGTGAGTCATCACAAT----TGACAAGACGACCATTTCTAACATGAATTCACTAACGG
subject: ATCATCAGGTTCTGACCCTATTCGTTTGA---ATGCCCTCCCATTTCACGTGAGTCAAGGGAGGCCCGTGGCACTACGACCATTTCTAACATGAATTCGGTAT---
score: -138.1462 

Expected Result

> df_results
            perfect_match start_pos_pattern end_pos_pattern start_pos_subject end_pos_subject
1            CCTATTCGTTTG                15              26                17              28
2         ATTTCACGTGAGTCA                41              55                40              54
3 ACGACCATTTCTAACATGAATTC                70              92                73              95

My actual alignment is >15,000 nt long, so I need to find an efficient way to proceed. I am more familiar with the {tidyverse} than {data.table} options, if possible.

Current attempt:

What I found so far is: Extract the aligned sequences:

aligned_pattern_str <- as.character(alignment@pattern)
aligned_pattern_str
[1] "GGGCGCCCGATCCA--CCTATTCGTTTGTCGCACGTCAGGATATTTCACGTGAGTCATCACAAT----TGACAAGACGACCATTTCTAACATGAATTCACTAA"
aligned_subject_str <- as.character(alignment@subject)
aligned_subject_str
[1] "ATCATCAGGTTCTGACCCTATTCGTTTGA---ATGCCCTCCCATTTCACGTGAGTCAAGGGAGGCCCGTGGCACTACGACCATTTCTAACATGAATTCGGTAT"

Then: I use a rolling window to compare the first character in aligned_pattern_str and the first character in aligned_subject_str. If they don´t match, I compare the second of each etc until they match. Once they match: I keep comparing the next characters until there is a mismtach. Then, if the match is >10 nucleotide, I report it, otherwise Idont. And I keep proceeding until the entire sequences have been covered.

So my code to that step:

match_start <- NULL
match_length <- 0
matches <- list()


for (i in 1:nchar(aligned_pattern_str)) {
  if (substr(aligned_pattern_str, i, i) == substr(aligned_subject_str, i, i) && substr(aligned_pattern_str, i, i) != "-") &&  {
    if (is.null(match_start)) {
      match_start <- i
    }
    match_length <- match_length + 1
  } else {
    if (!is.null(match_start) && match_length > 10) {
      matches[[length(matches) + 1]] <- list(start = match_start, end = i - 1, length = match_length)
    }
    match_start <- NULL
    match_length <- 0
  }
}

if (!is.null(match_start) && match_length > 10) {
  matches[[length(matches) + 1]] <- list(start = match_start, end = nchar(aligned_pattern_str), length = match_length)
}

for (match in matches) {
  cat(sprintf("Match from %d to %d, Length: %d\n", match$start, match$end, match$length))
}

> 
Match from 17 to 28, Length: 12
Match from 43 to 57, Length: 15
Match from 76 to 98, Length: 23

Problems

It is not the final answer:

  1. at this point, I only have the position in the aligned pattern sequence. Which means that the mismatch that occured before the match, noted - -, are counted in the positions. So instead of having the positions 17-28, 40-54, and 73-95 in the subject sequence, the code returns 17-28, 43-57, and 76-98 (cf the expected results versus the results from the current lead).
  2. the current attempt does not provide the matched positions in the subject sequence.
  3. Finally, I don´t have a dataframe but a list at this point.
2

There are 2 answers

0
Mata On BEST ANSWER

Here is what I eventually found. It might not be the most efficient way to go,especially over long alignments, and it uses a loop, which I was told to avoid in R. But it gets the job done:

First, I extract the aligned sequences:

> aligned_pattern_str <- as.character(alignment@pattern)
> aligned_pattern_str 
[1] "GGGCGCCCGATCCA--CCTATTCGTTTGTCGCACGTCAGGATATTTCACGTGAGTCATCACAAT----TGACAAGACGACCATTTCTAACATGAATTCACTAA"
> aligned_subject_str <- as.character(alignment@subject)
> aligned_subject_str 
[1] "ATCATCAGGTTCTGACCCTATTCGTTTGA---ATGCCCTCCCATTTCACGTGAGTCAAGGGAGGCCCGTGGCACTACGACCATTTCTAACATGAATTCGGTAT"

I create 3 variables:

match_start <- NULL
match_length <- 0
matches <- list()

Then I create a function to correct the positions with the number of gaps:

count_gaps_before <- function(seq, pos) {
  sub_seq <- substr(seq, 1, pos)
  char_vec <- strsplit(sub_seq, "")[[1]]
  gap_count <- sum(char_vec == "-")
  return(gap_count)
  }

And I loop over the aligned sequences:

for (i in 1:nchar(aligned_pattern_str)) {
  if (substr(aligned_pattern_str, i, i) == substr(aligned_subject_str, i, i) && substr(aligned_pattern_str, i, i) != "-") {
    if (is.null(match_start)) {
      match_start <- i
    }
    match_length <- match_length + 1
  } else {
    if (!is.null(match_start) && match_length >= 10) {
      pattern_gaps <- count_gaps_before(aligned_pattern_str, match_start)
      subject_gaps <- count_gaps_before(aligned_subject_str, match_start)
      matching_sequence <- substr(aligned_pattern_str, match_start, match_start + match_length - 1)
      matching_sequence <- gsub("-", "", matching_sequence) # Remove gaps from the matching sequence
      matches[[length(matches) + 1]] <- list(
        start_pattern = match_start - pattern_gaps,
        end_pattern = (match_start + match_length - 1) - count_gaps_before(aligned_pattern_str, match_start + match_length - 1),
        start_subject = match_start - subject_gaps,
        end_subject = (match_start + match_length - 1) - count_gaps_before(aligned_subject_str, match_start + match_length - 1),
        length = match_length,
        sequence = matching_sequence
      )
    }
    match_start <- NULL
    match_length <- 0
  }
}


if (!is.null(match_start) && match_length >= 10) {
  pattern_gaps <- count_gaps_before(aligned_pattern_str, match_start)
  subject_gaps <- count_gaps_before(aligned_subject_str, match_start)
  matching_sequence <- substr(aligned_pattern_str, match_start, nchar(aligned_pattern_str))
  matching_sequence <- gsub("-", "", matching_sequence) 
  matches[[length(matches) + 1]] <- list(
    start_pattern = match_start - pattern_gaps,
    end_pattern = nchar(aligned_pattern_str) - count_gaps_before(aligned_pattern_str, nchar(aligned_pattern_str)),
    start_subject = match_start - subject_gaps,
    end_subject = nchar(aligned_subject_str) - count_gaps_before(aligned_subject_str, nchar(aligned_subject_str)),
    length = match_length,
    sequence = matching_sequence
  )
}

I then print the list of matches:

for (match in matches) {
  cat(sprintf("Match from %d to %d in pattern, from %d to %d in subject, Length: %d, Sequence: %s\n",
              match$start_pattern, match$end_pattern, match$start_subject, match$end_subject, match$length, match$sequence))
  }

And I transform the list into a data frame:

matches_df <- do.call(rbind, lapply(matches, function(x) {
     data.frame(
         sequence = x$sequence,
         from_in_pattern = x$start_pattern,
         to_in_pattern = x$end_pattern,
         from_in_subject = x$start_subject,
         to_in_subject = x$end_subject,
         stringsAsFactors = FALSE # Ensure character data remains as character type
     )
 }))

The results is:

> matches_df
                 sequence from_in_pattern to_in_pattern from_in_subject to_in_subject
1            CCTATTCGTTTG              15            26              17            28
2         ATTTCACGTGAGTCA              41            55              40            54
3 ACGACCATTTCTAACATGAATTC              70            92              73            95
4
jblood94 On

Here's an approach that converts every 10 nt to a single value and finds the matches using a data.table join. It should be pretty fast. Be aware that it may break down if the minimum match length gets too long (10 is well within what it can handle).

Set up the functions.

library(data.table)

fn <- function(x, n) {
  # function to convert rolling sequences to integers
  n1 <- n - 1
  y <- c(0, x*4^n1)
  xn <- numeric(length(x) - n1)
  xn[1] <- sum(x[1:n]*4^(n1:0))
  x <- x[-1:-n1]
  if (length(xn) > 1L) {
    for (i in 2:length(x)) xn[i] <- 4*(xn[i - 1] - y[i]) + x[i]
  }
  xn
}

seq.match <- function(n, seq1, seq2) {
  # main function to perform the matching
  n1 <- n - 1L
  
  setorder(
    data.table(
      end = n:length(seq1),
      seq10 = fn(log2(as.numeric(seq1)), n)
    )[ # join to find the matches
      data.table(
        end = n:length(seq2),
        seq10 = fn(log2(as.numeric(seq2)), n)
      ), on = "seq10", allow.cartesian = TRUE, nomatch = FALSE
    ][,d := i.end - end], end, d
  )[ # roll up sequential matches
    ,.(
      seq_match = toString(subseq(seq1, end[1] - n1, end[.N])),
      start1 = end[1] - n1,
      end1 = end[.N],
      start2 = i.end[1] - n1,
      end2 = i.end[.N]
    ), cumsum(c(0L, diff(end) != 1L | diff(i.end) != 1L))
  ][,cumsum := NULL]
}

Example usage:

seq.match(10L, seq1, seq2)[]
#>                  seq_match start1  end1 start2  end2
#>                     <char>  <int> <int>  <int> <int>
#> 1:            CCTATTCGTTTG     15    26     17    28
#> 2:         ATTTCACGTGAGTCA     41    55     40    54
#> 3: ACGACCATTTCTAACATGAATTC     70    92     73    95

Time a larger example.

set.seed(984588177)

seq1 <- DNAString(paste(sample(dna_alphabet, 1e3, 1), collapse = ""))
seq2 <- DNAString(paste(sample(dna_alphabet, 15e3, 1), collapse = ""))

system.time(dt <- seq.match(10L, seq1, seq2))
#>    user  system elapsed 
#>    0.02    0.00    0.02
dt[]
#>       seq_match start1  end1 start2  end2
#>          <char>  <int> <int>  <int> <int>
#> 1:  GGGGGACACAG     45    55  12229 12239
#> 2:   TAGTTCTTTG    142   151   9557  9566
#> 3:   GGGTTTGTTA    179   188   7912  7921
#> 4:  GTTAATCACAG    185   195   7742  7752
#> 5: ATTCCTCTTTAG    311   322   8207  8218
#> 6:   GTATCACCGG    355   364   1149  1158
#> 7:   GGACTCACCC    517   526   5548  5557
#> 8:  CGAGACTTACA    640   650  10134 10144
#> 9:   TGTGGACTAG    988   997   7807  7816