Printing values following comparison of two csv files only if in a specific range using Python 3.3

103 views Asked by At

I'm new at programming and I've got two CSV files that I'm trying to compare. The first file, snp.csv is shown below:

chrom   position    ref var gene        var
1       21421       G   T   WASH7P      snp.LOH
1       1251593     T   C   CPSF3L      snp.somatic
6       107474777   -   A   PDSS2       indel.somatic
14      106586168   G   T   ADAM6       snp.LOH

The second file, quad.csv is shown below:

chrom   Start   End     Sequence
1       21420   21437   GGGACGGGGAGGGTTGGG
1       23058   23078   GGGCTGGGGCGGGGGGAGGG
1       23515   23534   GGGAAGGGACAGGGCAGGG
1       45098   45118   GGGAAAGGGCAGGGCCCGGG
3       1148    1173    GGGCCGGGCAAGGCCGGGTGCAGGG

I want to compare these two files and, where the chrom values match, print only the rows whose position value (in snp.csv) falls within the range of the Start and End values (in quad.csv). So I am looking for a solution that will give me something like the following (basically the snp.csv row with the Start, End and Sequence values from quad.csv appended):

chrom   position    ref var gene    var     Start   End     Sequence
1       21421       G   T   WASH7P  snp.LOH 21420   21437   GGGACGGGGAGGGTTGGG

I've searched the posts and found some interesting answers that helped me a lot but I’m still experiencing some issues. I’m still learning Python…

Here is my script so far. I know I have a problem with the range function, but I'm stuck.

import csv

# Asker's attempt: for every SNP row, look for a quad row on the same
# chromosome whose Start/End range contains the SNP position, then print it.
# The code is reproduced unchanged; the NOTE(review) comments mark the
# problems visible in it.
snp_file = open("snp.csv", "r")
quad_file = open("quad.csv", "r")
# NOTE(review): on Python 3 the csv module works with text-mode files
# (e.g. "w" with newline=""); "wb" opens the file in binary mode.
out_file = open("results.csv", "wb")

snp = csv.reader(snp_file, delimiter='\t')
quad = csv.reader(quad_file, delimiter='\t')
# NOTE(review): this wraps the *output* file in a csv.reader; a csv.writer
# is presumably what was intended. `out` is never written to below.
out = csv.reader(out_file, delimiter='\t')



# Materialise the quad rows so they can be scanned once per SNP row
# (a csv.reader can only be iterated a single time).
quadlist = [row for row in quad]

# NOTE(review): neither header row is skipped, so the header lines are
# processed as if they were data.
for snp_row in snp:
    row = 1  # counter is incremented below but never read
    found = False
    for quad_row in quadlist:
        # This is an alias, not a copy: appending to results_row also
        # mutates snp_row.
        results_row = snp_row
        if snp_row[0] == quad_row[0]:
            # NOTE(review): csv.reader yields strings, and range() requires
            # integers, so this line raises TypeError. range() also excludes
            # its end value, so End itself would not count as "in range".
            quad_pos = range(quad_row[1], quad_row[2])
            # NOTE(review): snp_row[1] is a string, so it would never be
            # found in a range of ints even after the line above is fixed.
            if snp_row[1] in quad_pos:
                # append() adds the whole quad row as ONE nested list
                # element; extend() would add its fields individually.
                results_row.append(quad_row)
                found = True
                break
        row = row + 1
    if not found:
        pass
    # NOTE(review): this prints every SNP row, matched or not, and
    # results_row is unbound here if quadlist is empty.
    print (results_row)



# NOTE(review): csv.reader objects have no close() method; the file objects
# (snp_file, quad_file, out_file) are what need closing.
snp.close()
quad.close()
out.close()
1

There is 1 answer

5
Hugh Bothwell On
from bisect import bisect_right
from collections import defaultdict
import csv

# Sentinel that sorts above every real coordinate (largest signed 32-bit int).
TOO_HIGH = 2 ** 31 - 1   # higher than any actual gene position

# Column layout for an SNP record: chrom, position, ref, var, gene, var.
SNP_FMT = " ".join(
    ["{0:<7}", "{1:<11}", "{2:3}", "{3:3}", "{4:11}", "{5:15}"]
).format

# Column layout for the quad part of an output line. Argument 0 (the
# chromosome) is deliberately not referenced, since the SNP columns already
# show it; the leading "" produces the single separating space.
QUAD_FMT = " ".join(["", "{1:<7}", "{2:<7}", "{3}"]).format

def line_to_quad(line):
    """Parse one whitespace-separated quad line.

    Args:
        line: Text holding at least four fields: chrom, Start, End, Sequence.

    Returns:
        A ``(chrom, start, end, sequence)`` tuple with the first three
        fields converted to ``int`` and the sequence left as a string.
    """
    fields = line.split()
    chrom = int(fields[0])
    start = int(fields[1])
    end = int(fields[2])
    sequence = fields[3]
    return chrom, start, end, sequence

def line_to_snp(line):
    """Parse one whitespace-separated SNP line.

    Args:
        line: Text holding at least six fields: chrom, position, ref, var,
            gene and the variant type.

    Returns:
        A six-element tuple in which chrom and position are ``int`` and the
        remaining four fields are left as strings.
    """
    fields = line.split()
    chrom = int(fields[0])
    position = int(fields[1])
    ref, var, gene, var_type = fields[2], fields[3], fields[4], fields[5]
    return chrom, position, ref, var, gene, var_type

class Quads:
    """Per-chromosome index of quad segments that supports point lookups.

    Attributes:
        chromosomes: Maps a chromosome id to its list of
            ``(start, end, sequence)`` tuples, sorted ascending. Treat it as
            read-only after construction, because ``find_match`` relies on a
            helper table built from it in ``__init__``.
    """

    @classmethod
    def from_file(cls, fname):
        """Build an index from a whitespace-separated quad file.

        The first line is treated as a header and skipped, and blank lines
        are ignored.

        Args:
            fname: Path of the quad file to read.

        Returns:
            A new ``Quads`` instance.
        """
        # Plain text mode already uses universal newlines on Python 3. The
        # old "rU" flag is deprecated and raises ValueError on Python 3.11+.
        with open(fname) as inf:
            next(inf, None)   # skip header line
            # Skip blank lines (e.g. a trailing newline at end of file),
            # which would otherwise fail inside line_to_quad.
            quads = (line_to_quad(line) for line in inf if line.strip())
            # The generator is consumed by __init__ while the file is open.
            return cls(quads)

    def __init__(self, rows):
        """Group rows by chromosome and sort each group's segments.

        Args:
            rows: Iterable of ``(chrom, start, end, sequence)`` records.
        """
        self.chromosomes = defaultdict(list)
        for row in rows:
            # tuple() so that list-shaped rows work too; find_match
            # concatenates each stored segment onto a tuple.
            self.chromosomes[row[0]].append(tuple(row[1:]))

        # _max_ends[chrom][i] is the largest end among segments 0..i. It lets
        # find_match stop walking backwards once no earlier segment can
        # reach the queried position.
        self._max_ends = {}
        for chromosome, segs in self.chromosomes.items():
            segs.sort()
            running = []
            highest = None
            for seg in segs:
                if highest is None or seg[1] > highest:
                    highest = seg[1]
                running.append(highest)
            self._max_ends[chromosome] = running

    def find_match(self, chromosome, position):
        """Find a segment on ``chromosome`` that contains ``position``.

        Both segment bounds are inclusive. If several overlapping segments
        contain the position, the one with the greatest start is returned.

        Args:
            chromosome: Chromosome id, as used for the rows given to
                ``__init__``.
            position: Coordinate to look up.

        Returns:
            ``(chromosome, start, end, sequence)`` for the matching segment,
            or ``None`` when no segment contains the position.
        """
        # .get() rather than indexing, so an unknown chromosome does not
        # insert an empty list into the defaultdict.
        segs = self.chromosomes.get(chromosome)
        if not segs:
            return None
        max_ends = self._max_ends[chromosome]

        # (position, inf) sorts after every segment starting at or before
        # position, so index is the last segment with start <= position.
        index = bisect_right(segs, (position, float("inf"))) - 1

        # Walk back through earlier segments: a longer one that started
        # earlier may still cover position. Stop when none can reach it.
        while index >= 0 and max_ends[index] >= position:
            seg = segs[index]
            if seg[0] <= position <= seg[1]:
                return (chromosome,) + seg
            index -= 1
        return None

def main():
    """Print each SNP from snp.csv that falls inside a quad.csv segment.

    Loads the quad index from "quad.csv" and prints a header line. It then
    reads "snp.csv", skipping its header line and any blank lines. For each
    SNP whose position lies inside a quad segment on the same chromosome,
    it prints the SNP columns followed by that segment's Start, End and
    Sequence.
    """
    quads = Quads.from_file("quad.csv")

    header = (
        SNP_FMT("chrom", "position", "ref", "var", "gene", "var") +
        QUAD_FMT("chrom", "Start", "End", "Sequence")
    )
    print(header)

    with open("snp.csv") as inf:
        next(inf, None)   # skip header line
        for line in inf:
            # A blank line (e.g. a trailing newline at end of file) has no
            # fields and would raise IndexError in line_to_snp.
            if not line.strip():
                continue
            snp = line_to_snp(line)
            quad = quads.find_match(snp[0], snp[1])
            if quad:
                print(SNP_FMT(*snp) + QUAD_FMT(*quad))


if __name__ == "__main__":
    main()

which gives

chrom   position    ref var gene        var             Start   End     Sequence
1       21421       G   T   WASH7P      snp.LOH         21420   21437   GGGACGGGGAGGGTTGGG