File: count_orientation.py

package info (click to toggle)
nxtrim 0.4.3%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 360 kB
  • sloc: cpp: 1,593; python: 142; sh: 97; makefile: 43
file content (56 lines) | stat: -rwxr-xr-x 1,961 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/python3

import sys,pysam,itertools,time,collections


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='count orientation of read pairs (for unsorted bam)')
    parser.add_argument('bam', metavar='bam', type=str, help='bam')
    args = parser.parse_args()
    sam = pysam.Samfile(args.bam, "rb" )
    out_fr = pysam.Samfile(args.bam[:-3]+"fr.bam", "wb", header = sam.header)
    out_rf = pysam.Samfile(args.bam[:-3]+"rf.bam", "wb", header = sam.header)
    L = 151
    d = collections.defaultdict(int)
    thr=20000 #how much to clip off contig ends?
    
    for read in sam:
        not_on_end = read.pos>thr and read.pos < (sam.lengths[read.tid] - thr) and read.pnext>thr and read.pnext< (sam.lengths[read.tid] - thr) 
        not_too_small = (max(read.pos+L,read.pnext+L) - min(read.pos,read.pnext))> (L)
        not_identical = read.pos!=read.pnext

        if not read.is_unmapped and not read.mate_is_unmapped and not_identical:            
            if not_on_end and not_too_small:
                if read.pos < read.pnext:
                    if (read.flag & 0x10):
                        k1 = 'R'
                    else: 
                        k1 = 'F'

                    if (read.flag & 0x20):
                        k2='R'
                    else:  
                        k2='F'
                    o=k1+k2
                    d[k1+k2] += 1
                else:
                    if (read.flag & 0x10):
                        k1 = 'R'
                    else: 
                        k1 = 'F'

                    if (read.flag & 0x20):
                        k2='R'
                    else:  
                        k2='F'
                    o=k2+k1

                if o=='RF':
                    out_rf.write((read))
                if o=='FR':
                    out_fr.write((read))
                

    for k in ['FR','RF','FF','RR']:
        print(k,d[k])