File: BPSpanner.py

package info (click to toggle)
pbsuite 15.8.24%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 14,512 kB
  • ctags: 1,951
  • sloc: python: 10,962; sh: 147; xml: 21; makefile: 14
file content (131 lines) | stat: -rwxr-xr-x 4,291 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
import sys, math, json
import pysam
from pbsuite.honey.Force import *

fh = open(sys.argv[1])
fh.readline(); fh.readline() #Header

bam = pysam.Samfile(sys.argv[2])

#400bp around bp must be spanned
def bpCheck(bam, chrom, point, BUFFER=200):
    nSpan = 0
    nCount = 0
    spans = {}
    tot = {}
    for read in bam.fetch(reference=chrom, start = max(0, point-BUFFER), end=point+BUFFER):
        #Get the shawties outta chere
        if read.flag & 0x40 or read.flag & 0x80:
            continue
        if read.pos >= point-BUFFER and read.aend <= point:
            #print "ever?"
            continue
        if read.pos >= point and read.aend <= point-BUFFER:
            #print "ever?"
            continue
        
        if read.qname in tot:
            continue
        
        if read.pos <= point-BUFFER and point+BUFFER <= read.aend:
            spans[read.qname] = 1
        tot[read.qname] = 1
        #elif read.pos < point and point < read.aend:
        #nCount += 1
    return len(spans), len(tot)

def log_choose(n, k):
    r = 0.0
    # swap for efficiency if k is more than half of n
    if k * 2 > n:
        k = n - k

    for  d in xrange(1,k+1):
        r += math.log(n, 10)
        r -= math.log(d, 10)
        n -= 1

    return r

# return the genotype and log10 p-value
def bayes_gt(ref, alt):
    # prob seeing an alt read with true genotype of of hom_ref, het, hom_alt respectively
    p_alt = [0.2, 0.4, 0.8]

    total = ref + alt
    
    lp_homref = log_choose(total, alt) + alt * math.log(p_alt[0], 10) + ref * math.log(1 - p_alt[0], 10)
    lp_het = log_choose(total, alt) + alt * math.log(p_alt[1], 10) + ref * math.log(1 - p_alt[1], 10)
    lp_homalt = log_choose(total, alt) + alt * math.log(p_alt[2], 10) + ref * math.log(1 - p_alt[2], 10)

    return (lp_homref, lp_het, lp_homalt)

gtlookup = {0:'0/0', 1:'0/1', 2:'1/1'}
print "id\tnReads\tnZMWs\tavgCov\tpctVar\tSpan1\tCoverage1\tPct1\tSpan2\tCoverage2\tPct2\tgt\tgt1\tgt2"
for line in fh.readlines():
    data = line.strip().split("\t")
    chr1 = data[2]
    bp1 = int(data[3])
    chr2 = data[5]
    bp2 = int(data[6])

    nReads = int(data[10])
    nZMWs = int(data[11])
    
    nSpan1, nCount1 = bpCheck(bam, chr1, bp1)
    nSpan2, nCount2 = bpCheck(bam, chr2, bp2)
    if nCount1 == 0 or nCount2 == 0:
        continue
    nRef = nSpan1 + nSpan2
    nAlt = (nCount1 + nCount2) - nRef
    
    p_bp1 = bayes_gt(nSpan1, nCount1 - nSpan1)
    p_bp2 = bayes_gt(nSpan2, nCount2 - nSpan2)
    p_bp = bayes_gt(nRef, nAlt)
    
    
    gt1 = gtlookup[p_bp1.index(max(p_bp1))]
    gt2 = gtlookup[p_bp2.index(max(p_bp2))]
    gt = gtlookup[p_bp.index(max(p_bp))]
    
    avgCov = (nCount1 + nCount2) / 2

    var = (float(nReads)/avgCov)
    p1 = (float(nSpan1)/nCount1)
    p2 = (float(nSpan2)/nCount2)

    threshold = 0.15
    #if the percent spans are really low, it's hom variant
    if p1 <= threshold and p2 <= threshold:
        #print 'HOM', data[keygenoType], data[keyisvalid], line,
        genoType = 'HOM'
    #if the percent spans is really close to 50%
    elif 0.5-threshold <= p1 <= 0.5+threshold and 0.5-threshold <= p2 <= 0.5+threshold:
        genoType = "HET"
    #if the fraction of variant reads is close to threshold
    elif abs(var - 0.5) < threshold:
        genoType = "HET*"
    #if the fraction of variant reads is closer to 0
    elif abs(var - 1) < threshold:
        genoType = "HOM*"
    else:#Complex genotype.. I don't know what's happening
        #print 'COX', data[keygenoType], data[keyisvalid], line,
        genoType = "COX"
    
    output = {}
    print "\t".join([str(x) for x in [data[0], \
                                      nReads, \
                                      nZMWs, \
                                      avgCov, \
                                      "%.3f" % var, \
                                      nSpan1, \
                                      nCount1, \
                                      "%.3f" % p1, \
                                      nSpan2, \
                                      nCount2, \
                                      "%.3f" % p2, \
                                      genoType, \
                                      gt, gt1, gt2]])
        
    sys.stdout.flush()