1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
|
"""
Copyright (c) 2013,
Franziska Zickmann,
ZickmannF@rki.de, Robert Koch-Institute, Berlin, Germany
Distributed under the GNU Lesser General Public License, version 3.0
"""
import pysam
import matplotlib.pyplot as plt
import numpy as np
from math import *
import scipy.stats as stats
import sys
# calculate coverage without x-coverage bases
def covWOzero(cov,x):
cov2 = np.array([])
count = 0
for i in cov:
if i!= x:
count += 1
cov2.resize(count)
cov2[count-1]=i
return cov2
# only computes the maxCov without the need to refine the coverage map
def computeMaxCov(cov,maxCov,it):
if it > 100:
return cov
# first remove all maxCov entries
while (np.max(cov)) == maxCov:
cov = np.delete(cov,(len(cov)-1))
mean = np.mean(cov)
median = np.median(cov)
if (median * 10.0) < mean :
maxCov = np.max(cov)
cov = computeMaxCov(cov,maxCov,(it+1))
maxCov = np.max(cov)
return cov
nameIn = sys.argv[1] # name and path sam file
nameOut = sys.argv[2]
""" extract a genome coverage profile from a sam file. """
sf = pysam.Samfile(nameIn,'r')
cov = np.zeros((sum(sf.lengths),))
start_pos = np.cumsum(sf.lengths)-sf.lengths[0]
read_length = 0
num_reads = 0
for read in sf:
if not read.is_unmapped:
r_start = start_pos[read.tid] + read.pos # start position
r_end = start_pos[read.tid] + read.pos + read.qlen # end
cov[r_start:r_end] += 1
num_reads += 1
read_length += r_end-r_start
#print "length including zero: %s" %(len(cov))
# calculate coverage
covWOZRef = covWOzero(cov,0)
#print "length without zero: %s" %(len(covWOZRef))
mean_cov_wozRef = np.mean(covWOZRef)
percentileQuart_cov_wozRef = np.percentile(covWOZRef,25)
median_cov_wozRef = np.median(covWOZRef)
percentileUpQuart_cov_wozRef = np.percentile(covWOZRef,75)
print "average: %s" %(mean_cov_wozRef)
print "median: %s" %(median_cov_wozRef)
print "25-quart: %s" %(percentileQuart_cov_wozRef)
print "75-quart: %s" %(percentileUpQuart_cov_wozRef)
maxCov = -1
if (median_cov_wozRef * 10.0) < mean_cov_wozRef:
sys.setrecursionlimit(100000)
covSorted = np.sort(covWOZRef)
print "finished sorting!"
maxCov = np.max(covSorted)
itNum = 1
while True:
covSorted = computeMaxCov(covSorted,maxCov,1)
maxCov = np.max(covSorted)
mean = np.mean(covSorted)
median = np.median(covSorted)
print "iter %s max: %s, median: %s, average: %s" %(itNum,maxCov,median,mean)
itNum = itNum + 1
if (median * 10.0) >= mean:
break
print "maximum threshold: %s" %(maxCov)
outfile = open(nameOut,'w')
outfile.write(str(min(percentileQuart_cov_wozRef,(mean_cov_wozRef/5.0))) + "\n");
outfile.write(str(maxCov) + "\n")
outfile.write("25-qua: " + str(percentileQuart_cov_wozRef) + "\n")
outfile.write("median: " + str(median_cov_wozRef) + "\n")
outfile.write("average: " + str(mean_cov_wozRef) + "\n")
outfile.close()
|