File: getMeanCov.py

package info (click to toggle)
giira 0.0.20140625-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 760 kB
  • sloc: java: 8,578; python: 258; xml: 44; sh: 33; makefile: 14
file content (108 lines) | stat: -rw-r--r-- 2,944 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
Copyright (c) 2013,
Franziska Zickmann, 
ZickmannF@rki.de, Robert Koch-Institute, Berlin, Germany
Distributed under the GNU Lesser General Public License, version 3.0
"""

import pysam
import matplotlib.pyplot as plt
import numpy as np
from math import *
import scipy.stats as stats
import sys

# calculate coverage without x-coverage bases
def covWOzero(cov,x):
	cov2 = np.array([])
	count = 0
	for i in cov:
		if i!= x:
			count += 1
			cov2.resize(count)
			cov2[count-1]=i
	return cov2
	
# only computes the maxCov without the need to refine the coverage map
def computeMaxCov(cov,maxCov,it):
	if it > 100:
		return cov
	# first remove all maxCov entries
	while (np.max(cov)) == maxCov:
		cov = np.delete(cov,(len(cov)-1))
	
	mean = np.mean(cov)
	median = np.median(cov)	
	if (median * 10.0) < mean :
		maxCov = np.max(cov)
		cov = computeMaxCov(cov,maxCov,(it+1))
		maxCov = np.max(cov)
		
	return cov
	
nameIn = sys.argv[1]       # name and path sam file
nameOut = sys.argv[2]

""" extract a genome coverage profile from a sam file. """
sf = pysam.Samfile(nameIn,'r')
	
cov = np.zeros((sum(sf.lengths),))
start_pos = np.cumsum(sf.lengths)-sf.lengths[0]
read_length = 0
num_reads = 0
for read in sf:
	if not read.is_unmapped:
		r_start = start_pos[read.tid] + read.pos # start position 
		r_end = start_pos[read.tid] + read.pos + read.qlen # end 
		cov[r_start:r_end] += 1 
		num_reads += 1
		read_length += r_end-r_start
				

#print "length including zero: %s" %(len(cov))

# calculate coverage

covWOZRef = covWOzero(cov,0)

#print "length without zero: %s" %(len(covWOZRef))

mean_cov_wozRef = np.mean(covWOZRef)
percentileQuart_cov_wozRef = np.percentile(covWOZRef,25)
median_cov_wozRef = np.median(covWOZRef)
percentileUpQuart_cov_wozRef = np.percentile(covWOZRef,75)

print "average: %s" %(mean_cov_wozRef)
print "median: %s" %(median_cov_wozRef)
print "25-quart: %s" %(percentileQuart_cov_wozRef)
print "75-quart: %s" %(percentileUpQuart_cov_wozRef)

maxCov = -1
if (median_cov_wozRef * 10.0) < mean_cov_wozRef:
	sys.setrecursionlimit(100000)
	covSorted = np.sort(covWOZRef)
	print "finished sorting!"
	maxCov = np.max(covSorted)
	itNum = 1
	while True:
		covSorted = computeMaxCov(covSorted,maxCov,1)
		maxCov = np.max(covSorted)
		mean = np.mean(covSorted)
		median = np.median(covSorted)
		print "iter %s max: %s, median: %s, average: %s" %(itNum,maxCov,median,mean) 
		itNum = itNum + 1
		if (median * 10.0) >= mean:
			break
	
	
print "maximum threshold: %s" %(maxCov)

outfile = open(nameOut,'w')
outfile.write(str(min(percentileQuart_cov_wozRef,(mean_cov_wozRef/5.0))) + "\n");
outfile.write(str(maxCov) + "\n")
outfile.write("25-qua: " + str(percentileQuart_cov_wozRef) + "\n")
outfile.write("median: " + str(median_cov_wozRef) + "\n")
outfile.write("average: " + str(mean_cov_wozRef) + "\n")
outfile.close()