1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
|
#!/usr/bin/env python
# (c) 2012 - Ryan M. Layer
# Hall Laboratory
# Quinlan Laboratory
# Department of Computer Science
# Department of Biochemistry and Molecular Genetics
# Department of Public Health Sciences and Center for Public Health Genomics,
# University of Virginia
# rl6sf@virginia.edu
import sys
from operator import itemgetter
from optparse import OptionParser
# some constants for sam/bam field ids
SAM_FLAG = 1
SAM_REFNAME = 2
SAM_MATE_REFNAME = 6
SAM_ISIZE = 8
parser = OptionParser()
parser.add_option("-r",
"--read_length",
type="int",
dest="read_length",
help="Read length")
parser.add_option("-X",
dest="X",
type="int",
help="Number of stdevs from mean to extend")
parser.add_option("-N",
dest="N",
type="int",
help="Number to sample")
parser.add_option("-o",
dest="output_file",
help="Output file")
parser.add_option("-m",
dest="mads",
type="int",
default=10,
help="Outlier cutoff in # of median absolute deviations (unscaled, upper only)")
def median(L):
if len(L) % 2 == 1:
return L[int(len(L)/2)] # cast to int since divisions always return floats in python3
mid = int(len(L) / 2) - 1
return (L[mid] + L[mid+1]) / 2.0
def unscaled_upper_mad(xs):
"""Return a tuple consisting of the median of xs followed by the
unscaled median absolute deviation of the values in xs that lie
above the median.
"""
xs.sort()
med = median(xs)
umad = median([x - med for x in xs if x > med])
return med, umad
(options, args) = parser.parse_args()
if not options.read_length:
parser.error('Read length not given')
if not options.X:
parser.error('X not given')
if not options.N:
parser.error('N not given')
if not options.output_file:
parser.error('Output file not given')
def mean_std(L):
s = sum(L)
mean = s / float(len(L))
sq_sum = 0.0
for v in L:
sq_sum += (v - mean)**2.0
var = sq_sum / float(len(L))
return mean, var**0.5
required = 97
restricted = 3484
flag_mask = required | restricted
L = []
c = 0
for l in sys.stdin:
if c >= options.N:
break
A = l.rstrip().split('\t')
flag = int(A[SAM_FLAG])
refname = A[SAM_REFNAME]
mate_refname = A[SAM_MATE_REFNAME]
isize = int(A[SAM_ISIZE])
want = mate_refname == "=" and flag & flag_mask == required and isize >= 0
if want:
c += 1
L.append(isize)
# warn if very few elements in distribution
min_elements = 1000
if len(L) < min_elements:
sys.stderr.write("Warning: only %s elements in distribution (min: %s)\n" % (len(L), min_elements))
mean = "NA"
stdev = "NA"
else:
# Remove outliers
med, umad = unscaled_upper_mad(L)
upper_cutoff = med + options.mads * umad
L = [v for v in L if v < upper_cutoff]
new_len = len(L)
removed = c - new_len
sys.stderr.write("Removed %d outliers with isize >= %d\n" %
(removed, upper_cutoff))
c = new_len
mean, stdev = mean_std(L)
start = options.read_length
end = int(mean + options.X*stdev)
H = [0] * (end - start + 1)
s = 0
for x in L:
if (x >= start) and (x <= end):
j = int(x - start)
H[j] = H[ int(x - start) ] + 1
s += 1
f = open(options.output_file, 'w')
for i in range(end - start):
o = str(i) + "\t" + str(float(H[i])/float(s)) + "\n"
f.write(o)
f.close()
print(('mean:' + str(mean) + '\tstdev:' + str(stdev)))
|