import numpy as np

# own tools
from deeptools import bamHandler
from deeptools import mapReduce

old_settings = np.seterr(all='ignore')


def getFragmentLength_wrapper(args):
    # mapReduce passes each task as a single argument tuple,
    # so unpack it here before calling the worker
    return getFragmentLength_worker(*args)


def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins):
    """
    Queries the reads in the given region for the fragment length
    (template length) and the read length.

    Parameters
    ----------
    chrom : str
        chromosome name
    start : int
        region start
    end : int
        region end
    bamFile : str
        BAM file name
    distanceBetweenBins : int
        the number of bases at the end of each bin to ignore

    Returns
    -------
    np.array
        a 2-column np.array in which the first column is the fragment
        length and the second is the read length
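
    Examples
    --------
    A minimal sketch; the file name and chromosome are hypothetical and
    assume an indexed, paired-end BAM file::

        lengths = getFragmentLength_worker('chr1', 0, 50000,
                                           'example.bam', 0)
        # lengths.shape == (n_reads, 2)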
"""
bam = bamHandler.openBam(bamFile)
end = max(start + 1, end - distanceBetweenBins)
if chrom in bam.references:
reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
for r in bam.fetch(chrom, start, end)
if r.is_proper_pair and r.is_read1 and not r.is_unmapped])
if not len(reads):
# if the previous operation produces an empty list
# it could be that the data is not paired, then
# we try with out filtering
reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
for r in bam.fetch(chrom, start, end) if not r.is_unmapped])
else:
raise NameError("chromosome {} not found in bam file".format(chrom))
if not len(reads):
reads = np.array([]).reshape(0, 2)
return reads


def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length by sampling reads
    from the BAM file.

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
        whether to also return the raw length arrays under the
        ``lengths`` key of each dictionary
    blackListFileName : str
        name of a BED file with regions to exclude
    binSize : int
        length of the sampled bins
    distanceBetweenBins : int
        distance between sampled bins
    numberOfProcessors : int
    verbose : bool

    Returns
    -------
    tuple
        a tuple of two dictionaries, one for the fragment length and the
        other for the read length. Each dictionary summarises the sampled
        values (min, mean, median, quantiles etc.); both are ``None`` when
        no reads could be sampled.
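
    Examples
    --------
    A minimal sketch; ``example.bam`` is a hypothetical indexed BAM file::

        frag_len_dict, read_len_dict = get_read_and_fragment_length(
            'example.bam', numberOfProcessors=4)
        if frag_len_dict is not None:
            print(frag_len_dict['median'], read_len_dict['median'])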
"""
    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    # start with twice the requested distance; the sampling loop below
    # halves it until enough reads have been collected
    distanceBetweenBins *= 2
    fl = []

    # Fix issue #522, allow distanceBetweenBins == 0
    if distanceBetweenBins == 0:
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=binSize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)
        fl = np.concatenate(imap_res)

    # Try to ensure that at least 1000 reads are sampled for the
    # statistics, halving the distance between bins as needed
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=stepsize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)
        fl = np.concatenate(imap_res)
    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length),
                                 'mad': np.median(np.abs(fragment_length - np.median(fragment_length))),
                                 'qtile10': np.percentile(fragment_length, 10),
                                 'qtile20': np.percentile(fragment_length, 20),
                                 'qtile30': np.percentile(fragment_length, 30),
                                 'qtile40': np.percentile(fragment_length, 40),
                                 'qtile60': np.percentile(fragment_length, 60),
                                 'qtile70': np.percentile(fragment_length, 70),
                                 'qtile80': np.percentile(fragment_length, 80),
                                 'qtile90': np.percentile(fragment_length, 90),
                                 'qtile99': np.percentile(fragment_length, 99)}
        else:
            # a mean fragment (template) length of zero typically means
            # the reads are single-end
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length),
                         'mad': np.median(np.abs(read_length - np.median(read_length))),
                         'qtile10': np.percentile(read_length, 10),
                         'qtile20': np.percentile(read_length, 20),
                         'qtile30': np.percentile(read_length, 30),
                         'qtile40': np.percentile(read_length, 40),
                         'qtile60': np.percentile(read_length, 60),
                         'qtile70': np.percentile(read_length, 70),
                         'qtile80': np.percentile(read_length, 80),
                         'qtile90': np.percentile(read_length, 90),
                         'qtile99': np.percentile(read_length, 99)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
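

if __name__ == "__main__":
    # A minimal usage sketch, not part of the original module: it assumes a
    # coordinate-sorted, indexed BAM file is passed as the first
    # command-line argument.
    import sys

    frag_len_dict, read_len_dict = get_read_and_fragment_length(
        sys.argv[1], numberOfProcessors=1)
    if frag_len_dict is not None:
        print("median fragment length: {}".format(frag_len_dict['median']))
    if read_len_dict is not None:
        print("median read length: {}".format(read_len_dict['median']))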