import shutil
import os
import time
import sys
import multiprocessing
import numpy as np
# deepTools packages
import deeptools.utilities
from deeptools import bamHandler
from deeptools import mapReduce
from deeptoolsintervals import GTF
import pyBigWig
debug = 0
old_settings = np.seterr(all='ignore')
def countReadsInRegions_wrapper(args):
    """Adapter between the multiprocessing machinery and the counter object.

    mapReduce hands each worker a single picklable tuple whose first element
    is the CountReadsPerBin instance ('self') and whose remaining elements
    are the positional arguments of count_reads_in_region.
    """
    self_, *worker_args = args
    return CountReadsPerBin.count_reads_in_region(self_, *worker_args)
class CountReadsPerBin(object):
r"""Collects coverage over multiple bam files using multiprocessing
This function collects read counts (coverage) from several bam files and returns
an numpy array with the results. This class uses multiprocessing to compute the coverage.
Parameters
----------
bamFilesList : list
List containing the names of indexed bam files. E.g. ['file1.bam', 'file2.bam']
binLength : int
Length of the window/bin. This value is overruled by ``bedFile`` if present.
numberOfSamples : int
Total number of samples. The genome is divided into ``numberOfSamples``, each
with a window/bin length equal to ``binLength``. This value is overruled
by ``stepSize`` in case such value is present and by ``bedFile`` in which
case the number of samples and bins are defined in the bed file
numberOfProcessors : int
Number of processors to use. Default is 4
verbose : bool
Output messages. Default: False
region : str
Region to limit the computation in the form chrom:start:end.
bedFile : list of file_handles.
Each file handle corresponds to a bed file containing the regions for which to compute the coverage. This option
overrules ``binLength``, ``numberOfSamples`` and ``stepSize``.
blackListFileName : str
A string containing a BED file with blacklist regions.
extendReads : bool, int
Whether coverage should be computed for the extended read length (i.e. the region covered
by the two mates or the regions expected to be covered by single-reads).
If the value is 'int', then then this is interpreted as the fragment length to extend reads
that are not paired. For Illumina reads, usual values are around 300.
This value can be determined using the peak caller MACS2 or can be
approximated by the fragment lengths computed when preparing the library for sequencing. If the value
is of the variable is true and not value is given, the fragment size is sampled from the library but
only if the library is paired-end. Default: False
minMappingQuality : int
Reads of a mapping quality less than the give value are not considered. Default: None
ignoreDuplicates : bool
Whether read duplicates (same start, end position. If paired-end, same start-end for mates) are
to be excluded. Default: false
chrToSkip: list
List with names of chromosomes that do not want to be included in the coverage computation.
This is useful to remove unwanted chromosomes (e.g. 'random' or 'Het').
stepSize : int
the positions for which the coverage is computed are defined as follows:
``range(start, end, stepSize)``. Thus, a stepSize of 1, will compute
the coverage at each base pair. If the stepSize is equal to the
binLength then the coverage is computed for consecutive bins. If seepSize is
smaller than the binLength, then teh bins will overlap.
center_read : bool
Determines if reads should be centered with respect to the fragment length.
samFlag_include : int
Extracts only those reads having the SAM flag. For example, to get only
reads that are the first mates a samFlag of 64 could be used. Similarly, the
samFlag_include can be used to select only reads mapping on the reverse strand
or to get only properly paired reads.
samFlag_exclude : int
Removes reads that match the SAM flag. For example to get all reads
that map to the forward strand a samFlag_exlude 16 should be used. Which
translates into exclude all reads that map to the reverse strand.
zerosToNans : bool
If true, zero values encountered are transformed to Nans. Default false.
skipZeroOverZero : bool
If true, skip bins where all input BAM files have no coverage (only applicable to bamCompare).
minFragmentLength : int
If greater than 0, fragments below this size are excluded.
maxFragmentLength : int
If greater than 0, fragments above this size are excluded.
out_file_for_raw_data : str
File name to save the raw counts computed
statsList : list
For each BAM file in bamFilesList, the associated per-chromosome statistics returned by openBam
mappedList : list
For each BAM file in bamFilesList, the number of mapped reads in the file.
bed_and_bin : boolean
If true AND a bedFile is given, compute coverage of each bin of the given size in each region of bedFile
genomeChunkSize : int
If not None, the length of the genome used for multiprocessing.
Returns
-------
numpy array
Each row correspond to each bin/bed region and each column correspond to each of
the bamFiles.
Examples
--------
The test data contains reads for 200 bp.
>>> test = Tester()
The transpose function is used to get a nicer looking output.
The first line corresponds to the number of reads per bin in bam file 1
>>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 50, 4)
>>> np.transpose(c.run())
array([[0., 0., 1., 1.],
[0., 1., 1., 2.]])
"""
def __init__(self, bamFilesList, binLength=50, numberOfSamples=None, numberOfProcessors=1,
verbose=False, region=None,
bedFile=None, extendReads=False,
genomeChunkSize=None,
blackListFileName=None,
minMappingQuality=None,
ignoreDuplicates=False,
chrsToSkip=[],
stepSize=None,
center_read=False,
samFlag_include=None,
samFlag_exclude=None,
zerosToNans=False,
skipZeroOverZero=False,
smoothLength=0,
minFragmentLength=0,
maxFragmentLength=0,
out_file_for_raw_data=None,
bed_and_bin=False,
statsList=[],
mappedList=[]):
self.bamFilesList = bamFilesList
self.binLength = binLength
self.numberOfSamples = numberOfSamples
self.blackListFileName = blackListFileName
self.statsList = statsList
self.mappedList = mappedList
self.skipZeroOverZero = skipZeroOverZero
self.bed_and_bin = bed_and_bin
self.genomeChunkSize = genomeChunkSize
if extendReads and len(bamFilesList):
from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
frag_len_dict, read_len_dict = get_read_and_fragment_length(bamFilesList[0],
return_lengths=False,
blackListFileName=blackListFileName,
numberOfProcessors=numberOfProcessors,
verbose=verbose)
if extendReads is True:
# try to guess fragment length if the bam file contains paired end reads
if frag_len_dict:
self.defaultFragmentLength = int(frag_len_dict['median'])
else:
exit("*ERROR*: library is not paired-end. Please provide an extension length.")
if verbose:
print(("Fragment length based on paired en data "
"estimated to be {}".format(frag_len_dict['median'])))
elif extendReads < read_len_dict['median']:
sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
"Reads will not be extended.\n".format(int(read_len_dict['median'])))
self.defaultFragmentLength = 'read length'
elif extendReads > 2000:
exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(extendReads))
else:
self.defaultFragmentLength = int(extendReads)
else:
self.defaultFragmentLength = 'read length'
self.numberOfProcessors = numberOfProcessors
self.verbose = verbose
self.region = region
self.bedFile = bedFile
self.minMappingQuality = minMappingQuality
self.ignoreDuplicates = ignoreDuplicates
self.chrsToSkip = chrsToSkip
self.stepSize = stepSize
self.center_read = center_read
self.samFlag_include = samFlag_include
self.samFlag_exclude = samFlag_exclude
self.minFragmentLength = minFragmentLength
self.maxFragmentLength = maxFragmentLength
self.zerosToNans = zerosToNans
self.smoothLength = smoothLength
if out_file_for_raw_data:
self.save_data = True
self.out_file_for_raw_data = out_file_for_raw_data
else:
self.save_data = False
self.out_file_for_raw_data = None
# check that wither numberOfSamples or stepSize are set
if numberOfSamples is None and stepSize is None and bedFile is None:
raise ValueError("either stepSize, numberOfSamples or bedFile have to be set")
if self.defaultFragmentLength != 'read length':
self.maxPairedFragmentLength = 4 * self.defaultFragmentLength
else:
self.maxPairedFragmentLength = 1000
if self.maxFragmentLength > 0:
self.maxPairedFragmentLength = self.maxFragmentLength
if len(self.mappedList) == 0:
try:
for fname in self.bamFilesList:
bam, mapped, unmapped, stats = bamHandler.openBam(fname, returnStats=True, nThreads=self.numberOfProcessors)
self.mappedList.append(mapped)
self.statsList.append(stats)
bam.close()
except:
self.mappedList = []
self.statsList = []
def get_chunk_length(self, bamFilesHandles, genomeSize, chromSizes, chrLengths):
# Try to determine an optimal fraction of the genome (chunkSize) that is sent to
# workers for analysis. If too short, too much time is spent loading the files
# if too long, some processors end up free.
# the following values are empirical
if self.stepSize is None:
if self.region is None:
self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
else:
# compute the step size, based on the number of samples
# and the length of the region studied
(chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)
# number of samples is better if large
if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
min_num_of_samples = int(genomeSize / np.mean(chrLengths))
raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))
max_mapped = 0
if len(self.mappedList) > 0:
max_mapped = max(self.mappedList)
# If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of binLength and use every bin
if max_mapped == 0:
chunkSize = 10000 * self.binLength
self.stepSize = self.binLength
else:
reads_per_bp = float(max_mapped) / genomeSize
chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandles)))
# Ensure that chunkSize is always at least self.stepSize
if chunkSize < self.stepSize:
chunkSize = self.stepSize
# Ensure that chunkSize is always at least self.binLength
if self.binLength and chunkSize < self.binLength:
chunkSize = self.binLength
return chunkSize
def run(self, allArgs=None):
    """Open all input files, pick a chunk size, and dispatch the counting.

    The actual counting is delegated to countReadsInRegions_wrapper via
    mapReduce, one task per genome chunk (or per BED region when a bedFile
    was given). Optionally concatenates the workers' temporary bedgraph
    files into ``self.out_file_for_raw_data``.

    Parameters
    ----------
    allArgs : argparse namespace or None
        Only used to extract GTF-related options through
        deeptools.utilities.gtfOptions.

    Returns
    -------
    numpy array
        One row per bin/region, one column per input file. Exits the
        process with an error message if no coverage could be computed.
    """
    bamFilesHandles = []
    for x in self.bamFilesList:
        try:
            # try BAM first; if that fails, assume the file is a bigWig
            y = bamHandler.openBam(x)
        except SystemExit:
            sys.exit(sys.exc_info()[1])
        except:
            y = pyBigWig.open(x)
        bamFilesHandles.append(y)

    chromsizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandles, verbose=self.verbose)

    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if len(self.chrsToSkip):
        chromsizes = [x for x in chromsizes if x[0] not in self.chrsToSkip]

    chrNames, chrLengths = list(zip(*chromsizes))
    genomeSize = sum(chrLengths)

    chunkSize = None
    if self.bedFile is None:
        # chunking is only relevant for genome-wide sampling
        if self.genomeChunkSize is None:
            chunkSize = self.get_chunk_length(bamFilesHandles, genomeSize, chromsizes, chrLengths)
        else:
            chunkSize = self.genomeChunkSize

    # handles were only needed for chromosome names / chunk sizing;
    # each worker re-opens the files itself
    [bam_h.close() for bam_h in bamFilesHandles]

    if self.verbose:
        print("step size is {}".format(self.stepSize))

    if self.region:
        # in case a region is used, append the tilesize
        self.region += ":{}".format(self.binLength)

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce([],
                                   countReadsInRegions_wrapper,
                                   chromsizes,
                                   self_=self,
                                   genomeChunkLength=chunkSize,
                                   bedFile=self.bedFile,
                                   blackListFileName=self.blackListFileName,
                                   region=self.region,
                                   numberOfProcessors=self.numberOfProcessors,
                                   transcriptID=transcriptID,
                                   exonID=exonID,
                                   keepExons=keepExons,
                                   transcript_id_designator=transcript_id_designator)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate intermediary bedgraph files
        ofile = open(self.out_file_for_raw_data, "w")
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                _foo = open(tempFileName, 'r')
                shutil.copyfileobj(_foo, ofile)
                _foo.close()
                os.remove(tempFileName)
        ofile.close()

    try:
        # each worker returns (counts, temp_file_name); stack the counts
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin
    except ValueError:
        if self.bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                     'contain mapped reads.')
def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
    """Counts the reads in each bam file at each 'stepSize' position
    within the interval (start, end) for a window or bin of size binLength.

    The stepSize controls the distance between bins. For example,
    a step size of 20 and a bin size of 20 will create bins next to
    each other. If the step size is smaller than the bin size the
    bins will overlap.

    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.

    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    bed_regions_list: list
        List of list of tuples of the form (start, end)
        corresponding to bed regions to be processed.
        If not bed file was passed to the object constructor
        then this list is empty.

    Returns
    -------
    tuple of (numpy array, str)
        The array has one row per bin and one column per bam file; the
        string is the name of the temporary raw-data file ('' when
        self.save_data is False).

    Examples
    --------
    Initialize some useful values

    >>> test = Tester()
    >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

    The transpose is used to get better looking numbers. The first line
    corresponds to the number of reads per bin in the first bamfile.

    >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
    >>> _array
    array([[0., 0.],
           [0., 1.],
           [1., 1.],
           [1., 2.]])
    """
    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    if self.stepSize is None and bed_regions_list is None:
        raise ValueError("stepSize is not set!")
    # array to keep the read counts for the regions
    subnum_reads_per_bin = []

    start_time = time.time()

    bam_handles = []
    for fname in self.bamFilesList:
        try:
            # BAM first, bigWig as fallback (mirrors run())
            bam_handles.append(bamHandler.openBam(fname))
        except SystemExit:
            sys.exit(sys.exc_info()[1])
        except:
            bam_handles.append(pyBigWig.open(fname))

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    # A list of lists of tuples: each inner list is one "transcript" whose
    # tuples are (start, end) or (start, end, binLength) intervals
    transcriptsToConsider = []
    if bed_regions_list is not None:
        if self.bed_and_bin:
            # tile each BED region into bins of self.binLength
            transcriptsToConsider.append([(x[1][0][0], x[1][0][1], self.binLength) for x in bed_regions_list])
        else:
            transcriptsToConsider = [x[1] for x in bed_regions_list]
    else:
        if self.stepSize == self.binLength:
            # consecutive bins: one tiled interval covers the whole range
            transcriptsToConsider.append([(start, end, self.binLength)])
        else:
            for i in range(start, end, self.stepSize):
                if i + self.binLength > end:
                    # drop the trailing partial bin
                    break
                if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength):
                    continue
                transcriptsToConsider.append([(i, i + self.binLength)])

    if self.save_data:
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file_name = ''

    for bam in bam_handles:
        for trans in transcriptsToConsider:
            tcov = self.get_coverage_of_region(bam, chrom, trans)
            if bed_regions_list is not None and not self.bed_and_bin:
                # one value per BED region: total coverage over the region
                subnum_reads_per_bin.append(np.sum(tcov))
            else:
                subnum_reads_per_bin.extend(tcov)

    # Fortran order ('F') because values were appended file-by-file,
    # i.e. column-by-column
    subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

    if self.save_data:
        idx = 0
        for i, trans in enumerate(transcriptsToConsider):
            if len(trans[0]) != 3:
                # (start, end) intervals: one BED line per transcript with
                # comma-joined exon starts/ends
                starts = ",".join([str(x[0]) for x in trans])
                ends = ",".join([str(x[1]) for x in trans])
                _file.write("\t".join([chrom, starts, ends]) + "\t")
                _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n")
            else:
                # tiled (start, end, binLength) intervals: one BED line per bin
                for exon in trans:
                    for startPos in range(exon[0], exon[1], exon[2]):
                        if idx >= subnum_reads_per_bin.shape[0]:
                            # At the end of chromosomes (or due to blacklisted regions), there are bins smaller than the bin size
                            # Counts there are added to the bin before them, but range() will still try to include them.
                            break
                        _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, min(startPos + exon[2], exon[1])))
                        _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n")
                        idx += 1
        _file.close()

    if self.verbose:
        endTime = time.time()
        rows = subnum_reads_per_bin.shape[0]
        print("%s countReadsInRegions_worker: processing %d "
              "(%.1f per sec) @ %s:%s-%s" %
              (multiprocessing.current_process().name,
               rows, rows / (endTime - start_time), chrom, start, end))

    return subnum_reads_per_bin, _file_name
def get_coverage_of_region(self, bamHandle, chrom, regions,
                           fragmentFromRead_func=None):
    """
    Returns a numpy array that corresponds to the number of reads
    that overlap with each tile.

    >>> test = Tester()
    >>> import pysam
    >>> c = CountReadsPerBin([], stepSize=1, extendReads=300)

    For this case the reads are length 36. The number of overlapping
    read fragments is 4 and 5 for the positions tested.

    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
    ... [(5000833, 5000834), (5000834, 5000835)])
    array([4., 5.])

    In the following example a paired read is extended to the fragment length which is 100
    The first mate starts at 5000000 and the second at 5000064. Each mate is
    extended to the fragment length *independently*
    At position 500090-500100 one fragment of length 100 overlap, and after position 5000101
    there should be zero reads.

    >>> c.zerosToNans = True
    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
    ... [(5000090, 5000100), (5000100, 5000110)])
    array([ 1., nan])

    In the following case the reads length is 50. Reads are not extended.

    >>> c.extendReads=False
    >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
    array([1., 2., 2.])
    """
    if not fragmentFromRead_func:
        fragmentFromRead_func = self.get_fragment_from_read
    # total number of output bins: one per region, except tiled
    # (start, end, tileSize) regions which contribute one bin per tile
    nbins = len(regions)
    if len(regions[0]) == 3:
        nbins = 0
        for reg in regions:
            nbins += (reg[1] - reg[0]) // reg[2]
            if (reg[1] - reg[0]) % reg[2] > 0:
                # partial trailing tile still gets a bin
                nbins += 1
    coverages = np.zeros(nbins, dtype='float64')

    # when reads are extended, fetch a margin of maxPairedFragmentLength
    # around each region so fragments starting outside are still counted
    if self.defaultFragmentLength == 'read length':
        extension = 0
    else:
        extension = self.maxPairedFragmentLength

    blackList = None
    if self.blackListFileName is not None:
        blackList = GTF(self.blackListFileName)

    vector_start = 0  # index of the current region's first bin in `coverages`
    for idx, reg in enumerate(regions):
        if len(reg) == 3:
            tileSize = int(reg[2])
            nRegBins = (reg[1] - reg[0]) // tileSize
            if (reg[1] - reg[0]) % tileSize > 0:
                # Don't eliminate small bins! Issue 887
                nRegBins += 1
        else:
            nRegBins = 1
            tileSize = int(reg[1] - reg[0])

        # Blacklisted regions have a coverage of 0
        if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
            continue
        regStart = int(max(0, reg[0] - extension))
        regEnd = reg[1] + int(extension)

        # If alignments are extended and there's a blacklist, ensure that no
        # reads originating in a blacklist are fetched
        if blackList and reg[0] > 0 and extension > 0:
            o = blackList.findOverlaps(chrom, regStart, reg[0])
            if o is not None and len(o) > 0:
                regStart = o[-1][1]
            o = blackList.findOverlaps(chrom, reg[1], regEnd)
            if o is not None and len(o) > 0:
                regEnd = o[0][0]

        start_time = time.time()
        # caching seems faster. TODO: profile the function
        c = 0  # number of reads that passed all filters for this region
        if chrom not in bamHandle.references:
            raise NameError("chromosome {} not found in bam file".format(chrom))

        # duplicate detection state: prev_pos holds the fragment signatures
        # seen at the reference_start of the previous processed read pair
        prev_pos = set()
        lpos = None

        for read in bamHandle.fetch(chrom, regStart, regEnd):
            if read.is_unmapped:
                continue
            if self.minMappingQuality and read.mapq < self.minMappingQuality:
                continue

            # filter reads based on SAM flag
            if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                continue
            if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                continue

            # Fragment lengths
            tLen = deeptools.utilities.getTLen(read)
            if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                continue
            if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                continue

            # get rid of duplicate reads that have same position on each of the
            # pairs
            if self.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    # mates on different chromosomes: fall back to mate start
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    continue
                if lpos != read.reference_start:
                    # moved past the previous start: old signatures can't recur
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

            # since reads can be split (e.g. RNA-seq reads) each part of the
            # read that maps is called a position block.
            try:
                position_blocks = fragmentFromRead_func(read)
            except TypeError:
                # the get_fragment_from_read functions returns None in some cases.
                # Those cases are to be skipped, hence the continue line.
                continue

            # last_eIdx prevents double-counting a bin when consecutive
            # blocks of the same read touch the same tile
            last_eIdx = None
            for fragmentStart, fragmentEnd in position_blocks:
                if fragmentEnd is None or fragmentStart is None:
                    continue
                fragmentLength = fragmentEnd - fragmentStart
                if fragmentLength == 0:
                    continue
                # skip reads that are not in the region being
                # evaluated.
                if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                    continue
                # clip the fragment to the region / coverage vector bounds
                if fragmentStart < reg[0]:
                    fragmentStart = reg[0]
                if fragmentEnd > reg[0] + len(coverages) * tileSize:
                    fragmentEnd = reg[0] + len(coverages) * tileSize

                sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                if last_eIdx is not None:
                    sIdx = max(last_eIdx, sIdx)

                if sIdx >= eIdx:
                    continue
                sIdx = int(sIdx)
                eIdx = int(eIdx)
                coverages[sIdx:eIdx] += 1
                last_eIdx = eIdx

            c += 1

        if self.verbose:
            endTime = time.time()
            print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

        vector_start += nRegBins

    # change zeros to NAN
    if self.zerosToNans:
        coverages[coverages == 0] = np.nan

    return coverages
def getReadLength(self, read):
return len(read)
@staticmethod
def is_proper_pair(read, maxPairedFragmentLength):
"""
Checks if a read is proper pair meaning that both mates are facing each other and are in
the same chromosome and are not to far away. The sam flag for proper pair can not
always be trusted. Note that if the fragment size is > maxPairedFragmentLength (~2kb
usually) that False will be returned.
:return: bool
>>> import pysam
>>> import os
>>> from deeptools.countReadsPerBin import CountReadsPerBin as cr
>>> root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
>>> bam = pysam.AlignmentFile("{}/test_proper_pair_filtering.bam".format(root))
>>> iter = bam.fetch()
>>> read = next(iter)
>>> cr.is_proper_pair(read, 1000) # "keep" read
True
>>> cr.is_proper_pair(read, 200) # "keep" read, but maxPairedFragmentLength is too short
False
>>> read = next(iter)
>>> cr.is_proper_pair(read, 1000) # "improper pair"
False
>>> read = next(iter)
>>> cr.is_proper_pair(read, 1000) # "mismatch chr"
False
>>> read = next(iter)
>>> cr.is_proper_pair(read, 1000) # "same orientation1"
False
>>> read = next(iter)
>>> cr.is_proper_pair(read, 1000) # "same orientation2"
False
>>> read = next(iter)
>>> cr.is_proper_pair(read, 1000) # "rev first"
False
>>> read = next(iter)
>>> cr.is_proper_pair(read, 1000) # "rev first OK"
True
>>> read = next(iter)
>>> cr.is_proper_pair(read, 1000) # "for first"
False
>>> read = next(iter)
>>> cr.is_proper_pair(read, 1000) # "for first"
True
"""
if not read.is_proper_pair:
return False
if read.reference_id != read.next_reference_id:
return False
if abs(read.template_length) > maxPairedFragmentLength:
return False
# check that the mates face each other (inward)
if read.is_reverse is read.mate_is_reverse:
return False
if read.is_reverse:
if read.reference_start >= read.next_reference_start:
return True
else:
if read.reference_start <= read.next_reference_start:
return True
return False
def get_fragment_from_read(self, read):
"""Get read start and end position of a read.
If given, the reads are extended as follows:
If reads are paired end, each read mate is extended to match
the fragment length, otherwise, a default fragment length
is used. If reads are split (give by the CIGAR string) then
the multiple positions of the read are returned.
When reads are extended the cigar information is
skipped.
Parameters
----------
read: pysam object.
The following values are defined (for forward reads)::
|-- -- read.tlen -- --|
|-- read.alen --|
-----|===============>------------<==============|----
| | |
read.reference_start
read.reference_end read.pnext
and for reverse reads
|-- -- read.tlen -- --|
|-- read.alen --|
-----|===============>-----------<===============|----
| | |
read.pnext read.reference_start read.reference_end
this is a sketch of a pair-end reads
The function returns the fragment start and end, either
using the paired end information (if available) or
extending the read in the appropriate direction if this
is single-end.
Parameters
----------
read : pysam read object
Returns
-------
list of tuples
[(fragment start, fragment end)]
>>> test = Tester()
>>> c = CountReadsPerBin([], 1, 1, 200, extendReads=True)
>>> c.defaultFragmentLength=100
>>> c.get_fragment_from_read(test.getRead("paired-forward"))
[(5000000, 5000100)]
>>> c.get_fragment_from_read(test.getRead("paired-reverse"))
[(5000000, 5000100)]
>>> c.defaultFragmentLength = 200
>>> c.get_fragment_from_read(test.getRead("single-forward"))
[(5001491, 5001691)]
>>> c.get_fragment_from_read(test.getRead("single-reverse"))
[(5001536, 5001736)]
>>> c.defaultFragmentLength = 'read length'
>>> c.get_fragment_from_read(test.getRead("single-forward"))
[(5001491, 5001527)]
>>> c.defaultFragmentLength = 'read length'
>>> c.extendReads = False
>>> c.get_fragment_from_read(test.getRead("paired-forward"))
[(5000000, 5000036)]
Tests for read centering.
>>> c = CountReadsPerBin([], 1, 1, 200, extendReads=True, center_read=True)
>>> c.defaultFragmentLength = 100
>>> assert c.get_fragment_from_read(test.getRead("paired-forward")) == [(5000032, 5000068)]
>>> c.defaultFragmentLength = 200
>>> assert c.get_fragment_from_read(test.getRead("single-reverse")) == [(5001618, 5001654)]
"""
# if no extension is needed, use pysam get_blocks
# to identify start and end reference positions.
# get_blocks return a list of start and end positions
# based on the CIGAR if skipped regions are found.
# E.g for a cigar of 40M260N22M
# get blocks return two elements for the first 40 matches
# and the for the last 22 matches.
if self.defaultFragmentLength == 'read length':
return read.get_blocks()
else:
if self.is_proper_pair(read, self.maxPairedFragmentLength):
if read.is_reverse:
fragmentStart = read.next_reference_start
fragmentEnd = read.reference_end
else:
fragmentStart = read.reference_start
# the end of the fragment is defined as
# the start of the forward read plus the insert length
fragmentEnd = read.reference_start + abs(read.template_length)
# Extend using the default fragment length
else:
if read.is_reverse:
fragmentStart = read.reference_end - self.defaultFragmentLength
fragmentEnd = read.reference_end
else:
fragmentStart = read.reference_start
fragmentEnd = read.reference_start + self.defaultFragmentLength
if self.center_read:
fragmentCenter = fragmentEnd - (fragmentEnd - fragmentStart) / 2
fragmentStart = int(fragmentCenter - read.infer_query_length(always=False) / 2)
fragmentEnd = fragmentStart + read.infer_query_length(always=False)
assert fragmentStart < fragmentEnd, "fragment start greater than fragment" \
"end for read {}".format(read.query_name)
return [(fragmentStart, fragmentEnd)]
def getSmoothRange(self, tileIndex, tileSize, smoothRange, maxPosition):
"""
Given a tile index position and a tile size (length), return the a new indices
over a larger range, called the smoothRange.
This region is centered in the tileIndex an spans on both sizes
to cover the smoothRange. The smoothRange is trimmed in case it is less
than zero or greater than maxPosition ::
---------------|==================|------------------
tileStart
|--------------------------------------|
| <-- smoothRange --> |
|
tileStart - (smoothRange-tileSize)/2
Test for a smooth range that spans 3 tiles.
Examples
--------
>>> c = CountReadsPerBin([], 1, 1, 1, 0)
>>> c.getSmoothRange(5, 1, 3, 10)
(4, 7)
Test smooth range truncated on start.
>>> c.getSmoothRange(0, 10, 30, 200)
(0, 2)
Test smooth range truncated on start.
>>> c.getSmoothRange(1, 10, 30, 4)
(0, 3)
Test smooth range truncated on end.
>>> c.getSmoothRange(5, 1, 3, 5)
(4, 5)
Test smooth range not multiple of tileSize.
>>> c.getSmoothRange(5, 10, 24, 10)
(4, 6)
"""
smoothTiles = int(smoothRange / tileSize)
if smoothTiles == 1:
return (tileIndex, tileIndex + 1)
smoothTilesSide = float(smoothTiles - 1) / 2
smoothTilesLeft = int(np.ceil(smoothTilesSide))
smoothTilesRight = int(np.floor(smoothTilesSide)) + 1
indexStart = max(tileIndex - smoothTilesLeft, 0)
indexEnd = min(maxPosition, tileIndex + smoothTilesRight)
return (indexStart, indexEnd)
def remove_row_of_zeros(matrix):
    """Return *matrix* without the rows whose entries are all zero or NaN.

    NaNs are treated as zeros for the test, so a row mixing only NaNs and
    zeros is dropped as well.
    """
    sanitized = np.nan_to_num(matrix)
    keep_mask = sanitized.sum(axis=1) != 0
    return matrix[keep_mask, :]
def estimateSizeFactors(m):
    """
    Compute size factors in the same way as DESeq2 (median-of-ratios).

    The inverse of that is returned, as it's then compatible with bamCoverage.

    m : a numpy ndarray

    >>> m = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 10, 0], [10, 5, 100]])
    >>> sf = estimateSizeFactors(m)
    >>> assert np.all(np.abs(sf - [1.305, 0.9932, 0.783]) < 1e-4)
    >>> m = np.array([[0, 0], [0, 1], [1, 1], [1, 2]])
    >>> sf = estimateSizeFactors(m)
    >>> assert np.all(np.abs(sf - [1.1892, 0.8409]) < 1e-4)
    """
    # Per-row geometric mean in log space, computed BEFORE masking so that
    # rows containing zeros produce -inf and can be masked out below.
    log_geo_means = np.log(m).mean(axis=1)
    # Hide non-positive counts and the rows whose geometric mean is undefined.
    masked_counts = np.ma.masked_where(m <= 0, m)
    log_geo_means = np.ma.masked_where(np.isinf(log_geo_means), log_geo_means)
    # DESeq2 ratio-based size factor: median over rows of log(count / geomean).
    log_ratios = (np.log(masked_counts).T - log_geo_means).T
    size_factors = np.exp(np.ma.median(log_ratios, axis=0))
    return 1. / size_factors
class Tester(object):
    """Helper exposing the bundled test-data BAM files used by the doctests."""

    def __init__(self):
        """
        The distribution of reads between the two bam files is as follows.

        They cover 200 bp::

              0                              100                           200
              |------------------------------------------------------------|
            A                                ===============
                                                            ===============


            B                 ===============               ===============
                                             ===============
                                                            ===============
        """
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        # self.root = "./test/test_data/"
        self.bamFile1 = self.root + "testA.bam"
        self.bamFile2 = self.root + "testB.bam"
        self.bamFile_PE = self.root + "test_paired2.bam"
        self.chrom = '3R'
        global debug
        debug = 0

    def getRead(self, readType):
        """Return one aligned read of the requested type from the paired-end test file."""
        # fetch window on chr2 for each supported read type
        windows = {
            'paired-reverse': (5000081, 5000082),
            'single-forward': (5001491, 5001492),
            'single-reverse': (5001700, 5001701),
        }
        # by default a forward paired read is returned
        fetch_start, fetch_end = windows.get(readType, (5000027, 5000028))
        bam = bamHandler.openBam(self.bamFile_PE)
        reads = [r for r in bam.fetch('chr2', fetch_start, fetch_end)]
        return reads[0]