File: countReadsPerBin.py

package info (click to toggle)
python-deeptools 3.5.6%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 34,456 kB
  • sloc: python: 14,503; xml: 4,212; sh: 33; makefile: 5
file content (1033 lines) | stat: -rw-r--r-- 42,159 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
import shutil
import os
import time
import sys
import multiprocessing
import numpy as np

# deepTools packages
import deeptools.utilities
from deeptools import bamHandler
from deeptools import mapReduce
from deeptoolsintervals import GTF
import pyBigWig

debug = 0
old_settings = np.seterr(all='ignore')


def countReadsInRegions_wrapper(args):
    """
    Passes the arguments to countReadsInRegions_worker.
    This is a step required given
    the constrains from the multiprocessing module.
    The args var, contains as first element the 'self' value
    from the countReadsPerBin object

    """
    return CountReadsPerBin.count_reads_in_region(*args)


class CountReadsPerBin(object):

    r"""Collects coverage over multiple bam files using multiprocessing

    This function collects read counts (coverage) from several bam files and returns
    an numpy array with the results. This class uses multiprocessing to compute the coverage.

    Parameters
    ----------
    bamFilesList : list
        List containing the names of indexed bam files. E.g. ['file1.bam', 'file2.bam']

    binLength : int
        Length of the window/bin. This value is overruled by ``bedFile`` if present.

    numberOfSamples : int
        Total number of samples. The genome is divided into ``numberOfSamples``, each
        with a window/bin length equal to ``binLength``. This value is overruled
        by ``stepSize`` in case such value is present and by ``bedFile`` in which
        case the number of samples and bins are defined in the bed file

    numberOfProcessors : int
        Number of processors to use. Default is 4

    verbose : bool
        Output messages. Default: False

    region : str
        Region to limit the computation in the form chrom:start:end.

    bedFile : list of file_handles.
        Each file handle corresponds to a bed file containing the regions for which to compute the coverage. This option
        overrules ``binLength``, ``numberOfSamples`` and ``stepSize``.

    blackListFileName : str
        A string containing a BED file with blacklist regions.

    extendReads : bool, int

        Whether coverage should be computed for the extended read length (i.e. the region covered
        by the two mates or the regions expected to be covered by single-reads).
        If the value is 'int', then then this is interpreted as the fragment length to extend reads
        that are not paired. For Illumina reads, usual values are around 300.
        This value can be determined using the peak caller MACS2 or can be
        approximated by the fragment lengths computed when preparing the library for sequencing. If the value
        is of the variable is true and not value is given, the fragment size is sampled from the library but
        only if the library is paired-end. Default: False


    minMappingQuality : int
        Reads of a mapping quality less than the give value are not considered. Default: None

    ignoreDuplicates : bool
        Whether read duplicates (same start, end position. If paired-end, same start-end for mates) are
        to be excluded. Default: false

    chrToSkip: list
        List with names of chromosomes that do not want to be included in the coverage computation.
        This is useful to remove unwanted chromosomes (e.g. 'random' or 'Het').

    stepSize : int
        the positions for which the coverage is computed are defined as follows:
        ``range(start, end, stepSize)``. Thus, a stepSize of 1, will compute
        the coverage at each base pair. If the stepSize is equal to the
        binLength then the coverage is computed for consecutive bins. If seepSize is
        smaller than the binLength, then teh bins will overlap.

    center_read : bool
        Determines if reads should be centered with respect to the fragment length.

    samFlag_include : int
        Extracts only those reads having the SAM flag. For example, to get only
        reads that are the first mates a samFlag of 64 could be used. Similarly, the
        samFlag_include can be used to select only reads mapping on the reverse strand
        or to get only properly paired reads.

    samFlag_exclude : int
        Removes reads that match the SAM flag. For example to get all reads
        that map to the forward strand a samFlag_exlude 16 should be used. Which
        translates into exclude all reads that map to the reverse strand.

    zerosToNans : bool
        If true, zero values encountered are transformed to Nans. Default false.

    skipZeroOverZero : bool
        If true, skip bins where all input BAM files have no coverage (only applicable to bamCompare).

    minFragmentLength : int
        If greater than 0, fragments below this size are excluded.

    maxFragmentLength : int
        If greater than 0, fragments above this size are excluded.

    out_file_for_raw_data : str
        File name to save the raw counts computed

    statsList : list
        For each BAM file in bamFilesList, the associated per-chromosome statistics returned by openBam

    mappedList : list
        For each BAM file in bamFilesList, the number of mapped reads in the file.

    bed_and_bin : boolean
        If true AND a bedFile is given, compute coverage of each bin of the given size in each region of bedFile

    genomeChunkSize : int
        If not None, the length of the genome used for multiprocessing.

    Returns
    -------
    numpy array

        Each row correspond to each bin/bed region and each column correspond to each of
        the bamFiles.


    Examples
    --------

    The test data contains reads for 200 bp.

    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1

    >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 50, 4)
    >>> np.transpose(c.run())
    array([[0., 0., 1., 1.],
           [0., 1., 1., 2.]])
    """

    def __init__(self, bamFilesList, binLength=50, numberOfSamples=None, numberOfProcessors=1,
                 verbose=False, region=None,
                 bedFile=None, extendReads=False,
                 genomeChunkSize=None,
                 blackListFileName=None,
                 minMappingQuality=None,
                 ignoreDuplicates=False,
                 chrsToSkip=[],
                 stepSize=None,
                 center_read=False,
                 samFlag_include=None,
                 samFlag_exclude=None,
                 zerosToNans=False,
                 skipZeroOverZero=False,
                 smoothLength=0,
                 minFragmentLength=0,
                 maxFragmentLength=0,
                 out_file_for_raw_data=None,
                 bed_and_bin=False,
                 statsList=[],
                 mappedList=[]):

        self.bamFilesList = bamFilesList
        self.binLength = binLength
        self.numberOfSamples = numberOfSamples
        self.blackListFileName = blackListFileName
        self.statsList = statsList
        self.mappedList = mappedList
        self.skipZeroOverZero = skipZeroOverZero
        self.bed_and_bin = bed_and_bin
        self.genomeChunkSize = genomeChunkSize

        if extendReads and len(bamFilesList):
            from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
            frag_len_dict, read_len_dict = get_read_and_fragment_length(bamFilesList[0],
                                                                        return_lengths=False,
                                                                        blackListFileName=blackListFileName,
                                                                        numberOfProcessors=numberOfProcessors,
                                                                        verbose=verbose)
            if extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    self.defaultFragmentLength = int(frag_len_dict['median'])
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if verbose:
                    print(("Fragment length based on paired en data "
                          "estimated to be {}".format(frag_len_dict['median'])))

            elif extendReads < read_len_dict['median']:
                sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                                 "Reads will not be extended.\n".format(int(read_len_dict['median'])))
                self.defaultFragmentLength = 'read length'

            elif extendReads > 2000:
                exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(extendReads))
            else:
                self.defaultFragmentLength = int(extendReads)

        else:
            self.defaultFragmentLength = 'read length'

        self.numberOfProcessors = numberOfProcessors
        self.verbose = verbose
        self.region = region
        self.bedFile = bedFile
        self.minMappingQuality = minMappingQuality
        self.ignoreDuplicates = ignoreDuplicates
        self.chrsToSkip = chrsToSkip
        self.stepSize = stepSize
        self.center_read = center_read
        self.samFlag_include = samFlag_include
        self.samFlag_exclude = samFlag_exclude
        self.minFragmentLength = minFragmentLength
        self.maxFragmentLength = maxFragmentLength
        self.zerosToNans = zerosToNans
        self.smoothLength = smoothLength

        if out_file_for_raw_data:
            self.save_data = True
            self.out_file_for_raw_data = out_file_for_raw_data
        else:
            self.save_data = False
            self.out_file_for_raw_data = None

        # check that wither numberOfSamples or stepSize are set
        if numberOfSamples is None and stepSize is None and bedFile is None:
            raise ValueError("either stepSize, numberOfSamples or bedFile have to be set")

        if self.defaultFragmentLength != 'read length':
            self.maxPairedFragmentLength = 4 * self.defaultFragmentLength
        else:
            self.maxPairedFragmentLength = 1000
        if self.maxFragmentLength > 0:
            self.maxPairedFragmentLength = self.maxFragmentLength

        if len(self.mappedList) == 0:
            try:
                for fname in self.bamFilesList:
                    bam, mapped, unmapped, stats = bamHandler.openBam(fname, returnStats=True, nThreads=self.numberOfProcessors)
                    self.mappedList.append(mapped)
                    self.statsList.append(stats)
                    bam.close()
            except:
                self.mappedList = []
                self.statsList = []

    def get_chunk_length(self, bamFilesHandles, genomeSize, chromSizes, chrLengths):
        # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
        # workers for analysis. If too short, too much time is spent loading the files
        # if too long, some processors end up free.
        # the following values are empirical
        if self.stepSize is None:
            if self.region is None:
                self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
            else:
                # compute the step size, based on the number of samples
                # and the length of the region studied
                (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
                self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

        # number of samples is better if large
        if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
            min_num_of_samples = int(genomeSize / np.mean(chrLengths))
            raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))

        max_mapped = 0
        if len(self.mappedList) > 0:
            max_mapped = max(self.mappedList)

        # If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of binLength and use every bin
        if max_mapped == 0:
            chunkSize = 10000 * self.binLength
            self.stepSize = self.binLength
        else:
            reads_per_bp = float(max_mapped) / genomeSize
            chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandles)))

        # Ensure that chunkSize is always at least self.stepSize
        if chunkSize < self.stepSize:
            chunkSize = self.stepSize

        # Ensure that chunkSize is always at least self.binLength
        if self.binLength and chunkSize < self.binLength:
            chunkSize = self.binLength

        return chunkSize

    def run(self, allArgs=None):
        bamFilesHandles = []
        for x in self.bamFilesList:
            try:
                y = bamHandler.openBam(x)
            except SystemExit:
                sys.exit(sys.exc_info()[1])
            except:
                y = pyBigWig.open(x)
            bamFilesHandles.append(y)

        chromsizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandles, verbose=self.verbose)

        # skip chromosome in the list. This is usually for the
        # X chromosome which may have either one copy  in a male sample
        # or a mixture of male/female and is unreliable.
        # Also the skip may contain heterochromatic regions and
        # mitochondrial DNA
        if len(self.chrsToSkip):
            chromsizes = [x for x in chromsizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = list(zip(*chromsizes))

        genomeSize = sum(chrLengths)

        chunkSize = None
        if self.bedFile is None:
            if self.genomeChunkSize is None:
                chunkSize = self.get_chunk_length(bamFilesHandles, genomeSize, chromsizes, chrLengths)
            else:
                chunkSize = self.genomeChunkSize

        [bam_h.close() for bam_h in bamFilesHandles]

        if self.verbose:
            print("step size is {}".format(self.stepSize))

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # Handle GTF options
        transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

        # use map reduce to call countReadsInRegions_wrapper
        imap_res = mapReduce.mapReduce([],
                                       countReadsInRegions_wrapper,
                                       chromsizes,
                                       self_=self,
                                       genomeChunkLength=chunkSize,
                                       bedFile=self.bedFile,
                                       blackListFileName=self.blackListFileName,
                                       region=self.region,
                                       numberOfProcessors=self.numberOfProcessors,
                                       transcriptID=transcriptID,
                                       exonID=exonID,
                                       keepExons=keepExons,
                                       transcript_id_designator=transcript_id_designator)

        if self.out_file_for_raw_data:
            if len(non_common):
                sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                                 "the chromosomes that were not common between the bigwig files\n")

            # concatenate intermediary bedgraph files
            ofile = open(self.out_file_for_raw_data, "w")
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    _foo = open(tempFileName, 'r')
                    shutil.copyfileobj(_foo, ofile)
                    _foo.close()
                    os.remove(tempFileName)

            ofile.close()

        try:
            num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
            return num_reads_per_bin

        except ValueError:
            if self.bedFile:
                sys.exit('\nNo coverage values could be computed.\n\n'
                         'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                         'The valid chromosome names are:\n{}'.format(chrNames))
            else:
                sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                         'contain mapped reads.')

    def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
        """Counts the reads in each bam file at each 'stepSize' position
        within the interval (start, end) for a window or bin of size binLength.

        The stepSize controls the distance between bins. For example,
        a step size of 20 and a bin size of 20 will create bins next to
        each other. If the step size is smaller than the bin size the
        bins will overlap.

        If a list of bedRegions is given, then the number of reads
        that overlaps with each region is counted.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        bed_regions_list: list
            List of list of tuples of the form (start, end)
            corresponding to bed regions to be processed.
            If not bed file was passed to the object constructor
            then this list is empty.

        Returns
        -------
        numpy array
            The result is a numpy array that as rows each bin
            and as columns each bam file.


        Examples
        --------
        Initialize some useful values

        >>> test = Tester()
        >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

        The transpose is used to get better looking numbers. The first line
        corresponds to the number of reads per bin in the first bamfile.

        >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
        >>> _array
        array([[0., 0.],
               [0., 1.],
               [1., 1.],
               [1., 2.]])

        """

        if start > end:
            raise NameError("start %d bigger that end %d" % (start, end))

        if self.stepSize is None and bed_regions_list is None:
            raise ValueError("stepSize is not set!")
        # array to keep the read counts for the regions
        subnum_reads_per_bin = []

        start_time = time.time()

        bam_handles = []
        for fname in self.bamFilesList:
            try:
                bam_handles.append(bamHandler.openBam(fname))
            except SystemExit:
                sys.exit(sys.exc_info()[1])
            except:
                bam_handles.append(pyBigWig.open(fname))

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        # A list of lists of tuples
        transcriptsToConsider = []
        if bed_regions_list is not None:
            if self.bed_and_bin:
                transcriptsToConsider.append([(x[1][0][0], x[1][0][1], self.binLength) for x in bed_regions_list])
            else:
                transcriptsToConsider = [x[1] for x in bed_regions_list]
        else:
            if self.stepSize == self.binLength:
                transcriptsToConsider.append([(start, end, self.binLength)])
            else:
                for i in range(start, end, self.stepSize):
                    if i + self.binLength > end:
                        break
                    if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength):
                        continue
                    transcriptsToConsider.append([(i, i + self.binLength)])

        if self.save_data:
            _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
            _file_name = _file.name
        else:
            _file_name = ''

        for bam in bam_handles:
            for trans in transcriptsToConsider:
                tcov = self.get_coverage_of_region(bam, chrom, trans)
                if bed_regions_list is not None and not self.bed_and_bin:
                    subnum_reads_per_bin.append(np.sum(tcov))
                else:
                    subnum_reads_per_bin.extend(tcov)

        subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F')

        if self.save_data:
            idx = 0
            for i, trans in enumerate(transcriptsToConsider):
                if len(trans[0]) != 3:
                    starts = ",".join([str(x[0]) for x in trans])
                    ends = ",".join([str(x[1]) for x in trans])
                    _file.write("\t".join([chrom, starts, ends]) + "\t")
                    _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n")
                else:
                    for exon in trans:
                        for startPos in range(exon[0], exon[1], exon[2]):
                            if idx >= subnum_reads_per_bin.shape[0]:
                                # At the end of chromosomes (or due to blacklisted regions), there are bins smaller than the bin size
                                # Counts there are added to the bin before them, but range() will still try to include them.
                                break
                            _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, min(startPos + exon[2], exon[1])))
                            _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n")
                            idx += 1
            _file.close()

        if self.verbose:
            endTime = time.time()
            rows = subnum_reads_per_bin.shape[0]
            print("%s countReadsInRegions_worker: processing %d "
                  "(%.1f per sec) @ %s:%s-%s" %
                  (multiprocessing.current_process().name,
                   rows, rows / (endTime - start_time), chrom, start, end))

        return subnum_reads_per_bin, _file_name

    def get_coverage_of_region(self, bamHandle, chrom, regions,
                               fragmentFromRead_func=None):
        """
        Returns a numpy array that corresponds to the number of reads
        that overlap with each tile.

        >>> test = Tester()
        >>> import pysam
        >>> c = CountReadsPerBin([], stepSize=1, extendReads=300)

        For this case the reads are length 36. The number of overlapping
        read fragments is 4 and 5 for the positions tested.

        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000833, 5000834), (5000834, 5000835)])
        array([4., 5.])

        In the following example a paired read is extended to the fragment length which is 100
        The first mate starts at 5000000 and the second at 5000064. Each mate is
        extended to the fragment length *independently*
        At position 500090-500100 one fragment  of length 100 overlap, and after position 5000101
        there should be zero reads.

        >>> c.zerosToNans = True
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000090, 5000100), (5000100, 5000110)])
        array([ 1., nan])

        In the following  case the reads length is 50. Reads are not extended.

        >>> c.extendReads=False
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
        array([1., 2., 2.])


        """
        if not fragmentFromRead_func:
            fragmentFromRead_func = self.get_fragment_from_read
        nbins = len(regions)
        if len(regions[0]) == 3:
            nbins = 0
            for reg in regions:
                nbins += (reg[1] - reg[0]) // reg[2]
                if (reg[1] - reg[0]) % reg[2] > 0:
                    nbins += 1
        coverages = np.zeros(nbins, dtype='float64')

        if self.defaultFragmentLength == 'read length':
            extension = 0
        else:
            extension = self.maxPairedFragmentLength

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        vector_start = 0
        for idx, reg in enumerate(regions):
            if len(reg) == 3:
                tileSize = int(reg[2])
                nRegBins = (reg[1] - reg[0]) // tileSize
                if (reg[1] - reg[0]) % tileSize > 0:
                    # Don't eliminate small bins! Issue 887
                    nRegBins += 1
            else:
                nRegBins = 1
                tileSize = int(reg[1] - reg[0])

            # Blacklisted regions have a coverage of 0
            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
                continue
            regStart = int(max(0, reg[0] - extension))
            regEnd = reg[1] + int(extension)

            # If alignments are extended and there's a blacklist, ensure that no
            # reads originating in a blacklist are fetched
            if blackList and reg[0] > 0 and extension > 0:
                o = blackList.findOverlaps(chrom, regStart, reg[0])
                if o is not None and len(o) > 0:
                    regStart = o[-1][1]
                o = blackList.findOverlaps(chrom, reg[1], regEnd)
                if o is not None and len(o) > 0:
                    regEnd = o[0][0]

            start_time = time.time()
            # caching seems faster. TODO: profile the function
            c = 0
            if chrom not in bamHandle.references:
                raise NameError("chromosome {} not found in bam file".format(chrom))

            prev_pos = set()
            lpos = None
            # of previous processed read pair
            for read in bamHandle.fetch(chrom, regStart, regEnd):
                if read.is_unmapped:
                    continue
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # filter reads based on SAM flag
                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                    continue
                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                    continue

                # Fragment lengths
                tLen = deeptools.utilities.getTLen(read)
                if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                    continue

                # get rid of duplicate reads that have same position on each of the
                # pairs
                if self.ignoreDuplicates:
                    # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                    if tLen >= 0:
                        s = read.pos
                        e = s + tLen
                    else:
                        s = read.pnext
                        e = s - tLen
                    if read.reference_id != read.next_reference_id:
                        e = read.pnext
                    if lpos is not None and lpos == read.reference_start \
                            and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                        continue
                    if lpos != read.reference_start:
                        prev_pos.clear()
                    lpos = read.reference_start
                    prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

                # since reads can be split (e.g. RNA-seq reads) each part of the
                # read that maps is called a position block.
                try:
                    position_blocks = fragmentFromRead_func(read)
                except TypeError:
                    # the get_fragment_from_read functions returns None in some cases.
                    # Those cases are to be skipped, hence the continue line.
                    continue

                last_eIdx = None
                for fragmentStart, fragmentEnd in position_blocks:
                    if fragmentEnd is None or fragmentStart is None:
                        continue
                    fragmentLength = fragmentEnd - fragmentStart
                    if fragmentLength == 0:
                        continue
                    # skip reads that are not in the region being
                    # evaluated.
                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                        continue

                    if fragmentStart < reg[0]:
                        fragmentStart = reg[0]
                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
                        fragmentEnd = reg[0] + len(coverages) * tileSize

                    sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                    eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                    if last_eIdx is not None:
                        sIdx = max(last_eIdx, sIdx)
                        if sIdx >= eIdx:
                            continue
                    sIdx = int(sIdx)
                    eIdx = int(eIdx)
                    coverages[sIdx:eIdx] += 1
                    last_eIdx = eIdx

                c += 1

            if self.verbose:
                endTime = time.time()
                print("%s,  processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                    multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

            vector_start += nRegBins

        # change zeros to NAN
        if self.zerosToNans:
            coverages[coverages == 0] = np.nan

        return coverages

    def getReadLength(self, read):
        return len(read)

    @staticmethod
    def is_proper_pair(read, maxPairedFragmentLength):
        """
        Checks if a read is proper pair meaning that both mates are facing each other and are in
        the same chromosome and are not to far away. The sam flag for proper pair can not
        always be trusted. Note that if the fragment size is > maxPairedFragmentLength (~2kb
        usually) that False will be returned.
        :return: bool

        >>> import pysam
        >>> import os
        >>> from deeptools.countReadsPerBin import CountReadsPerBin as cr
        >>> root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        >>> bam = pysam.AlignmentFile("{}/test_proper_pair_filtering.bam".format(root))
        >>> iter = bam.fetch()
        >>> read = next(iter)
        >>> cr.is_proper_pair(read, 1000) # "keep" read
        True
        >>> cr.is_proper_pair(read, 200) # "keep" read, but maxPairedFragmentLength is too short
        False
        >>> read = next(iter)
        >>> cr.is_proper_pair(read, 1000) # "improper pair"
        False
        >>> read = next(iter)
        >>> cr.is_proper_pair(read, 1000) # "mismatch chr"
        False
        >>> read = next(iter)
        >>> cr.is_proper_pair(read, 1000) # "same orientation1"
        False
        >>> read = next(iter)
        >>> cr.is_proper_pair(read, 1000) # "same orientation2"
        False
        >>> read = next(iter)
        >>> cr.is_proper_pair(read, 1000) # "rev first"
        False
        >>> read = next(iter)
        >>> cr.is_proper_pair(read, 1000) # "rev first OK"
        True
        >>> read = next(iter)
        >>> cr.is_proper_pair(read, 1000) # "for first"
        False
        >>> read = next(iter)
        >>> cr.is_proper_pair(read, 1000) # "for first"
        True
        """
        if not read.is_proper_pair:
            return False
        if read.reference_id != read.next_reference_id:
            return False
        if abs(read.template_length) > maxPairedFragmentLength:
            return False
        # check that the mates face each other (inward)
        if read.is_reverse is read.mate_is_reverse:
            return False
        if read.is_reverse:
            if read.reference_start >= read.next_reference_start:
                return True
        else:
            if read.reference_start <= read.next_reference_start:
                return True
        return False

    def get_fragment_from_read(self, read):
        """Get read start and end position of a read.
        If given, the reads are extended as follows:
        If reads are paired end, each read mate is extended to match
        the fragment length, otherwise, a default fragment length
        is used. If reads are split (give by the CIGAR string) then
        the multiple positions of the read are returned.
        When reads are extended the cigar information is
        skipped.

        Parameters
        ----------
        read: pysam object.

        The following values are defined (for forward reads)::


                 |--          -- read.tlen --              --|
                 |-- read.alen --|
            -----|===============>------------<==============|----
                 |               |            |
            read.reference_start
                        read.reference_end  read.pnext

              and for reverse reads


                 |--             -- read.tlen --           --|
                                             |-- read.alen --|
            -----|===============>-----------<===============|----
                 |                           |               |
              read.pnext           read.reference_start  read.reference_end

        this is a sketch of a pair-end reads

        The function returns the fragment start and end, either
        using the paired end information (if available) or
        extending the read in the appropriate direction if this
        is single-end.

        Parameters
        ----------
        read : pysam read object


        Returns
        -------
        list of tuples
            [(fragment start, fragment end)]


        >>> test = Tester()
        >>> c = CountReadsPerBin([], 1, 1, 200, extendReads=True)
        >>> c.defaultFragmentLength=100
        >>> c.get_fragment_from_read(test.getRead("paired-forward"))
        [(5000000, 5000100)]
        >>> c.get_fragment_from_read(test.getRead("paired-reverse"))
        [(5000000, 5000100)]
        >>> c.defaultFragmentLength = 200
        >>> c.get_fragment_from_read(test.getRead("single-forward"))
        [(5001491, 5001691)]
        >>> c.get_fragment_from_read(test.getRead("single-reverse"))
        [(5001536, 5001736)]
        >>> c.defaultFragmentLength = 'read length'
        >>> c.get_fragment_from_read(test.getRead("single-forward"))
        [(5001491, 5001527)]
        >>> c.defaultFragmentLength = 'read length'
        >>> c.extendReads = False
        >>> c.get_fragment_from_read(test.getRead("paired-forward"))
        [(5000000, 5000036)]

        Tests for read centering.

        >>> c = CountReadsPerBin([], 1, 1, 200, extendReads=True, center_read=True)
        >>> c.defaultFragmentLength = 100
        >>> assert c.get_fragment_from_read(test.getRead("paired-forward")) == [(5000032, 5000068)]
        >>> c.defaultFragmentLength = 200
        >>> assert c.get_fragment_from_read(test.getRead("single-reverse")) == [(5001618, 5001654)]
        """
        # if no extension is needed, use pysam get_blocks
        # to identify start and end reference positions.
        # get_blocks return a list of start and end positions
        # based on the CIGAR if skipped regions are found.
        # E.g for a cigar of 40M260N22M
        # get blocks return two elements for the first 40 matches
        # and the for the last 22 matches.
        if self.defaultFragmentLength == 'read length':
            return read.get_blocks()

        else:
            if self.is_proper_pair(read, self.maxPairedFragmentLength):
                if read.is_reverse:
                    fragmentStart = read.next_reference_start
                    fragmentEnd = read.reference_end
                else:
                    fragmentStart = read.reference_start
                    # the end of the fragment is defined as
                    # the start of the forward read plus the insert length
                    fragmentEnd = read.reference_start + abs(read.template_length)

            # Extend using the default fragment length
            else:
                if read.is_reverse:
                    fragmentStart = read.reference_end - self.defaultFragmentLength
                    fragmentEnd = read.reference_end
                else:
                    fragmentStart = read.reference_start
                    fragmentEnd = read.reference_start + self.defaultFragmentLength

        if self.center_read:
            fragmentCenter = fragmentEnd - (fragmentEnd - fragmentStart) / 2
            fragmentStart = int(fragmentCenter - read.infer_query_length(always=False) / 2)
            fragmentEnd = fragmentStart + read.infer_query_length(always=False)

        assert fragmentStart < fragmentEnd, "fragment start greater than fragment" \
                                            "end for read {}".format(read.query_name)
        return [(fragmentStart, fragmentEnd)]

    def getSmoothRange(self, tileIndex, tileSize, smoothRange, maxPosition):
        """
        Given a tile index position and a tile size (length), return the a new indices
        over a larger range, called the smoothRange.
        This region is centered in the tileIndex  an spans on both sizes
        to cover the smoothRange. The smoothRange is trimmed in case it is less
        than zero or greater than  maxPosition ::


             ---------------|==================|------------------
                        tileStart
                   |--------------------------------------|
                   |    <--      smoothRange     -->      |
                   |
             tileStart - (smoothRange-tileSize)/2

        Test for a smooth range that spans 3 tiles.

        Examples
        --------

        >>> c = CountReadsPerBin([], 1, 1, 1, 0)
        >>> c.getSmoothRange(5, 1, 3, 10)
        (4, 7)

        Test smooth range truncated on start.

        >>> c.getSmoothRange(0, 10, 30, 200)
        (0, 2)

        Test smooth range truncated on start.

        >>> c.getSmoothRange(1, 10, 30, 4)
        (0, 3)

        Test smooth range truncated on end.

        >>> c.getSmoothRange(5, 1, 3, 5)
        (4, 5)

        Test smooth range not multiple of tileSize.

        >>> c.getSmoothRange(5, 10, 24, 10)
        (4, 6)
        """
        smoothTiles = int(smoothRange / tileSize)
        if smoothTiles == 1:
            return (tileIndex, tileIndex + 1)

        smoothTilesSide = float(smoothTiles - 1) / 2
        smoothTilesLeft = int(np.ceil(smoothTilesSide))
        smoothTilesRight = int(np.floor(smoothTilesSide)) + 1

        indexStart = max(tileIndex - smoothTilesLeft, 0)
        indexEnd = min(maxPosition, tileIndex + smoothTilesRight)
        return (indexStart, indexEnd)


def remove_row_of_zeros(matrix):
    # remove rows containing all zeros or all nans
    _mat = np.nan_to_num(matrix)
    to_keep = _mat.sum(1) != 0
    return matrix[to_keep, :]


def estimateSizeFactors(m):
    """
    Compute size factors in the same way as DESeq2.
    The inverse of that is returned, as it's then compatible with bamCoverage.

    m : a numpy ndarray

    >>> m = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 10, 0], [10, 5, 100]])
    >>> sf = estimateSizeFactors(m)
    >>> assert np.all(np.abs(sf - [1.305, 0.9932, 0.783]) < 1e-4)
    >>> m = np.array([[0, 0], [0, 1], [1, 1], [1, 2]])
    >>> sf = estimateSizeFactors(m)
    >>> assert np.all(np.abs(sf - [1.1892, 0.8409]) < 1e-4)
    """
    loggeomeans = np.sum(np.log(m), axis=1) / m.shape[1]
    # Mask after computing the geometric mean
    m = np.ma.masked_where(m <= 0, m)
    loggeomeans = np.ma.masked_where(np.isinf(loggeomeans), loggeomeans)
    # DESeq2 ratio-based size factor
    sf = np.exp(np.ma.median((np.log(m).T - loggeomeans).T, axis=0))
    return 1. / sf


class Tester(object):

    def __init__(self):
        """
        The distribution of reads between the two bam files is as follows.

        They cover 200 bp

          0                              100                           200
          |------------------------------------------------------------|
        A                                ===============
                                                        ===============


        B                 ===============               ===============
                                         ===============
                                                        ===============
        """
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        # self.root = "./test/test_data/"
        self.bamFile1 = self.root + "testA.bam"
        self.bamFile2 = self.root + "testB.bam"
        self.bamFile_PE = self.root + "test_paired2.bam"
        self.chrom = '3R'
        global debug
        debug = 0

    def getRead(self, readType):
        """ prepare arguments for test
        """
        bam = bamHandler.openBam(self.bamFile_PE)
        if readType == 'paired-reverse':
            read = [x for x in bam.fetch('chr2', 5000081, 5000082)][0]
        elif readType == 'single-forward':
            read = [x for x in bam.fetch('chr2', 5001491, 5001492)][0]
        elif readType == 'single-reverse':
            read = [x for x in bam.fetch('chr2', 5001700, 5001701)][0]
        else:  # by default a forward paired read is returned
            read = [x for x in bam.fetch('chr2', 5000027, 5000028)][0]
        return read