File: test_bamCoverage_and_bamCompare.py

package info (click to toggle)
python-deeptools 3.5.0-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 34,624 kB
  • sloc: python: 14,765; xml: 4,090; sh: 38; makefile: 11
file content (463 lines) | stat: -rw-r--r-- 17,172 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
from nose.tools import assert_equal
import deeptools.bamCoverage as bam_cov
import deeptools.bamCompare as bam_comp
import deeptools.getScaleFactor as gs
import os.path
import filecmp
from os import unlink

ROOT = os.path.dirname(os.path.abspath(__file__)) + "/test_data/"
BAMFILE_A = ROOT + "testA.bam"
BAMFILE_B = ROOT + "testB.bam"
BAMFILE_FILTER1 = ROOT + "test_filtering.bam"
BAMFILE_FILTER2 = ROOT + "test_filtering2.bam"
CRAMFILE_A = ROOT + "testA.cram"
CRAMFILE_B = ROOT + "testB.cram"
CRAMFILE_FILTER1 = ROOT + "test_filtering.cram"
CRAMFILE_FILTER2 = ROOT + "test_filtering2.cram"
BEDFILE_FILTER = ROOT + "test_filtering.blacklist.bed"


"""
The distribution of reads for the bam file is:

              0                              100                           200
              |------------------------------------------------------------|
testA.bam  3R                                ==============>
                                                            <==============


testB.bam  3R                 <==============               ==============>
                                             ==============>
                                                            ==============>
        """


def test_bam_coverage_arguments():
    """
    Test minimal command line args for bamCoverage
    """
    outfile = '/tmp/test_file.bg'
    for fname in [BAMFILE_B, CRAMFILE_B]:
        args = "--bam {} -o {} --outFileFormat bedgraph".format(fname, outfile).split()
        bam_cov.main(args)

        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()
        expected = ['3R\t0\t50\t0\n', '3R\t50\t150\t1\n', '3R\t150\t200\t2\n']
        assert_equal(resp, expected)
        unlink(outfile)


def test_bam_coverage_extend():
    outfile = '/tmp/test_file.bg'
    for fname in [BAMFILE_B, CRAMFILE_B]:
        args = "-b {} -o {} --extendReads 100 --outFileFormat bedgraph".format(fname, outfile).split()
        bam_cov.main(args)
        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()
        expected = ['3R\t0\t150\t1\n', '3R\t150\t200\t3\n']
        assert_equal(resp, expected)
        unlink(outfile)


def test_bam_coverage_extend_and_normalizeUsingRPGC():

    outfile = '/tmp/test_file.bg'
    for fname in [BAMFILE_B, CRAMFILE_B]:
        args = "-b {} -o {} --normalizeUsing RPGC --effectiveGenomeSize 200 --extendReads 100 " \
               "--outFileFormat bedgraph".format(fname, outfile).split()
        bam_cov.main(args)
        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()
        # the scale factor should be 0.5, thus the result is similar to
        # that of the previous test divided by 0.5
        expected = ['3R\t0\t150\t0.5\n', '3R\t150\t200\t1.5\n']
        assert_equal(resp, expected)
        unlink(outfile)


def test_bam_coverage_skipnas():
    outfile = '/tmp/test_file.bg'
    for fname in [BAMFILE_B, CRAMFILE_B]:
        args = "--bam {} -o {} --outFileFormat bedgraph --skipNAs".format(fname, outfile).split()
        bam_cov.main(args)

        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()
        expected = ['3R\t50\t150\t1\n', '3R\t150\t200\t2\n']
        assert_equal(resp, expected)
        unlink(outfile)


def test_bam_coverage_filtering():
    outfile = '/tmp/test_file.bg'
    for fname in [BAMFILE_B, CRAMFILE_B]:
        args = "--bam {} -o {} --outFileFormat bedgraph --ignoreDuplicates --verbose".format(fname, outfile).split()
        bam_cov.main(args)

        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()
        expected = ['3R\t0\t50\t0\n', '3R\t50\t200\t1\n']
        assert_equal(resp, expected)
        unlink(outfile)


def test_bam_compare_arguments():
    """
    Test minimal command line args for bamCoverage. The ratio
    between the same file is taken, therefore, the expected value
    is 1.0 for all bins.
    """
    outfile = '/tmp/test_file.bg'
    for fname in [BAMFILE_B, CRAMFILE_B]:
        args = "--bamfile1 {} --bamfile2 {} " \
               "-o {} -p 1 --outFileFormat bedgraph --operation ratio".format(fname, fname, outfile).split()
        bam_comp.main(args)

        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()
        expected = ['3R\t0\t200\t1\n']
        assert_equal(resp, expected)
        unlink(outfile)


def test_bam_compare_diff_files():
    """
    Test with two different files
    """
    outfile = '/tmp/test_file.bg'
    for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]:
        args = "--bamfile1 {} --bamfile2 {} --scaleFactors 1:1 --operation subtract " \
               "-o {} -p 1 --outFileFormat bedgraph".format(A, B, outfile).split()
        bam_comp.main(args)

        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()
        expected = ['3R\t0\t50\t0\n', '3R\t50\t100\t-1\n', '3R\t100\t150\t0\n', '3R\t150\t200\t-1\n']
        assert_equal(resp, expected)
        unlink(outfile)


def test_bam_compare_pseudocounts():
    """
    Test with different pseudocounts
    """
    outfile = '/tmp/test_file.bg'
    args = "--bamfile1 {} --bamfile2 {} --outFileFormat bedgraph --scaleFactors 1:1 -o {} " \
           "--pseudocount 1 0".format(BAMFILE_A, BAMFILE_B, outfile).split()
    bam_comp.main(args)

    _foo = open(outfile, 'r')
    resp = _foo.readlines()
    _foo.close()
    expected = ['3R\t0\t50\tinf\n', '3R\t50\t100\t0\n', '3R\t100\t150\t1\n', '3R\t150\t200\t0\n']
    assert_equal(resp, expected)
    unlink(outfile)


def test_bam_compare_ZoverZ():
    """
    Ensure --skipZeroOverZero works in bamCompare
    """
    outfile = '/tmp/test_file.bg'
    args = "--bamfile1 {} --bamfile2 {} --outFileFormat bedgraph --scaleFactors 1:1 -o {} " \
           "--skipZeroOverZero".format(BAMFILE_A, BAMFILE_B, outfile).split()
    bam_comp.main(args)

    _foo = open(outfile, 'r')
    resp = _foo.readlines()
    _foo.close()
    expected = ['3R\t50\t100\t-1\n', '3R\t100\t150\t0\n', '3R\t150\t200\t-0.584963\n']
    assert_equal(resp, expected)
    unlink(outfile)


def test_get_num_kept_reads():
    """
    Test the scale factor functions
    """
    for fname in [BAMFILE_A, CRAMFILE_A]:
        args = "--bam {}  -o /tmp/test".format(fname).split()

        args = bam_cov.process_args(args)
        num_kept_reads, total_reads = gs.get_num_kept_reads(args, None)

        # bam file 1 has 2 reads in 3R and 2 read in chr_cigar
        assert num_kept_reads == 3, "num_kept_reads is wrong"
        assert total_reads == 3, "num total reads is wrong"

        # ignore chr_cigar to count the total number of reads
        args = "--bam {} --ignoreForNormalization chr_cigar  -o /tmp/test".format(fname).split()
        args = bam_cov.process_args(args)
        num_kept_reads, total_reads = gs.get_num_kept_reads(args, None)

        # the  number of kept reads should be 2 as the read on chr_cigar is skipped
        assert num_kept_reads == 2, "num_kept_reads is wrong ({})".format(num_kept_reads)

        # test filtering by read direction. Only forward reads are kept
        args = "--bam {}  -o /tmp/test --samFlagExclude 16 --ignoreForNormalization chr_cigar ".format(fname).split()

        args = bam_cov.process_args(args)
        num_kept_reads, total_reads = gs.get_num_kept_reads(args, None)

        # only one forward read is expected in
        assert num_kept_reads == 1, "num_kept_reads is wrong"


def test_bam_compare_diff_files_skipnas():
    """
    Test skipnas
    Compared to the previous tests, any region that do not have coverage (in either of the bam files)
    is not included in the bedgraph file.
    """
    outfile = '/tmp/test_file.bg'
    for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]:
        args = "--bamfile1 {} --bamfile2 {} --scaleFactors 1:1 --operation subtract " \
               "-o {} -p 1 --outFileFormat bedgraph --skipNAs".format(A, B, outfile).split()
        bam_comp.main(args)

        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()
        expected = ['3R\t100\t150\t0\n', '3R\t150\t200\t-1\n']
        assert_equal(resp, expected)
        unlink(outfile)


def test_bam_compare_extend():
    """
    Test read extension
    """
    outfile = '/tmp/test_file.bg'
    for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]:
        args = "--bamfile1 {} --bamfile2 {} --extend 100 --scaleFactors 1:1 --operation subtract " \
               "-o {} -p 1 --outFileFormat bedgraph".format(A, B, outfile).split()
        bam_comp.main(args)

        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()
        expected = ['3R\t0\t100\t-1\n', '3R\t100\t150\t1\n', '3R\t150\t200\t-1\n']
        assert_equal(resp, expected)
        unlink(outfile)


def test_bam_compare_scale_factors_ratio():
    """
    Test scale factor
    """
    outfile = '/tmp/test_file.bg'
    for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]:
        args = "--bamfile1 {} --bamfile2 {} --operation ratio --ignoreForNormalization chr_cigar " \
               "-o {} -p 1 --outFileFormat bedgraph".format(A, B, outfile).split()
        bam_comp.main(args)

        # The scale factors are [ 1.   0.5] because BAMFILE_B has double the amount of reads (4) compared to BAMFILE_A

        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()

        """
        The distribution of reads for the bam file is:

                      0                              100                           200
                      |------------------------------------------------------------|
        testA.bam  3R                                ==============>
                                                                    <==============


        testB.bam  3R                 <==============               ==============>
                                                     ==============>
                                                                    ==============>

        ------------------------------------------------------------------------------

        ratio:             0      (0+1)/(1*0.5+1)=0.67             (1+1)/(1+2*0.5)=1
        (scale factors [1,0.5])                   (1+1)/(1+1*0.5)=1.33
        """

        expected = ['3R\t0\t50\t1\n', '3R\t50\t100\t0.666667\n', '3R\t100\t150\t1.33333\n', '3R\t150\t200\t1\n']
        assert_equal(resp, expected)
        unlink(outfile)


def test_bam_compare_scale_factors_subtract():
    """
    Test scale factor
    """
    outfile = '/tmp/test_file.bg'
    for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]:
        args = "--bamfile1 {} --bamfile2 {} --operation subtract --ignoreForNormalization chr_cigar " \
               "-o {} -p 1 --outFileFormat bedgraph --scaleFactorsMethod None --normalizeUsing CPM".format(A, B, outfile).split()
        bam_comp.main(args)

        # The scale factors are [ 1.   0.5] because BAMFILE_B has dowble the amount of reads (4) compared to BAMFILE_A

        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()

        """
        The distribution of reads for the bam file is:

                      0                              100                           200
                      |------------------------------------------------------------|
        testA.bam  3R                                ==============>
                                                                    <==============


        testB.bam  3R                 <==============               ==============>
                                                     ==============>
                                                                    ==============>

        ------------------------------------------------------------------------------

        subtract: After applying CPM normalization, the scale factors are [500000,250000]

        after applying factors:    0         -25k              25k              0

        """

        expected = ['3R\t0\t50\t0\n', '3R\t50\t100\t-250000\n', '3R\t100\t150\t250000\n', '3R\t150\t200\t0\n']
        assert_equal(resp, expected)
        unlink(outfile)


def test_bam_coverage_filter_blacklist():
    """
    Test --samFlagInclude --samFlagExclude --minMappingQuality --ignoreDuplicates and --blackListFileName
    """
    outfile = '/tmp/test_file_filter.bg'
    for fname in [BAMFILE_FILTER1, CRAMFILE_FILTER1]:
        args = "--bam {} --normalizeUsing RPGC --effectiveGenomeSize 1400 -p 1 -o {} -of bedgraph --samFlagInclude 512 " \
               "--samFlagExclude 256 --minMappingQuality 5 --ignoreDuplicates " \
               "--blackListFileName {}".format(fname, outfile, BEDFILE_FILTER)
        args = args.split()
        bam_cov.main(args)

        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()

        expected = ['3R\t0\t100\t0\n', '3R\t100\t150\t1.42338\n',
                    '3R\t150\t250\t4.88017\n', '3R\t250\t300\t3.05011\n',
                    '3R\t300\t400\t2.23675\n', '3R\t400\t450\t3.86347\n',
                    '3R\t450\t500\t4.06681\n', '3R\t500\t550\t2.03341\n',
                    '3R\t550\t600\t2.44009\n', '3R\t600\t650\t4.47349\n',
                    '3R\t650\t700\t3.45679\n', '3R\t700\t750\t3.66013\n',
                    '3R\t750\t800\t4.06681\n', '3R\t900\t950\t2.44009\n',
                    '3R\t950\t1000\t1.62672\n', '3R\t1000\t1050\t0.813362\n',
                    '3R\t1050\t1500\t0\n']

        assert_equal(resp, expected)
        unlink(outfile)


def test_bam_coverage_offset1():
    """
    Test -bs 1 --Offset 1
    """
    outfile = '/tmp/test_offset.bw'
    for fname in [BAMFILE_A, CRAMFILE_A]:
        args = "--Offset 1 --bam {} -p 1 -bs 1 -o {}".format(fname, outfile)
        args = args.split()
        bam_cov.main(args)
        try:
            # python 3 only
            filecmp.clear_cache()
        except:
            pass
        assert(filecmp.cmp(outfile, "{}testA_offset1.bw".format(ROOT)) is True)
        unlink(outfile)


def test_bam_coverage_offset1_10():
    """
    Test -bs 1 --Offset 1 10
    """
    outfile = '/tmp/test_offset.bw'
    for fname in [BAMFILE_A, CRAMFILE_A]:
        args = "--Offset 1 10 -b {} -p 1 -bs 1 -o {}".format(fname, outfile)
        args = args.split()
        bam_cov.main(args)
        try:
            # python 3 only
            filecmp.clear_cache()
        except:
            pass
        assert(filecmp.cmp(outfile, "{}testA_offset1_10.bw".format(ROOT)) is True)
        unlink(outfile)


def test_bam_coverage_offset_minus1():
    """
    Test -bs 1 --Offset -1
    """
    outfile = '/tmp/test_offset.bw'
    for fname in [BAMFILE_A, CRAMFILE_A]:
        args = "--Offset -1 -b {} -p 1 -bs 1 -o {}".format(fname, outfile)
        args = args.split()
        bam_cov.main(args)
        try:
            # python 3 only
            filecmp.clear_cache()
        except:
            pass
        assert(filecmp.cmp(outfile, "{}testA_offset-1.bw".format(ROOT)) is True)
        unlink(outfile)


def test_bam_coverage_offset20_minus4():
    """
    Test -bs 1 --Offset 20 -4
    """
    outfile = '/tmp/test_offset.bw'
    for fname in [BAMFILE_A, CRAMFILE_A]:
        args = "--Offset 20 -4 -b {} -p 1 -bs 1 -o {}".format(fname, outfile)
        args = args.split()
        bam_cov.main(args)
        try:
            # python 3 only
            filecmp.clear_cache()
        except:
            pass
        assert(filecmp.cmp(outfile, "{}testA_offset20_-4.bw".format(ROOT)) is True)
        unlink(outfile)


def test_bam_compare_filter_blacklist():
    """
    Test --samFlagInclude --samFlagExclude --minMappingQuality --ignoreDuplicates and --blackListFileName
    """
    outfile = '/tmp/test_file_filter.bg'
    for A, B in [(BAMFILE_FILTER1, BAMFILE_FILTER2), (CRAMFILE_FILTER1, CRAMFILE_FILTER2)]:
        args = "-b1 {} -b2 {} -p 1 -o {} -of bedgraph --samFlagInclude 512 " \
               "--samFlagExclude 256 --minMappingQuality 5 --ignoreDuplicates " \
               "--blackListFileName {}".format(A, B, outfile, BEDFILE_FILTER)
        args = args.split()
        bam_comp.main(args)

        _foo = open(outfile, 'r')
        resp = _foo.readlines()
        _foo.close()
        expected = ['3R\t0\t100\t0\n', '3R\t100\t150\t-0.220909\n',
                    '3R\t150\t200\t-0.159356\n', '3R\t200\t250\t-0.0718929\n',
                    '3R\t250\t300\t0.135883\n', '3R\t300\t350\t0.103093\n',
                    '3R\t350\t400\t-0.0895516\n', '3R\t400\t450\t0.0308374\n',
                    '3R\t450\t500\t0.0989418\n', '3R\t500\t550\t0.207044\n',
                    '3R\t550\t600\t0.0198996\n', '3R\t600\t650\t-0.0957241\n',
                    '3R\t650\t700\t0.00968255\n', '3R\t700\t750\t-0.040642\n',
                    '3R\t750\t800\t-0.123451\n', '3R\t900\t950\t0.212545\n',
                    '3R\t950\t1000\t0.199309\n', '3R\t1000\t1050\t0.167945\n',
                    '3R\t1050\t1500\t0\n']
        assert_equal(resp, expected)
        unlink(outfile)