File: bamCompare.py

package info (click to toggle)
python-deeptools 3.5.6%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 34,456 kB
  • sloc: python: 14,503; xml: 4,212; sh: 33; makefile: 5
file content (314 lines) | stat: -rw-r--r-- 14,287 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import argparse  # to parse command line arguments
import numpy as np
import sys

# my packages
from deeptools import writeBedGraph
from deeptools.SES_scaleFactor import estimateScaleFactor
from deeptools import parserCommon
from deeptools import bamHandler
from deeptools.getRatio import getRatio
from deeptools.getScaleFactor import get_num_kept_reads
from deeptools.getScaleFactor import get_scale_factor
debug = 0
old_settings = np.seterr(all='ignore')


def parseArguments():
    parentParser = parserCommon.getParentArgParse()
    bamParser = parserCommon.read_options()
    normalizationParser = parserCommon.normalization_options()
    requiredArgs = getRequiredArgs()
    optionalArgs = getOptionalArgs()
    outputParser = parserCommon.output()
    parser = argparse.ArgumentParser(
        parents=[requiredArgs, outputParser, optionalArgs,
                 parentParser, normalizationParser, bamParser],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='This tool compares two BAM files based on the number of '
        'mapped reads. To compare the BAM files, the genome is partitioned '
        'into bins of equal size, then the number of reads found in each bin'
        ' is counted per file, and finally a summary value is '
        'reported. This value can be the ratio of the number of reads per '
        'bin, the log2 of the ratio, or the difference. This tool can '
        'normalize the number of reads in each BAM file using the SES method '
        'proposed by Diaz et al. (2012) "Normalization, bias correction, and '
        'peak calling for ChIP-seq". Statistical Applications in Genetics '
        'and Molecular Biology, 11(3). Normalization based on read counts '
        'is also available. The output is either a bedgraph or bigWig file '
        'containing the bin location and the resulting comparison value. '
        'Note that *each end* in a pair (for paired-end reads) is treated '
        'independently. If this is undesirable, then use the --samFlagInclude '
        'or --samFlagExclude options.',

        usage='bamCompare -b1 treatment.bam -b2 control.bam -o log2ratio.bw\n'
        'help: bamCompare -h / bamCompare --help',

        add_help=False)

    return parser


def getRequiredArgs():
    parser = argparse.ArgumentParser(add_help=False)

    required = parser.add_argument_group('Required arguments')

    # define the arguments
    required.add_argument('--bamfile1', '-b1',
                          metavar='BAM file',
                          help='Sorted BAM file 1. Usually the BAM file '
                          'for the treatment.',
                          required=True)

    required.add_argument('--bamfile2', '-b2',
                          metavar='BAM file',
                          help='Sorted BAM file 2. Usually the BAM '
                          'file for the control.',
                          required=True)

    return parser


def getOptionalArgs():

    parser = argparse.ArgumentParser(add_help=False)
    optional = parser.add_argument_group('Optional arguments')

    optional.add_argument("--help", "-h", action="help",
                          help="show this help message and exit")

    optional.add_argument('--scaleFactorsMethod',
                          help='Method to use to scale the samples. '
                          'If a method is specified, then it will be used to compensate '
                          'for sequencing depth differences between the samples. '
                          'As an alternative, this can be set to None and an option from '
                          '--normalizeUsing <method> can be used. (Default: %(default)s)',
                          choices=['readCount', 'SES', 'None'],
                          default='readCount')

    optional.add_argument('--sampleLength', '-l',
                          help='*Only relevant when SES is chosen for the '
                          'scaleFactorsMethod.* To compute the SES, specify '
                          'the length (in bases) of the regions (see --numberOfSamples) '
                          'that will be randomly sampled to calculate the scaling factors. '
                          'If you do not have a good sequencing depth for '
                          'your samples consider increasing the sampling '
                          'regions\' size to minimize the probability '
                          'that zero-coverage regions are used. (Default: %(default)s)',
                          default=1000,
                          type=int)

    optional.add_argument('--numberOfSamples', '-n',
                          help='*Only relevant when SES is chosen for the '
                          'scaleFactorsMethod.* Number of samplings taken '
                          'from the genome to compute the scaling factors. (Default: %(default)s)',
                          default=1e5,
                          type=int)

    optional.add_argument('--scaleFactors',
                          help='Set this parameter manually to avoid the computation of '
                          'scaleFactors. The format is scaleFactor1:scaleFactor2.'
                          'For example, --scaleFactor 0.7:1 will cause the first BAM file to'
                          'be multiplied by 0.7, while not scaling '
                          'the second BAM file (multiplication with 1).',
                          default=None,
                          required=False)

    optional.add_argument('--operation',
                          help='The default is to output the log2 ratio of the '
                          'two samples. The reciprocal ratio returns the '
                          'the negative of the inverse of the ratio '
                          'if the ratio is less than 0. The resulting '
                          'values are interpreted as negative fold changes. '
                          'Instead of performing a computation using both files, the scaled signal can '
                          'alternatively be output for the first or second file using '
                          'the \'--operation first\' or \'--operation second\'. (Default: %(default)s)',
                          default='log2',
                          choices=['log2', 'ratio', 'subtract', 'add', 'mean',
                                   'reciprocal_ratio', 'first', 'second'],
                          required=False)

    optional.add_argument('--pseudocount',
                          help='A small number to avoid x/0. Only useful '
                          'together with --operation log2 or --operation ratio. '
                          'You can specify different values as pseudocounts for '
                          'the numerator and the denominator by providing two '
                          'values (the first value is used as the numerator '
                          'pseudocount and the second the denominator pseudocount). (Default: %(default)s)',
                          default=[1],
                          type=float,
                          nargs='+',
                          action=parserCommon.requiredLength(1, 2),
                          required=False)

    optional.add_argument('--skipZeroOverZero',
                          help='Skip bins where BOTH BAM files lack coverage. '
                          'This is determined BEFORE any applicable pseudocount '
                          'is added.',
                          action='store_true')

    return parser


def process_args(args=None):
    args = parseArguments().parse_args(args)

    if args.smoothLength and args.smoothLength <= args.binSize:
        print("Warning: the smooth length given ({}) is smaller than the bin "
              "size ({}).\n\n No smoothing will be "
              "done".format(args.smoothLength,
                            args.binSize))
        args.smoothLength = None

    if not args.ignoreForNormalization:
        args.ignoreForNormalization = []

    if not isinstance(args.pseudocount, list):
        args.pseudocount = [args.pseudocount]

    if len(args.pseudocount) == 1:
        args.pseudocount *= 2

    return args

# get_scale_factors function is used for scaling in bamCompare
# while get_scale_factor is used for depth normalization


def get_scale_factors(args, statsList, mappedList):

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength, args.numberOfSamples,
            1,
            mappingStatsList=mappedList,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'],
                   args.sampleLength))

            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(
                float(min(mappedList)) / np.array(mappedList)))

    elif args.scaleFactorsMethod == 'readCount':
        # change the scaleFactor to 1.0
        args.scaleFactor = 1.0
        # get num of kept reads for bam file 1
        args.bam = args.bamfile1
        bam1_mapped, _ = get_num_kept_reads(args, statsList[0])
        # get num of kept reads for bam file 2
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args, statsList[1])

        mapped_reads = [bam1_mapped, bam2_mapped]

        # new scale_factors (relative to min of two bams)
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(mapped_reads)
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    elif args.scaleFactorsMethod == 'None':
        scale_factors = None

    return scale_factors


def main(args=None):
    """
    The algorithm is composed of two steps.


    1. Per-sample scaling / depth Normalization:
     + If scaling is used (using the SES or read counts method), appropriate scaling
       factors are determined to account for sequencing depth differences.
     + Optionally scaling can be turned off and individual samples could be depth normalized using
       RPKM, BPM or CPM methods

    2. Ratio calculation between two bam files:
     + The genome is transversed and computing
       the log ratio/ratio/difference etc. for bins of fixed width
       given by the user.

    """
    args = process_args(args)

    if args.normalizeUsing == "RPGC":
        sys.exit("RPGC normalization (--normalizeUsing RPGC) is not supported with bamCompare!")
    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity
    if args.scaleFactorsMethod != 'None' and args.normalizeUsing:
        sys.exit("`--normalizeUsing {}` is only valid if you also use `--scaleFactorsMethod None`! To prevent erroneous output, I will quit now.\n".format(args.normalizeUsing))

    # Get mapping statistics
    bam1, mapped1, unmapped1, stats1 = bamHandler.openBam(args.bamfile1, returnStats=True, nThreads=args.numberOfProcessors)
    bam1.close()
    bam2, mapped2, unmapped2, stats2 = bamHandler.openBam(args.bamfile2, returnStats=True, nThreads=args.numberOfProcessors)
    bam2.close()

    scale_factors = get_scale_factors(args, [stats1, stats2], [mapped1, mapped2])
    if scale_factors is None:
        # check whether one of the depth norm methods are selected
        if args.normalizeUsing is not None:
            args.scaleFactor = 1.0
            # if a normalization is required then compute the scale factors
            args.bam = args.bamfile1
            scale_factor_bam1 = get_scale_factor(args, stats1)
            args.bam = args.bamfile2
            scale_factor_bam2 = get_scale_factor(args, stats2)
            scale_factors = [scale_factor_bam1, scale_factor_bam2]
        else:
            scale_factors = [1, 1]

    if args.verbose:
        print("Individual scale factors are {0}".format(scale_factors))

    # the getRatio function is called and receives
    # the func_args per each tile that is considered
    FUNC = getRatio
    func_args = {'valueType': args.operation,
                 'scaleFactors': scale_factors,
                 'pseudocount': args.pseudocount
                 }

    wr = writeBedGraph.WriteBedGraph([args.bamfile1, args.bamfile2], args.binSize, 0,
                                     stepSize=args.binSize,
                                     region=args.region,
                                     numberOfProcessors=args.numberOfProcessors,
                                     extendReads=args.extendReads,
                                     blackListFileName=args.blackListFileName,
                                     minMappingQuality=args.minMappingQuality,
                                     ignoreDuplicates=args.ignoreDuplicates,
                                     center_read=args.centerReads,
                                     zerosToNans=args.skipNonCoveredRegions,
                                     skipZeroOverZero=args.skipZeroOverZero,
                                     samFlag_include=args.samFlagInclude,
                                     samFlag_exclude=args.samFlagExclude,
                                     minFragmentLength=args.minFragmentLength,
                                     maxFragmentLength=args.maxFragmentLength,
                                     chrsToSkip=args.ignoreForNormalization,
                                     verbose=args.verbose
                                     )

    wr.run(FUNC, func_args, args.outFileName, blackListFileName=args.blackListFileName, format=args.outFileFormat, smoothLength=args.smoothLength)


if __name__ == "__main__":
    main()