File: callpeak_cmd.py

package info (click to toggle)
macs 3.0.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 378,728 kB
  • sloc: ansic: 5,879; python: 4,342; sh: 451; makefile: 83
file content (390 lines) | stat: -rw-r--r-- 17,848 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
# Time-stamp: <2020-11-28 17:06:30 Tao Liu>

"""Description: MACS 3 call peak main executable

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""

# ------------------------------------
# python modules
# ------------------------------------

import os
import sys
from time import strftime
import tempfile

# ------------------------------------
# MACS3 python modules
# ------------------------------------
from MACS3.Utilities.Constants import *
from MACS3.Utilities.OptValidator import opt_validate_callpeak
from MACS3.Signal.Prob import binomial_cdf_inv
from MACS3.Signal.PeakModel import PeakModel,NotEnoughPairsException
from MACS3.Signal.PeakDetect import PeakDetect
from MACS3.IO.OutputWriter import model2r_script
# ------------------------------------
# Main function
# ------------------------------------
def _format_chr_names(names):
    """Render chromosome names as a sorted, comma-separated str.

    Names are accepted either as str or as bytes; bytes are decoded so the
    result can always be interpolated into a log message.
    """
    return ",".join(
        name.decode(errors="replace") if isinstance(name, bytes) else str(name)
        for name in sorted(names)
    )


def check_names(treat, control, error_stream):
    """Check that treatment and control share at least one chromosome name.

    Arguments:
    treat        -- object whose get_chr_names() yields the treatment names.
    control      -- object whose get_chr_names() yields the control names.
    error_stream -- callable taking one message string; used to report the
                    problem before exiting.

    Returns None when at least one name is shared.  Otherwise four messages
    are sent to error_stream and the process is terminated through
    sys.exit(1), so that the failure is visible in the exit status.
    """
    tchrnames = set(treat.get_chr_names())
    cchrnames = set(control.get_chr_names())
    if not tchrnames.isdisjoint(cchrnames):
        return

    error_stream("No common chromosome names can be found from treatment and control!")
    error_stream("Please make sure that the treatment and control alignment files were generated by using the same genome assembly!")
    error_stream("Chromosome names in treatment: %s" % _format_chr_names(tchrnames))
    error_stream("Chromosome names in control: %s" % _format_chr_names(cchrnames))
    # This is a fatal error: exit with a non-zero status, like the other
    # fatal exit in this module does.
    sys.exit(1)

def run( args ):
    """The Main function/pipeline for MACS callpeak.

    The pipeline runs four numbered stages, mirrored in the log output:

      #1 read the treatment (and optional control) files and filter
         duplicates,
      #2 build the peak model, or take the fragment length from the
         options when model building is disabled (options.nomodel),
      #3 balance treatment/control depth and call peaks,
      #4 write the XLS report and either the narrowPeak + summits files
         or the broadPeak + gappedPeak files.

    Arguments:
    args -- command line arguments object; it is handed to
            opt_validate_callpeak(), which returns the options object used
            throughout this function.

    Returns nothing.  Calls sys.exit(1) when the peak model cannot be built
    and options.onauto is not set, and when no score column can be chosen
    for the peak files.  check_names() may also terminate the process.
    """
    # Parse options...
    options = opt_validate_callpeak( args )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    #0 output arguments
    info("\n"+options.argtxt)
    options.PE_MODE = options.format in ('BAMPE','BEDPE')
    if options.PE_MODE:
        tag = 'fragment' # call things fragments not tags
    else:
        tag = 'tag'

    tempfile.tempdir = options.tempdir

    #1 Read tag files
    info("#1 read %s files...", tag)
    if options.PE_MODE:
        (treat, control) = load_frag_files_options (options)
    else:
        (treat, control) = load_tag_files_options  (options)
    if control is not None:
        # check if chromosome names are consistent. quit if not.
        check_names(treat, control, error)

    info("#1 %s size = %.1f", tag, options.tsize)
    # tagsinfo accumulates the "# ..." header lines written to the XLS file in step #4
    tagsinfo  = "# %s size is determined as %d bps\n" % (tag, options.tsize)

    t0 = treat.total
    tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0)
    info("#1  total %ss in treatment: %d", tag, t0)

    # handle duplicates
    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag)
            treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0)
            info("#1  max_dup_tags based on binomial = %d" % (treatment_max_dup_tags))
        else:
            info("#1 user defined the maximum %ss...", tag)
            treatment_max_dup_tags = int(options.keepduplicates)
        if options.PE_MODE:
            info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
        else:
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)

        treat.filter_dup(treatment_max_dup_tags)
        t1 = treat.total
        info("#1  %ss after filtering in treatment: %d", tag, t1)
        tagsinfo += "# %ss after filtering in treatment: %d\n" % (tag, t1)
        if options.PE_MODE:
            tagsinfo += "# maximum duplicate fragments in treatment = %d\n" % (treatment_max_dup_tags)
        else:
            tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags)
        info("#1  Redundant rate of treatment: %.2f", float(t0 - t1) / t0)
        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0)
    else:
        t1 = t0

    if control is not None:
        c0 = control.total
        tagsinfo += "# total %ss in control: %d\n" % (tag, c0)
        info("#1  total %ss in control: %d", tag, c0)

        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("#1  for control, calculate max duplicate %ss in single position based on binomial distribution...", tag)
                control_max_dup_tags = cal_max_dup_tags(options.gsize,c0)
                info("#1  max_dup_tags based on binomial = %d" % (control_max_dup_tags))
            else:
                info("#1 user defined the maximum %ss...", tag)
                control_max_dup_tags = int(options.keepduplicates)
            # NOTE(review): control_max_dup_tags is computed and logged above,
            # but the control is filtered (and reported) with the *treatment*
            # threshold below.  Kept as-is because changing it would alter
            # results -- confirm whether this is intentional.
            if options.PE_MODE:
                info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
            else:
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
            control.filter_dup(treatment_max_dup_tags)
            #control.separate_dups(treatment_max_dup_tags) # changed 5-29; changed back since we don't need to call addbackdup+refinepeak anymore
            c1 = control.total

            info("#1  %ss after filtering in control: %d", tag, c1)
            tagsinfo += "# %ss after filtering in control: %d\n" % (tag, c1)
            if options.PE_MODE:
                tagsinfo += "# maximum duplicate fragments in control = %d\n" % (treatment_max_dup_tags)
            else:
                tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (treatment_max_dup_tags)

            info("#1  Redundant rate of control: %.2f" % (float(c0-c1)/c0))
            tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0-c1)/c0)
        else:
            c1 = c0
    info("#1 finished!")

    #2 Build Model
    info("#2 Build Peak Model...")

    # Stays None when model building is skipped, so that step #4 can tell
    # whether alternative fragment lengths are available.
    peakmodel = None

    if options.nomodel:
        info("#2 Skipped...")
        if options.PE_MODE:
            options.d = options.tsize
        else:
            options.d=options.extsize
            info("#2 Use %d as fragment length" % (options.d))
        if options.shift > 0:
            info("#2 Sequencing ends will be shifted towards 3' by %d bp(s)" % (options.shift))
        elif options.shift < 0:
            info("#2 Sequencing ends will be shifted towards 5' by %d bp(s)" % (options.shift * -1))
        options.scanwindow=2*options.d  # remove the effect of --bw
    else:
        peakmodel = PeakModel(treatment = treat,
                              max_pairnum = MAX_PAIRNUM,
                              opt = options
                              )
        try:
            peakmodel.build()
            info("#2 finished!")
            debug("#2  Summary Model:")
            debug("#2   min_tags: %d" % (peakmodel.min_tags))
            debug("#2   d: %d" % (peakmodel.d))
            debug("#2   scan_window: %d" % (peakmodel.scan_window))
            info("#2 predicted fragment length is %d bps" % peakmodel.d)
            info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d)))
            info("#2.2 Generate R script for model : %s" % (options.modelR))
            model2r_script(peakmodel,options.modelR,options.name)
            options.d = peakmodel.d
            options.scanwindow= 2*options.d
            if options.d <= 2*options.tsize:
                warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options.d))
                if options.onauto:
                    options.d=options.extsize
                    options.scanwindow=2*options.d
                    warn("#2 MACS will use %d as EXTSIZE/fragment length d. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.d))
                else:
                    warn("#2 You may need to consider one of the other alternative d(s): %s" %  ','.join(map(str,peakmodel.alternative_d)))
                    warn("#2 You can restart the process with --nomodel --extsize XXX with your choice or an arbitrary number. Nontheless, MACS will continute computing.")

        except NotEnoughPairsException:
            # without options.onauto there is no fallback fragment length: give up
            if not options.onauto:
                sys.exit(1)
            warn("#2 Skipped...")
            options.d=options.extsize
            options.scanwindow=2*options.d
            warn("#2 Since --fix-bimodal is set, MACS will use %d as fragment length" % (options.d))

    #3 Call Peaks
    info("#3 Call peaks...")
    if options.nolambda:
        info("# local lambda is disabled!")

    if control and options.PE_MODE:
        c1 = c1 * 2             # in PE_MODE, PE data has to be doubled since both ends will be counted for calculating background noise.

    # decide the scaling to balance the depth between treatment and control
    if control:
        if options.downsample:
            # use random sampling to balance treatment and control
            info("#3 User prefers to use random sampling instead of linear scaling.")
            if t1 > c1:
                info("#3 MACS is random sampling treatment %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                treat.sample_num(c1, options.seed)
                info("#3 %d Tags from treatment are kept", treat.total)
            elif c1 > t1:
                info("#3 MACS is random sampling control %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                control.sample_num(t1, options.seed)
                info("#3 %d %ss from control are kept", control.total, tag)
            # set options.tocontrol although it wouldn't matter now
            options.tocontrol = False
        else:
            if options.scaleto == "large":
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # true, we will scale control to treatment.
                    options.tocontrol = False
                else:
                    # treatment has less tags than control, since tolarge is
                    # true, we will scale treatment to control.
                    options.tocontrol = True
            else:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # false, we will scale treatment to control.
                    options.tocontrol = True
                else:
                    # treatment has less tags than control, since tolarge is
                    # false, we will scale control to treatment.
                    options.tocontrol = False

    peakdetect = PeakDetect(treat = treat,
                            control = control,
                            opt = options
                            )
    peakdetect.call_peaks()

    # filter out low FE peaks
    peakdetect.peaks.filter_fc( fc_low = options.fecutoff )

    #4 output
    #4.1 peaks in XLS
    info("#4 Write output xls file... %s" % (options.peakxls))
    # 'with' makes sure the file is closed even when one of the writes fails
    with open( options.peakxls, "w" ) as ofhd_xls:
        ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION))
        ofhd_xls.write(options.argtxt+"\n")
        ofhd_xls.write(tagsinfo)
        if options.shift > 0:
            ofhd_xls.write("# Sequencing ends will be shifted towards 3' by %d bp(s)\n" % (options.shift))
        elif options.shift < 0:
            ofhd_xls.write("# Sequencing ends will be shifted towards 5' by %d bp(s)\n" % (options.shift * -1))

        ofhd_xls.write("# d = %d\n" % (options.d))
        # There is no peakmodel object when model building was skipped, and a
        # model whose build failed may not carry alternative_d; in both cases
        # simply skip this line.
        alternative_d = getattr(peakmodel, "alternative_d", None)
        if alternative_d is not None:
            ofhd_xls.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str,alternative_d)))
        if options.nolambda:
            ofhd_xls.write("# local lambda is disabled!\n")
        # pass write method so we can print too, and include name
        peakdetect.peaks.write_to_xls(ofhd_xls, name = options.name.encode())

    #4.2 choose the score column for the peak files: p-value based when a
    # p-value cutoff is in use, q-value based otherwise
    if options.log_pvalue is not None:
        score_column = "pscore"
    elif options.log_qvalue is not None:
        score_column = "qscore"
    else:
        # fail with a clear message instead of a NameError further down
        error("Neither a p-value nor a q-value cutoff is available; cannot decide the score column for peak output!")
        sys.exit(1)
    #4.2 peaks in narrowPeak
    if not options.broad:
        info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak))
        with open( options.peakNarrowPeak, "w" ) as ofhd_bed:
            peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), score_column=score_column, trackline=options.trackline )
        #4.2-2 summits in BED
        info("#4 Write summits bed file... %s" % (options.summitbed))
        # NOTE(review): the description below still says "MACS v2"; left
        # untouched because it is part of the emitted output.
        with open( options.summitbed, "w" ) as ofhd_summits:
            peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="%s_peak_".encode(), name=options.name.encode(),
                                                  description=("Summits for %s (Made with MACS v2, " + strftime("%x") + ")").encode(),
                                                  score_column=score_column, trackline=options.trackline )
    #4.2 broad peaks in bed12 or gappedPeak
    else:
        info("#4 Write broad peak in broadPeak format file... %s" % (options.peakBroadPeak))
        with open( options.peakBroadPeak, "w" ) as ofhd_bed:
            peakdetect.peaks.write_to_broadPeak (ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline)
        info("#4 Write broad peak in bed12/gappedPeak format file... %s" % (options.peakGappedPeak))
        with open( options.peakGappedPeak, "w" ) as ofhd_bed:
            peakdetect.peaks.write_to_gappedPeak (ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline)

    info("Done!")

def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ):
    """Return the maximum number of duplicated tags allowed at one position.

    The value is whatever binomial_cdf_inv() returns for the cumulative
    probability 1-p, with tags_number trials and a per-trial probability
    of 1/genome_size.

    Arguments:
    genome_size -- genome size; its reciprocal is used as the probability.
    tags_number -- total number of tags, used as the number of trials.
    p           -- p-value cutoff (default 1e-5).
    """
    cumulative_prob = 1-p
    per_position_prob = 1.0/genome_size
    return binomial_cdf_inv(cumulative_prob, tags_number, per_position_prob)

def load_frag_files_options ( options ):
    """Load the treatment fragments and, if given, the control fragments.

    Every file in options.tfile is read with options.parser into a single
    track, which is then finalized.  options.tsize is set from the `d`
    attribute of the parser created last.  When options.cfile is set the
    control files are read the same way.

    Returns a (treat, control) tuple; control is None when no control
    file was given.
    """
    options.info("#1 read treatment fragments...")

    treat_paths = options.tfile
    parser = options.parser(treat_paths[0], buffer_size=options.buffer_size)
    treat = parser.build_petrack()
    # With a single input file the slice is empty and nothing is appended.
    for extra_path in treat_paths[1:]:
        parser = options.parser(extra_path, buffer_size=options.buffer_size)
        treat = parser.append_petrack( treat )
    treat.finalize()

    # `parser` is the one created for the last treatment file.
    options.tsize = parser.d

    control = None
    if options.cfile:
        options.info("#1.2 read input fragments...")
        control_paths = options.cfile
        first_control_parser = options.parser(control_paths[0], buffer_size=options.buffer_size)
        control = first_control_parser.build_petrack()
        # Only the first control file contributes to the reported size.
        control_d = first_control_parser.d
        for extra_path in control_paths[1:]:
            extra_parser = options.parser(extra_path, buffer_size=options.buffer_size)
            control = extra_parser.append_petrack( control )
        control.finalize()

    options.info("#1 mean fragment size is determined as %.1f bp from treatment" % options.tsize)
    if control is not None:
        options.info("#1 note: mean fragment size in control is %.1f bp -- value ignored" % control_d)
    return (treat, control)

def load_tag_files_options ( options ):
    """Load the treatment tags and, if given, the control tags.

    Every file in options.tfile is read with options.parser into a single
    track, which is then finalized.  When options.tsize is not already
    set, it is taken from the tsize() method of the first treatment
    parser.  When options.cfile is set the control files are read the
    same way.

    Returns a (treat, control) tuple; control is None when no control
    file was given.
    """
    options.info("#1 read treatment tags...")
    treat_paths = options.tfile
    first_parser = options.parser(treat_paths[0], buffer_size=options.buffer_size)
    # Keep a tag size that is already set; otherwise ask the parser for it.
    if not options.tsize:
        options.tsize = first_parser.tsize()
    treat = first_parser.build_fwtrack()
    # With a single input file the slice is empty and nothing is appended.
    for extra_path in treat_paths[1:]:
        extra_parser = options.parser(extra_path, buffer_size=options.buffer_size)
        treat = extra_parser.append_fwtrack( treat )
    treat.finalize()

    control = None
    if options.cfile:
        options.info("#1.2 read input tags...")
        control_paths = options.cfile
        first_control_parser = options.parser(control_paths[0], buffer_size=options.buffer_size)
        control = first_control_parser.build_fwtrack()
        for extra_path in control_paths[1:]:
            extra_parser = options.parser(extra_path, buffer_size=options.buffer_size)
            control = extra_parser.append_fwtrack( control )
        control.finalize()

    options.info("#1 tag size is determined as %d bps" % options.tsize)
    return (treat, control)