File: split_libraries_fastq.py

package info (click to toggle)
qiime 1.8.0%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 130,508 kB
  • ctags: 10,145
  • sloc: python: 110,826; haskell: 379; sh: 169; makefile: 125
file content (360 lines) | stat: -rwxr-xr-x 16,967 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
#!/usr/bin/env python
# File created on 07 Jun 2011
from __future__ import division

__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso", "Emily TerAvest"]
__license__ = "GPL"
__version__ = "1.8.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"

from os import rename

from cogent import DNA
from cogent.util.misc import safe_md5, create_dir
from qiime.util import parse_command_line_parameters, make_option, gzip_open
from qiime.parse import parse_mapping_file
from qiime.split_libraries_fastq import (process_fastq_single_end_read_file,
    BARCODE_DECODER_LOOKUP, process_fastq_single_end_read_file_no_barcode)
from qiime.split_libraries import check_map
from qiime.split_libraries_fastq import get_illumina_qual_chars
from qiime.golay import get_invalid_golay_barcodes
from qiime.quality import phred_to_ascii33, phred_to_ascii64

phred_to_ascii_fs = {'33':phred_to_ascii33, '64':phred_to_ascii64}

script_info = {}

script_info['brief_description'] = ("This script performs demultiplexing of "
    "Fastq sequence data where barcodes and sequences are contained in two "
    "separate fastq files (common on Illumina runs).")

script_info['script_description'] = ""

script_info['script_usage'] = []

script_info['script_usage'].append(("Demultiplex and quality filter "
    "(at Phred >= Q20) one lane of Illumina fastq data and write results to "
    "./slout_q20.", "", "%prog -i lane1_read1.fastq.gz "
    "-b lane1_barcode.fastq.gz --rev_comp_mapping_barcodes -o slout_q20/ "
    "-m map.txt -q 19"))

script_info['script_usage'].append(("Demultiplex and quality filter "
    "(at Phred >= Q20) one lane of Illumina fastq data and write results to "
    "./slout_q20. Store trimmed quality scores in addition to sequence data.",
    "", "%prog -i lane1_read1.fastq.gz -b lane1_barcode.fastq.gz "
    "--rev_comp_mapping_barcodes -o slout_q20/ -m map.txt "
    "--store_qual_scores -q 19"))

script_info['script_usage'].append(("Demultiplex and quality filter "
    "(at Phred >= Q20) two lanes of Illumina fastq data and write results to "
    "./slout_q20.", "", "%prog -i lane1_read1.fastq.gz,lane2_read1.fastq.gz "
    "-b lane1_barcode.fastq.gz,lane2_barcode.fastq.gz "
    "--rev_comp_mapping_barcodes -o slout_q20/ -m map.txt,map.txt "
    "--store_qual_scores -q 19"))

script_info['script_usage'].append(("Quality filter (at Phred >= Q20) one "
    "non-multiplexed lane of Illumina fastq data and write results "
    "to ./slout_single_sample_q20.", "", "%prog -i lane1_read1.fastq.gz "
    "--sample_id my.sample -o slout_single_sample_q20/ "
    "-m map_not_multiplexed.txt  -q 19 --barcode_type 'not-barcoded'"))

script_info['script_usage'].append(("Quality filter (at Phred >= Q20) one "
    "non-multiplexed lane of Illumina fastq data and write results "
    "to ./slout_single_sample_q20.", "", "%prog -i lane1_read1.fastq.gz "
    "--sample_id my.sample.1 -o slout_single_sample_q20/ "
    "-m map_not_multiplexed.txt -q 19 --barcode_type 'not-barcoded'"))

script_info['script_usage'].append(("Quality filter (at Phred >= Q20) two "
    "non-multiplexed lanes of Illumina fastq data with different samples in "
    "each and write results to ./slout_not_multiplexed_q20.", "", 
    "%prog -i lane1_read1.fastq.gz,lane2_read1.fastq.gz "
    "--sample_id my.sample.1,my.sample.2 -o slout_not_multiplexed_q20/ "
    "-m map_not_multiplexed.txt -q 19 --barcode_type 'not-barcoded'"))

script_info['output_description']= ""
script_info['required_options'] = [
    make_option('-i', '--sequence_read_fps', type="existing_filepaths",
        help='the sequence read fastq files (comma-separated if more than '
        'one)'),
    make_option('-o', '--output_dir', type="new_dirpath", help='directory to '
        'store output files'),
    make_option('-m', '--mapping_fps', type="existing_filepaths",
        help='metadata mapping files (comma-separated if more than one)')
]

script_info['optional_options'] = [
     make_option('-b', '--barcode_read_fps', type="existing_filepaths",
        default=None, help='the barcode read fastq files (comma-separated '
        'if more than one) [default: %default]'),
     make_option("--store_qual_scores", default=False, action='store_true',
        help='store qual strings in .qual files [default: %default]'),
     make_option("--sample_ids", default=None, help='comma-separated list of '
        'samples id to be applied to all sequences, must be one per input '
        'file path (used when data is not multiplexed) [default: %default]'),
     make_option("--store_demultiplexed_fastq", default=False,
        action='store_true', help='write demultiplexed fastq files '
        '[default: %default]'),
     make_option("--retain_unassigned_reads", default=False,
        action='store_true', help="retain sequences which don't map to a "
        'barcode in the mapping file (sample ID will be "Unassigned") '
        '[default: %default]'),
     make_option('-r', '--max_bad_run_length', type='int', help='max number '
        'of consecutive low quality base calls allowed before truncating a '
        'read [default: %default]', default=3),
     make_option('-p', '--min_per_read_length_fraction', type='float',
        default=0.75, help='min number of consecutive high quality base calls '
        'to include a read (per single end read) as a fraction of the input '
        'read length [default: %default]'),
     make_option('-n', '--sequence_max_n', type='int', help='maximum number '
        'of N characters allowed in a sequence to retain it -- this is '
        'applied after quality trimming, and is total over combined paired '
        'end reads if applicable [default: %default]', default=0),
     make_option('-s', '--start_seq_id', type='int', help='start seq_ids as '
        'ascending integers beginning with start_seq_id [default: %default]',
        default=0),
     make_option('--rev_comp_barcode', action='store_true', help='reverse '
        'complement barcode reads before lookup [default: %default]',
        default=False),
     make_option('--rev_comp_mapping_barcodes', action='store_true',
        help='reverse complement barcode in mapping before lookup (useful if '
        'barcodes in mapping file are reverse complements of golay codes) '
        '[default: %default]', default=False),
     make_option('--rev_comp', action='store_true', help='reverse complement '
        'sequence before writing to output file (useful for reverse-'
        'orientation reads) [default: %default]', default=False),
     make_option('-q', '--phred_quality_threshold', type='int', help='the '
        'maximum unacceptable Phred quality score (e.g., for Q20 and better, '
        'specify -q 19) [default: %default]', default=3),
     make_option('--last_bad_quality_char', help='DEPRECATED: use -q instead. '
        'This method of setting is not robust to different versions of '
        'CASAVA.'),
     make_option('--barcode_type', type='string', help='The type of barcode '
        'used. This can be an integer, e.g. for length 6 barcodes, or '
        'golay_12 for golay error-correcting barcodes. Error correction will '
         'only be applied for golay_12 barcodes. [default: %default]',
        default='golay_12'),
     make_option('--max_barcode_errors', default=1.5, type='float',
        help='maximum number of errors in barcode [default: %default]'),
     make_option('--phred_offset', default=None, type="choice",
        choices=phred_to_ascii_fs.keys(), help="the ascii offset to use when "
        "decoding phred scores - warning: in most cases you don't need to "
        "pass this value [default: determined automatically]")
     # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
     # make_option('--filter_bad_illumina_qual_digit',
     #    action='store_true',\
     #    help='filter sequences which are tagged as not passing the Illumina'+\
     #    ' quality filter [default: %default]',
     #    default=False),\
]

script_info['version'] = __version__

def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    filter_bad_illumina_qual_digit = False #opts.filter_bad_illumina_qual_digit
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run, 
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error("If not providing barcode reads (because "
            "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error("If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            option_parser.error("Only valid phred offsets are: %s" %
                ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file 
        # figure it out...
        phred_to_ascii_f = None

    if opts.last_bad_quality_char is not None:
        option_parser.error('--last_bad_quality_char is no longer supported. '
         'Use -q instead (see option help text by passing -h)')

    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
            '0 and 1 (inclusive). You passed %1.5f' %
            min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(set([len(sequence_read_fps), len(barcode_read_fps),
            len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp,'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp,'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below
        def qual_writer(h,q):
            qual_f.write('>%s\n%s\n' % (h,q))
    else:
        def qual_writer(h,q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp,'w')
        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below
        def fastq_writer(h,s,q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h,s,q))
    else:
        def fastq_writer(h,s,q):
            pass

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp,'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp,'w')

    for i in range(len(sequence_read_fps)):
        sequence_read_fp = sequence_read_fps[i]
        barcode_read_fp = barcode_read_fps[i]
        mapping_fp = mapping_fps[i]
        mapping_f = open(mapping_fp, 'U')
        _, _, barcode_to_sample_id, _, _, _, _ = check_map(mapping_f,
            disable_primer_check=True, has_barcodes=barcode_read_fp is not
            None)

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {DNA.rc(k):v for k, v in
                barcode_to_sample_id.iteritems()}

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error("Some or all barcodes are not valid golay "
                    "codes. Do they need to be reverse complemented? If these "
                    "are not golay barcodes pass --barcode_type 12 to disable "
                    "barcode error correction, or pass --barcode_type # if "
                    "the barcodes are not 12 base pairs, where # is the size "
                    "of the barcodes. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        log_f.write('Mapping filepath: %s (md5: %s)\n' %
            (mapping_fp,safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
            (sequence_read_fp,
            str(safe_md5(open(sequence_read_fp)).hexdigest())))

        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp,'U')

        seq_id = start_seq_id
        
        if barcode_read_fp is not None:
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                (barcode_read_fp,safe_md5(open(barcode_read_fp)).hexdigest()))

            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp,'U')

            seq_generator = process_fastq_single_end_read_file(
               sequence_read_f, barcode_read_f, barcode_to_sample_id,
               store_unassigned=retain_unassigned_reads,
               max_bad_run_length=max_bad_run_length,
               phred_quality_threshold=phred_quality_threshold,
               min_per_read_length_fraction=min_per_read_length_fraction,
               rev_comp=rev_comp, rev_comp_barcode=rev_comp_barcode,
               seq_max_N=seq_max_N, start_seq_id=start_seq_id,
               filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
               log_f=log_f, histogram_f=histogram_f,
               barcode_correction_fn=barcode_correction_fn,
               max_barcode_errors=max_barcode_errors,
               phred_to_ascii_f=phred_to_ascii_f)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
               sequence_read_f, sample_ids[i],
               store_unassigned=retain_unassigned_reads,
               max_bad_run_length=max_bad_run_length,
               phred_quality_threshold=phred_quality_threshold,
               min_per_read_length_fraction=min_per_read_length_fraction,
               rev_comp=rev_comp, seq_max_N=seq_max_N,
               start_seq_id=start_seq_id,
               filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
               log_f=log_f, histogram_f=histogram_f,
               phred_to_ascii_f=phred_to_ascii_f)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header,sequence))
            qual_writer(fasta_header,quality)
            fastq_writer(fasta_header,sequence,quality)

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp,qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)

if __name__ == "__main__":
    main()