1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212
|
#!/usr/bin/env python
# File created on 07 Jun 2011
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.4.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Release"
from os import rename
from cogent import DNA
from cogent.util.misc import safe_md5, create_dir
from qiime.util import parse_command_line_parameters, make_option, gzip_open
from qiime.parse import parse_mapping_file
from qiime.split_libraries_fastq import (process_fastq_single_end_read_file,
BARCODE_DECODER_LOOKUP)
from qiime.split_libraries import check_map
from qiime.split_libraries_illumina import get_illumina_qual_chars
from qiime.golay import get_invalid_golay_barcodes
script_info = {}
script_info['brief_description'] = "This script performs demultiplexing of Fastq sequence data where barcodes and sequences are contained in two separate fastq files (common on Illumina runs)."
script_info['script_description'] = ""
script_info['script_usage'] = [("Demultiplex and quality filter two lanes of Illumina run results and write results to ./sl_out/.","","%prog -i s_7_2_sequence.txt,s_8_2_sequence.txt -b s_7_1_sequence.txt,s_8_1_sequence.txt -o ./sl_out/ -m lane7_map.txt,lane8_map.txt")]
script_info['output_description']= ""
script_info['required_options'] = [\
# Example required option
make_option('-i',
'--sequence_read_fps',
type="existing_filepaths",
help='the sequence read fastq files (comma-separated if more than one)'),
make_option('-b',
'--barcode_read_fps',
type="existing_filepaths",
help='the barcode read fastq files (comma-separated if more than one)'),
make_option('-o',
'--output_dir',
type="new_dirpath",
help='directory to store output files'),
make_option('-m',
'--mapping_fps',
type="existing_filepaths",
help='metadata mapping files (comma-separated if more than one)'),
]
script_info['optional_options'] = [
make_option("--retain_unassigned_reads",
default=False,
action='store_true',
help='retain sequences which don\'t map to a barcode in the '+\
' mapping file (sample ID will be "Unassigned") [default: %default]'),
make_option('-r','--max_bad_run_length',type='int',\
help='max number of consecutive low quality base calls allowed '+\
'before truncating a read [default: 1; the read is trucated at the'+\
'second low quality call]',default=1),\
make_option('-p','--min_per_read_length',type='int',\
help='min number of consecutive high quality base calls to include'+\
'a read (per single end read) [default: %default]',default=75),\
make_option('-n','--sequence_max_n',type='int',\
help='maximum number of N characters allowed in a sequence to retain it -- '
'this is applied after quality trimming, and is total over combined paired '
'end reads if applicable [default: %default]',default=0),
make_option('-s','--start_seq_id',type='int',\
help='start seq_ids as ascending integers beginning with start_seq_id'+\
'[default: %default]',default=0),\
make_option('--rev_comp_barcode',action='store_true',\
help='reverse compliment barcode reads before lookup'+\
' [default: %default]',default=False),\
make_option('--rev_comp_mapping_barcodes',action='store_true',\
help='reverse compliment barcode in mapping before lookup (useful if barcodes'+\
' in mapping file are reverse compliments of golay codes)'+\
' [default: %default]',default=False),\
make_option('--rev_comp',action='store_true',\
help='reverse compliment sequence before writing to output file'+\
' (useful for reverse-orientation reads) [default: %default]',
default=False),\
make_option('--last_bad_quality_char',type='choice',
choices = list(get_illumina_qual_chars()),
help='the last character to be considered low quality '+\
'(i.e., these character and those before it will be considered '+\
'low quality base calls) [default: %default]',
default='B'),\
make_option('--barcode_type',type='string',
help='The type of barcode used. This can be an integer, '+\
'e.g. for length 6 barcodes, or golay_12 for golay '+\
'error-correcting barcodes. Error correction will '+\
'only be applied for golay_12 barcodes. [default: %default]',
default='golay_12'),
make_option('--max_barcode_errors',
default=1.5, type=float,
help='maximum number of errors in barcode [default: %default]'),
# NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
# make_option('--filter_bad_illumina_qual_digit',
# action='store_true',\
# help='filter sequences which are tagged as not passing the Illumina'+\
# ' quality filter [default: %default]',
# default=False),\
]
script_info['version'] = __version__
def main():
option_parser, opts, args =\
parse_command_line_parameters(**script_info)
sequence_read_fps = opts.sequence_read_fps
barcode_read_fps = opts.barcode_read_fps
mapping_fps = opts.mapping_fps
retain_unassigned_reads = opts.retain_unassigned_reads
max_bad_run_length = opts.max_bad_run_length
last_bad_quality_char = opts.last_bad_quality_char
min_per_read_length = opts.min_per_read_length
rev_comp = opts.rev_comp
rev_comp_barcode = opts.rev_comp_barcode
rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
seq_max_N = opts.sequence_max_n
start_seq_id = opts.start_seq_id
# NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
filter_bad_illumina_qual_digit = False #opts.filter_bad_illumina_qual_digit
barcode_type = opts.barcode_type
max_barcode_errors = opts.max_barcode_errors
try:
barcode_correction_fn = BARCODE_DECODER_LOOKUP[barcode_type]
except KeyError:
barcode_correction_fn = None
if len(sequence_read_fps) != len(barcode_read_fps):
parser.error("Same number of sequence and barcode files must be provided.")
output_dir = opts.output_dir
create_dir(output_dir)
output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
output_fp = '%s/seqs.fna' % output_dir
output_f = open(output_fp_temp,'w')
log_fp = '%s/split_library_log.txt' % output_dir
log_f = open(log_fp,'w')
histogram_fp = '%s/histograms.txt' % output_dir
histogram_f = open(histogram_fp,'w')
for sequence_read_fp, barcode_read_fp, mapping_fp in\
zip(sequence_read_fps, barcode_read_fps, mapping_fps):
mapping_f = open(mapping_fp, 'U')
h, i, barcode_to_sample_id, warnings, errors, p, a =\
check_map(mapping_f, disable_primer_check=True)
if rev_comp_mapping_barcodes:
barcode_to_sample_id = \
dict([(DNA.rc(k),v) for k,v in barcode_to_sample_id.items()])
if barcode_type == 'golay_12':
invalid_golay_barcodes = \
get_invalid_golay_barcodes(barcode_to_sample_id.keys())
if len(invalid_golay_barcodes) > 0:
option_parser.error("Some or all barcodes are not valid golay codes. "+\
"Do they need to be reverse complimented? If these are not "+\
"golay barcodes pass --barcode_type 12 to disable barcode "+\
"error correction, or pass --barcode_type # if the barcodes "+\
"are not 12 base pairs, where # is the size of the barcodes. "+
" Invalid codes:\n\t%s" % \
' '.join(invalid_golay_barcodes))
log_f.write("Input file paths\n")
log_f.write('Mapping filepath: %s (md5: %s)\n' %\
(mapping_fp,safe_md5(open(mapping_fp)).hexdigest()))
log_f.write('Sequence read filepath: %s (md5: %s)\n' %\
(sequence_read_fp,str(safe_md5(open(sequence_read_fp)).hexdigest())))
log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %\
(barcode_read_fp,safe_md5(open(barcode_read_fp)).hexdigest()))
if sequence_read_fp.endswith('.gz'):
sequence_read_f = gzip_open(sequence_read_fp)
else:
sequence_read_f = open(sequence_read_fp,'U')
barcode_read_f = open(barcode_read_fp,'U')
seq_id = start_seq_id
for fasta_header, sequence, quality, seq_id in \
process_fastq_single_end_read_file(
sequence_read_f,
barcode_read_f,
barcode_to_sample_id,
store_unassigned=retain_unassigned_reads,
max_bad_run_length=max_bad_run_length,
last_bad_quality_char=last_bad_quality_char,
min_per_read_length=min_per_read_length,
rev_comp=rev_comp,
rev_comp_barcode=rev_comp_barcode,
seq_max_N=seq_max_N,
start_seq_id=start_seq_id,
filter_bad_illumina_qual_digit=\
filter_bad_illumina_qual_digit,
log_f=log_f,
histogram_f=histogram_f,
barcode_correction_fn=barcode_correction_fn,
max_barcode_errors=max_barcode_errors):
output_f.write('>%s\n%s\n' % (fasta_header,sequence))
start_seq_id = seq_id + 1
log_f.write('\n---\n\n')
output_f.close()
rename(output_fp_temp,output_fp)
if __name__ == "__main__":
main()
|