File: process_iseq.py

package info (click to toggle)
qiime 1.4.0-2
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 29,704 kB
  • sloc: python: 77,837; haskell: 379; sh: 113; makefile: 103
file content (101 lines) | stat: -rwxr-xr-x 5,288 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
# File created on 30 Mar 2011
from __future__ import division

__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.4.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Release"

from glob import glob
from os.path import split, splitext
from cogent.util.misc import create_dir
from qiime.util import (parse_command_line_parameters, make_option,
                        iseq_to_qseq_fields, gzip_open)
from qiime.format import illumina_data_to_fastq
from qiime.split_libraries_illumina import get_illumina_qual_chars

script_info = {}
script_info['brief_description'] = "Given a directory of per-swath qseq files, this script generates a single fastq per lane."
script_info['script_description'] = ""
script_info['script_usage'] = [("","Generate fastq files from lanes 1 and 2 (read 1 data) where barcodes are contained as the first tweleve bases of the sequences.","process_qseq.py -i ./s_1_1_sequence.txt,./s_2_1_sequence.txt -b 12 -o ./fastq/"),
 ("","Generate fastq files from the gzipped lanes 1 and 2 (read 1 data) where barcodes are contained as the first tweleve bases of the sequences.","process_qseq.py -i ./s_1_1_sequence.txt.gz,./s_2_1_sequence.txt.gz -b 12 -o ./fastq/")
                               ]
script_info['output_description']= ""
script_info['required_options'] = [
 make_option('-i','--input_fps',type='existing_filepaths',
  help='the input filepaths (either iseq or gzipped iseq format; comma-separated if more than one). See Processing Illumina Data tutorial for a description of the iseq file type.'),
 make_option('-o','--output_dir',help='the output directory'),
 make_option('-b','--barcode_length',type='int',
  help='length of the barcode'),
]
script_info['optional_options'] = [
 make_option('--barcode_in_header',action='store_true',
  help='pass if barcode is in the header index'+\
  ' field (rather than at the beginning of the sequence)',default=False),
 make_option('--barcode_qual_c',type='choice',
  choices=list(get_illumina_qual_chars()),
  help='if no barcode quality string is available, score each base with'+\
  ' this quality [default: %default]',default='b')
]
script_info['version'] = __version__

def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    input_fps = opts.input_fps
    output_dir = opts.output_dir
    create_dir(output_dir)
    barcode_length = opts.barcode_length
    barcode_in_header = opts.barcode_in_header
    barcode_qual_c = opts.barcode_qual_c
    
    for input_fp in input_fps:
        if input_fp.endswith('.gz'):
            open_f = gzip_open
            input_basename = split(splitext(splitext(input_fp)[0])[0])[1]
        else:              
            input_basename = split(splitext(input_fp)[0])[1]
            open_f = open
        sequence_output_fp =  '%s/%s.fastq' % (output_dir,input_basename)
        sequence_output_f = open(sequence_output_fp,'w')              
        barcode_output_fp =  '%s/%s_barcodes.fastq' % (output_dir,input_basename)
        barcode_output_f = open(barcode_output_fp,'w')
        for line in open_f(input_fp):
            common_fields, sequence, sequence_qual, barcode, barcode_qual =\
             iseq_to_qseq_fields(line, barcode_in_header, barcode_length, barcode_qual_c)
            
            sequence_s, pass_filter_s = illumina_data_to_fastq((common_fields[0],
                                                              common_fields[1],
                                                              common_fields[2],
                                                              common_fields[3],
                                                              common_fields[4],
                                                              common_fields[5],
                                                              common_fields[6],
                                                              common_fields[7],
                                                              sequence,
                                                              sequence_qual))
            
            barcode_s, pass_filter_b = illumina_data_to_fastq((common_fields[0],
                                                             common_fields[1],
                                                             common_fields[2],
                                                             common_fields[3],
                                                             common_fields[4],
                                                             common_fields[5],
                                                             common_fields[6],
                                                             common_fields[7],
                                                             barcode,
                                                             barcode_qual),barcode_length)
            if pass_filter_s != 0:
                sequence_output_f.write('%s\n' % sequence_s)
                barcode_output_f.write('%s\n' % barcode_s)
        sequence_output_f.close()
        barcode_output_f.close()
        
if __name__ == "__main__":
    main()