File: denoiser_preprocess.py

package info (click to toggle)
qiime 1.4.0-2
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 29,704 kB
  • sloc: python: 77,837; haskell: 379; sh: 113; makefile: 103
file content (137 lines) | stat: -rwxr-xr-x 5,330 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python 

"""Preprocess 454 sequencing data."""

__author__ = "Jens Reeder"
__copyright__ = "Copyright 2011, The QIIME Project" 
__credits__ = ["Jens Reeder", "Rob Knight"]#remember to add yourself if you make changes
__license__ = "GPL"
__version__ = "1.4.0"
__maintainer__ = "Jens Reeder"
__email__ = "jens.reeder@gmail.com"
__status__ = "Release"

from os.path import exists
from os import remove, rename, rmdir, makedirs
from random import sample

from qiime.util import get_tmp_filename

from qiime.util import parse_command_line_parameters, get_options_lookup,\
    make_option
from qiime.denoiser.utils import files_exist, store_mapping
from qiime.denoiser.preprocess import preprocess

options_lookup = get_options_lookup()
STANDARD_BACTERIAL_PRIMER = "CATGCTGCCTCCCGTAGGAGT"

#denoiser_preprocess.py
script_info={}
script_info['brief_description']="""Run phase of denoiser algorithm: prefix clustering"""
script_info['script_description']="""The script denoiser_preprocess.py runs the first clustering phase
which groups reads based on common prefixes."""

script_info['script_usage'] = [ \
    ( "",
      """Run program on flowgrams in 454Reads.sff. Remove reads which are not in split_lib_filtered_seqs.fasta. 
Remove primer CATGCTGCCTCCCGTAGGAGT from reads before running phase I""",
      """%prog -i 454Reads.sff.txt -f split_lib_filtered_seqs.fasta -p CATGCTGCCTCCCGTAGGAGT """ )
    ]

script_info['output_description']="""
prefix_dereplicated.sff.txt: human readable sff file containing the flowgram of the
                             cluster representative of each cluster.

prefix_dereplicated.fasta: Fasta file containing the cluster representative of each cluster.

prefix_mapping.txt: This file contains the actual clusters. The cluster centroid is given first,
                    the cluster members follw after the ':'.   
"""

script_info['required_options']=[\

    make_option('-i','--input_file',action='store', type='string',\
                          dest='sff_fp',help='path to flowgram file '+\
                          '[REQUIRED]')
    ]

script_info['optional_options']=[ \
    make_option('-f','--fasta_file',action='store', type='string',\
                    dest='fasta_fp',help='path to fasta input file '+\
                    '[default: %default]', default=None),

    make_option('-s','--squeeze',action='store_true', dest='squeeze',\
                    help='Use run-length encoding for prefix '+\
                    'filtering [default: %default]', default=False),

    make_option('-l','--log_file',action='store',\
                    type='string',dest='log_fp',help='path to log file '+\
                    '[default: %default]', default="preprocess.log"),
 
    make_option('-p','--primer',action='store',\
                    type='string',dest='primer',help='primer sequence '+\
                    'used for the amplification [default: %default]', \
                    default=STANDARD_BACTERIAL_PRIMER),

    make_option('-o','--output_dir',action='store',\
                    type='string',dest='output_dir',\
                    help='path to output directory '+\
                    '[default: %default]', default="/tmp/")
    ]

script_info['version'] = __version__

def main(commandline_args=None):
    parser, opts, args = parse_command_line_parameters(**script_info)
     
    if not opts.sff_fp:
        parser.error('Required option flowgram file path (-i) not specified')
    elif not files_exist(opts.sff_fp):
        parser.error('Flowgram file path does not exist:\n %s \n Pass a valid one via -i.'
                     % opts.sff_fp) 

    #make tmp and output dir
    tmp_dir = get_tmp_filename(tmp_dir = opts.output_dir+"/", suffix="/")
    try:
        makedirs(tmp_dir)
    except OSError:
        exit("Creating temporary directory failed")
    if(not exists(opts.output_dir)):
        try:
            makedirs(opts.output_dir)
        except OSError:
            exit("Creating output directory failed")
            
    #open logger
    log_fh=None
    if opts.verbose:
        #append to the log file of the master process
        log_fh = open(opts.output_dir+"/"+opts.log_fp, "a", 0)
        log_fh.write("SFF file: %s\n" % opts.sff_fp)
        log_fh.write("Fasta file: %s\n" % opts.fasta_fp)
        log_fh.write("Output dir: %s\n" % opts.output_dir)
        log_fh.write("Squeeze Seqs: %s\n" % opts.squeeze)
        log_fh.write("Primer sequence: %s\n" % opts.primer)

    (deprefixed_sff_fp, l, mapping, seqs) = \
        preprocess(opts.sff_fp, log_fh, fasta_fp=opts.fasta_fp,
                   out_fp=tmp_dir,
                   verbose=opts.verbose, squeeze=opts.squeeze,
                   primer=opts.primer)
        
    # explicitly close log file, as this file can be shared with the master
    # Closing it here assures that all preprocess writes happen before the
    # master writes    
    if log_fh:
        log_fh.close()

    #move files to output dir
    rename(tmp_dir+"/prefix_dereplicated.sff.txt",
           opts.output_dir+"/prefix_dereplicated.sff.txt")
    rename(tmp_dir+"/prefix_dereplicated.fasta",
           opts.output_dir+"/prefix_dereplicated.fasta")
    rename(tmp_dir+"/prefix_mapping.txt", opts.output_dir+"/prefix_mapping.txt")
    rmdir(tmp_dir)

if __name__ == "__main__":
    main()