File: exclude_seqs_by_blast.py

package info (click to toggle)
qiime 1.4.0-2
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 29,704 kB
  • sloc: python: 77,837; haskell: 379; sh: 113; makefile: 103
file content (245 lines) | stat: -rwxr-xr-x 12,076 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#!/usr/bin/env python
# File created on 09 Feb 2010
from __future__ import division
import numpy
from time import time
from os import system, getcwd
from os.path import join, split, abspath
from cogent.app.formatdb import FormatDb
from cogent.app.parameters import FilePath
from cogent.util.misc import remove_files
from qiime.exclude_seqs_by_blast import blast_genome,\
                                        check_options,\
                                        find_homologs,\
                                        sequences_to_file,\
                                        no_filter,\
                                        format_options_as_lines,\
                                        check_align_percent_field,\
                                        make_percent_align_filter,\
                                        query_ids_from_blast_result,\
                                        ids_from_fasta_lines,\
                                        id_from_fasta_label_line,\
                                        seqs_from_file,\
                                        ids_to_seq_file,\
                                        compose_logfile_lines

__author__ = "Jesse Zaneveld"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Jesse Zaneveld","Rob Knight", "Justin Kuczynski"]
__license__ = "GPL"
__version__ = "1.4.0"
__maintainer__ = "Jesse Zaneveld"
__email__ = "zaneveld@gmail.com"
__status__ = "Release"
 

from qiime.util import parse_command_line_parameters
from qiime.util import make_option


script_info={}
script_info['brief_description']="""Exclude contaminated sequences using BLAST"""
script_info['script_description']="""

This code is designed to allow users of the QIIME workflow to conveniently exclude unwanted sequences from their data. This is mostly useful for excluding human sequences from runs to comply with Internal Review Board (IRB) requirements, but may also have other uses (e.g. perhaps excluding a major bacterial contaminant). Sequences from a run are searched against a user-specified subject database, where BLAST hits are screened by e-value and the percentage of the query that aligns to the sequence.

For human screening THINK CAREFULLY about the data set that you screen against. Are you excluding human non-coding sequences? What about mitochondrial sequences? This point is CRITICAL because submitting human sequences that are not IRB-approved is BAD.

(e.g. you would NOT want to just screen against just the coding sequences of the human genome as found in the KEGG .nuc files, for example)

One valid approach is to screen all putative 16S rRNA sequences against greengenes to ensure they are bacterial rather than human.

WARNING: You cannot use this script if there are spaces in the path to the database of fasta files because formatdb cannot handle these paths (this is a limitation of NCBI's tools and we have no control over it).
"""
script_info['script_usage']=[]
script_info['script_usage'].append(("""Examples:""","""The following is a simple example, where the user can take a given FASTA file (i.e. resulting FASTA file from pick_rep_set.py) and blast those sequences against a reference FASTA file containing the set of sequences which are considered contaminated:""","""exclude_seqs_by_blast.py -i repr_set_seqs.fasta -d ref_seq_set.fna -o exclude_seqs/"""))
script_info['script_usage'].append(("""""","""Alternatively, if the user would like to change the percent of aligned sequence coverage ("-p") or the maximum E-value ("-e"), they can use the following command:""","""exclude_seqs_by_blast.py -i repr_set_seqs.fasta -d ref_seq_set.fna -o exclude_seqs/ -p 0.95 -e 1e-10"""))
script_info['output_description']="""Four output files are generated based on the supplied outputpath + unique suffixes:

1. "filename_prefix".matching: A FASTA file of sequences that did pass the screen (i.e. matched the database and passed all filters).

2. "filename_prefix".non-matching: A FASTA file of sequences that did not pass the screen.

3. "filename_prefix".raw_blast_results: Contains the raw BLAST results from the screening.

4. "filename_prefix".sequence_exclusion_log: A log file summarizing the options used and results obtained.

In addition, if the --no_clean option is passed, the files generated by formatdb will be kept in the same directory as subjectdb.
"""
script_info['required_options']=[\
 make_option("-i","--querydb",dest='querydb',default = None,\
        help="The path to a FASTA file containing query sequences"),
 make_option("-d","--subjectdb",dest='subjectdb',default = None,\
        help="The path to a FASTA file to BLAST against"),
 make_option("-o","--outputfilename",dest='outputfilename',\
        default = None,\
        help=""" The base path/filename to save results. Sequences matching the database, not matching the database, raw BLAST results and the log will be saved to your filename + '.matching', '.non-matching', '.raw_blast_results', and '.sequence_exclusion_log' respectively.""")
]
script_info['optional_options']=[\
    make_option("-e","--e_value",type='float',dest='e_value',\
        default = 1e-10,\
        help="The e-value cutoff for blast queries [default: %default]."),\
    make_option("-p","--percent_aligned",type='float',\
        dest='percent_aligned',default = 0.97,\
        help="The % alignment cutoff for blast queries [default: %default]."),\
    make_option("--no_clean",action = 'store_true',\
        dest='no_clean',default = False,\
        help="If set, don't delete files generated by formatdb after running [default: %default]."),\
    make_option("--blastmatroot",dest='blastmatroot',default = None,\
            help="Path to a folder containing blast matrices [default: %default]."),\
    make_option("--working_dir",dest='working_dir',default = "/tmp",\
        help="Working dir for BLAST [default: %default]."),\
    make_option("-m","--max_hits",type='int',dest='max_hits',\
        default = 100,\
        help="""Max hits parameter for BLAST. CAUTION: Because filtering on alignment percentage occurs after BLAST, a max hits value of 1 in combination with an alignment percent filter could miss valid contaminants. [default: %default]"""),\
    make_option("-w","--word_size",type='int',dest='wordsize',\
        default = 28,\
        help="Word size to use for BLAST search [default: %default]"),\
    make_option("-n","--no_format_db",dest = 'no_format_db', action = "store_true",\
        default = False,\
        help="""If this flag is specified, format_db will not be called on the subject database (formatdb will be set to False).  This is  useful if you have already formatted the database and a) it took a very long time or b) you want to run the script in parallel on the pre-formatted database [default: %default]""")
]
script_info['version'] = __version__

FORMAT_BAR =   """------------------------------"""*2


def main():
    option_parser, options, args = parse_command_line_parameters(**script_info)
    DEBUG = options.verbose 
    check_options(option_parser, options)
    start_time = time()
    option_lines = format_options_as_lines(options)
    if DEBUG:
        print FORMAT_BAR
        print "Running with options:"
        for line in sorted(option_lines):
            print line
        print FORMAT_BAR

    #because the blast app controller uses absolute paths, make sure subject
    #db path is fully specified
    
    subject_db = options.subjectdb
    if not subject_db.startswith('/'):
        subject_db = join(getcwd(), subject_db)
    if not options.no_format_db:

        #initialize object 
        inpath = FilePath(abspath(options.subjectdb))
        subject_dir, subj_file = split(inpath)

        fdb = FormatDb(WorkingDir = subject_dir ) 
        # Currently we do not support protein blasts, but
        # this would be easy to add in the future...
        fdb.Parameters['-p'].on('F')
        
        # Create indices for record lookup
        fdb.Parameters['-o'].on('T')
        
        # Set input database 
        fdb.Parameters['-i'].on(subject_db)
        
        
        formatdb_cmd = fdb.BaseCommand
        
        if DEBUG:
            print "Formatting db with command: %s" %formatdb_cmd
        
        app_result = fdb(subject_db)
        formatdb_filepaths = []
        for v in app_result.values():
            try:
                formatdb_filepaths.append(v.name)
            except AttributeError:
                # not a file object, so no path to return
                pass

        db_format_time = time() - start_time

        if DEBUG:
            print "Formatting subject db took: %2.f seconds" % db_format_time
            print "formatdb log file written to: %s" %app_result['log']
            print FORMAT_BAR
    else:
        db_format_time = time() - start_time
        formatdb_cmd = "None (formatdb not called)"
        # Check that User-Supplied subjectdb is valid
        db_ext = [".nhr",".nin",".nsd",".nsi",".nsq"]
        formatdb_filepaths = [subject_db+ext for ext in db_ext]
    
        if DEBUG:
            print "Checking that pre-existing formatdb files exist and can be read."
            print "Files to be checked:"
            for fp in formatdb_filepaths:
                print fp
            print FORMAT_BAR
        
        try:
            formatdb_files = [open(db_f,"U") for db_f in formatdb_filepaths]
            [f.close() for f in formatdb_files]
        except IOError:
            if DEBUG: 
                print "Cannot open user-supplied database file:", db_f
            option_parser.error("""Problem with -d and --no_format_db option combination: Cannot open the following user-supplied database file: %s. Consider running without --no_format_db to let formatdb generate these required files""" % db_f)
        
        if DEBUG:
            print "OK: BLAST Database files exist and can be read."
            print FORMAT_BAR
    
    # Perform BLAST search
    blast_results,hit_ids, removed_hit_ids = find_homologs(options.querydb,\
        subject_db, options.e_value,options.max_hits,\
        options.working_dir,options.blastmatroot, options.wordsize,\
                            options.percent_aligned, DEBUG=DEBUG)

    blast_time = (time() - start_time) - db_format_time

    if DEBUG:
        print "BLAST search took: %2.f minute(s)" % (blast_time/60.0)
        print FORMAT_BAR

    #Record raw blast results
    f=open("%s.raw_blast_results" % options.outputfilename,'w')
    f.writelines(blast_results)
    f.close()

    #Record excluded seqs
    ids_to_seq_file(hit_ids,options.querydb,options.outputfilename,".matching")

    #Record included (screened) seqs
    all_ids = ids_from_fasta_lines(open(options.querydb))
    included_ids  = set(all_ids) - hit_ids
    ids_to_seq_file(included_ids,options.querydb,options.outputfilename,".non-matching")

    log_lines = compose_logfile_lines(start_time, db_format_time, blast_time,\
                                                   option_lines,formatdb_cmd,\
                                               blast_results,options,all_ids,\
                                                     hit_ids,removed_hit_ids,\
                                                          included_ids,DEBUG)

    if DEBUG:
        print "Writing summary to: %s" % options.outputfilename +\
                                          ".sequence_exclusion_log"

    f=open(options.outputfilename + ".sequence_exclusion_log",'w')
    f.writelines(log_lines)
    f.close()

    if not options.no_clean:
        if DEBUG:
            
            print FORMAT_BAR
            print "|                           Cleanup                        |"
            print FORMAT_BAR
        
        if not options.no_format_db:
            if options.verbose:
                print "Cleaning up formatdb files:", formatdb_filepaths
            remove_files(formatdb_filepaths)
        else:
            if options.verbose:
                print "Formatdb not run...nothing to clean"

if __name__ == "__main__":
    main()