File: _blast.py

package info (click to toggle)
q2-feature-classifier 2024.2.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 1,504 kB
  • sloc: python: 3,730; makefile: 38; sh: 16
file content (285 lines) | stat: -rw-r--r-- 13,166 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import os
import warnings
import subprocess
import pandas as pd
from q2_types.feature_data import (
    FeatureData, Taxonomy, Sequence, DNAFASTAFormat, DNAIterator, BLAST6,
    BLAST6Format)
from .types import BLASTDBDirFmtV5, BLASTDB
from qiime2.plugin import (
    Int, Str, Float, Choices, Range, Bool, Threads, get_available_cores
)
from .plugin_setup import plugin, citations
from ._consensus_assignment import (
    min_consensus_param, min_consensus_param_description,
    DEFAULTUNASSIGNABLELABEL)

# ---------------------------------------------------------------
# Reason for num_thread not being exposed.
# BLAST doesn't allow threading when a subject is provided(As of 2/19/20).
# num_thread was removed to prevent warning that stated:
# "'Num_thread' is currently ignored when 'subject' is specified"(issue #77).
# Seen here: https://github.com/qiime2/q2-feature-classifier/issues/77.
# A '-subject' input is required in this function.
# Therefore num_thread is not exposable.
# ---------------------------------------------------------------


# Specify default settings for various functions
DEFAULTMAXACCEPTS = 10
DEFAULTPERCENTID = 0.8
DEFAULTQUERYCOV = 0.8
DEFAULTSTRAND = 'both'
DEFAULTEVALUE = 0.001
DEFAULTMINCONSENSUS = 0.51
DEFAULTOUTPUTNOHITS = True
DEFAULTNUMTHREADS = 1


# NOTE FOR THE FUTURE: should this be called blastn? would it be possible to
# eventually generalize to e.g., blastp or blastx? or will this be too
# challenging, e.g., to detect the format internally? A `mode` parameter could
# be added and TypeMapped to the input type, a bit cumbersome but one way to
# accomplish this without code bloat. But the question is: would we want to
# expose different parameters etc? My feeling is let's call this `blast` for
# now and then cross that bridge when we come to it.
def blast(query: DNAFASTAFormat,
          reference_reads: DNAFASTAFormat = None,
          blastdb: BLASTDBDirFmtV5 = None,
          maxaccepts: int = DEFAULTMAXACCEPTS,
          perc_identity: float = DEFAULTPERCENTID,
          query_cov: float = DEFAULTQUERYCOV,
          strand: str = DEFAULTSTRAND,
          evalue: float = DEFAULTEVALUE,
          output_no_hits: bool = DEFAULTOUTPUTNOHITS,
          num_threads: int = DEFAULTNUMTHREADS) -> pd.DataFrame:
    if num_threads == 0:
        num_threads = get_available_cores()

    if reference_reads and blastdb:
        raise ValueError('Only one reference_reads or blastdb artifact '
                         'can be provided as input. Choose one and try '
                         'again.')
    perc_identity = perc_identity * 100
    query_cov = query_cov * 100
    seqs_fp = str(query)
    # TODO: generalize to support other blast types?
    output = BLAST6Format()
    cmd = ['blastn', '-query', seqs_fp, '-evalue', str(evalue), '-strand',
           strand, '-outfmt', '6', '-perc_identity', str(perc_identity),
           '-qcov_hsp_perc', str(query_cov), '-num_threads', str(num_threads),
           '-max_target_seqs', str(maxaccepts), '-out', str(output)]
    if reference_reads:
        cmd.extend(['-subject', str(reference_reads)])
        if num_threads > 1:
            warnings.warn('The num_threads parameters is only compatible '
                          'when using a pre-indexed blastdb. The num_threads '
                          'is ignored when reference_reads are provided as '
                          'input.', UserWarning)
    elif blastdb:
        cmd.extend(['-db', os.path.join(blastdb.path, blastdb.get_basename())])
    else:
        raise ValueError('Either reference_reads or a blastdb must be '
                         'provided as input.')
    _run_command(cmd)
    # load as dataframe to quickly validate (note: will fail now if empty)
    result = output.view(pd.DataFrame)

    # blast will not report reads with no hits. We will report this
    # information here, so that it is explicitly reported to the user.
    if output_no_hits:
        ids_with_hit = set(result['qseqid'].unique())
        query_ids = {seq.metadata['id'] for seq in query.view(DNAIterator)}
        missing_ids = query_ids - ids_with_hit
        if len(missing_ids) > 0:
            # we will mirror vsearch behavior and annotate unassigneds as '*'
            # and fill other columns with 0 values (np.nan makes format error).
            missed = pd.DataFrame(
                [{'qseqid': i, 'sseqid': '*'} for i in missing_ids],
                columns=result.columns).fillna(0)
            result = pd.concat([result, missed], axis=0)
    return result


def classify_consensus_blast(ctx,
                             query,
                             reference_taxonomy,
                             blastdb=None,
                             reference_reads=None,
                             maxaccepts=DEFAULTMAXACCEPTS,
                             perc_identity=DEFAULTPERCENTID,
                             query_cov=DEFAULTQUERYCOV,
                             strand=DEFAULTSTRAND,
                             evalue=DEFAULTEVALUE,
                             output_no_hits=DEFAULTOUTPUTNOHITS,
                             min_consensus=DEFAULTMINCONSENSUS,
                             unassignable_label=DEFAULTUNASSIGNABLELABEL,
                             num_threads=DEFAULTNUMTHREADS):
    if num_threads == 0:
        num_threads = get_available_cores()

    search_db = ctx.get_action('feature_classifier', 'blast')
    lca = ctx.get_action('feature_classifier', 'find_consensus_annotation')
    result, = search_db(query=query, blastdb=blastdb,
                        reference_reads=reference_reads,
                        maxaccepts=maxaccepts, perc_identity=perc_identity,
                        query_cov=query_cov, strand=strand, evalue=evalue,
                        output_no_hits=output_no_hits, num_threads=num_threads)
    consensus, = lca(search_results=result,
                     reference_taxonomy=reference_taxonomy,
                     min_consensus=min_consensus,
                     unassignable_label=unassignable_label)
    # New: add BLAST6Format result as an output. This could just as well be a
    # visualizer generated from these results (using q2-metadata tabulate).
    # Would that be more useful to the user that the QZA?
    return consensus, result


def makeblastdb(sequences: DNAFASTAFormat) -> BLASTDBDirFmtV5:
    database = BLASTDBDirFmtV5()
    build_cmd = ['makeblastdb', '-blastdb_version', '5', '-dbtype', 'nucl',
                 '-title', 'blastdb', '-in', str(sequences),
                 '-out', os.path.join(str(database.path), 'blastdb')]
    _run_command(build_cmd)
    return database


# Replace this function with QIIME2 API for wrapping commands/binaries,
# pending https://github.com/qiime2/qiime2/issues/224
def _run_command(cmd, verbose=True):
    if verbose:
        print("Running external command line application. This may print "
              "messages to stdout and/or stderr.")
        print("The command being run is below. This command cannot "
              "be manually re-run as it will depend on temporary files that "
              "no longer exist.")
        print("\nCommand:", end=' ')
        print(" ".join(cmd), end='\n\n')
    subprocess.run(cmd, check=True)


inputs = {'query': FeatureData[Sequence],
          'blastdb': BLASTDB,
          'reference_reads': FeatureData[Sequence]}

input_descriptions = {'query': 'Query sequences.',
                      'blastdb': 'BLAST indexed database. Incompatible with '
                      'reference_reads.',
                      'reference_reads': 'Reference sequences. Incompatible '
                      'with blastdb.'}

classification_output = ('classification', FeatureData[Taxonomy])

classification_output_description = {
    'classification': 'Taxonomy classifications of query sequences.'}

parameters = {'evalue': Float,
              'maxaccepts': Int % Range(1, None),
              'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True),
              'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
              'strand': Str % Choices(['both', 'plus', 'minus']),
              'output_no_hits': Bool,
              'num_threads': Threads,
              }

parameter_descriptions = {
    'evalue': 'BLAST expectation value (E) threshold for saving hits.',
    'strand': ('Align against reference sequences in forward ("plus"), '
               'reverse ("minus"), or both directions ("both").'),
    'maxaccepts': ('Maximum number of hits to keep for each query. BLAST will '
                   'choose the first N hits in the reference database that '
                   'exceed perc_identity similarity to query. NOTE: the '
                   'database is not sorted by similarity to query, so these '
                   'are the first N hits that pass the threshold, not '
                   'necessarily the top N hits.'),
    'perc_identity': ('Reject match if percent identity to query is lower.'),
    'query_cov': 'Reject match if query alignment coverage per high-'
                 'scoring pair is lower. Note: this uses blastn\'s '
                 'qcov_hsp_perc parameter, and may not behave identically '
                 'to the query_cov parameter used by classify-consensus-'
                 'vsearch.',
    'output_no_hits': 'Report both matching and non-matching queries. '
                      'WARNING: always use the default setting for this '
                      'option unless if you know what you are doing! If '
                      'you set this option to False, your sequences and '
                      'feature table will need to be filtered to exclude '
                      'unclassified sequences, otherwise you may run into '
                      'errors downstream from missing feature IDs. Set to '
                      'FALSE to mirror default BLAST search.',
    'num_threads': 'Number of threads (CPUs) to use in the BLAST search. '
                   'Pass 0 to use all available CPUs.',
}

blast6_output = ('search_results', FeatureData[BLAST6])

blast6_output_description = {'search_results': 'Top hits for each query.'}


plugin.methods.register_function(
    function=makeblastdb,
    inputs={'sequences': FeatureData[Sequence]},
    parameters={},
    outputs=[('database', BLASTDB)],
    input_descriptions={'sequences': 'Input reference sequences.'},
    parameter_descriptions={},
    output_descriptions={'database': 'Output BLAST database.'},
    name='Make BLAST database.',
    description=('Make BLAST database from custom sequence collection.'),
    citations=[citations['camacho2009blast+']]
)


# Note: name should be changed to blastn if we do NOT generalize this function
plugin.methods.register_function(
    function=blast,
    inputs=inputs,
    parameters=parameters,
    outputs=[blast6_output],
    input_descriptions=input_descriptions,
    parameter_descriptions=parameter_descriptions,
    output_descriptions=blast6_output_description,
    name='BLAST+ local alignment search.',
    description=('Search for top hits in a reference database via local '
                 'alignment between the query sequences and reference '
                 'database sequences using BLAST+. Returns a report '
                 'of the top M hits for each query (where M=maxaccepts).'),
    citations=[citations['camacho2009blast+']]
)


plugin.pipelines.register_function(
    function=classify_consensus_blast,
    inputs={**inputs,
            'reference_taxonomy': FeatureData[Taxonomy]},
    parameters={**parameters,
                **min_consensus_param,
                'unassignable_label': Str},
    outputs=[classification_output, blast6_output],
    input_descriptions={**input_descriptions,
                        'reference_taxonomy': 'reference taxonomy labels.'},
    parameter_descriptions={
        **parameter_descriptions,
        **min_consensus_param_description,
        'unassignable_label': 'Annotation given to sequences without any hits.'
    },
    output_descriptions={**classification_output_description,
                         **blast6_output_description},
    name='BLAST+ consensus taxonomy classifier',
    description=('Assign taxonomy to query sequences using BLAST+. Performs '
                 'BLAST+ local alignment between query and reference_reads, '
                 'then assigns consensus taxonomy to each query sequence from '
                 'among maxaccepts hits, min_consensus of which share '
                 'that taxonomic assignment. Note that maxaccepts selects the '
                 'first N hits with > perc_identity similarity to query, '
                 'not the top N matches. For top N hits, use '
                 'classify-consensus-vsearch.'),
    citations=[citations['camacho2009blast+']]
)