File: _vsearch.py

# ----------------------------------------------------------------------------
# Copyright (c) 2016-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import tempfile
import qiime2
import pandas as pd

from q2_types.feature_data import (
    FeatureData, Taxonomy, Sequence, DNAFASTAFormat, BLAST6, BLAST6Format)
from .plugin_setup import plugin, citations
from qiime2.plugin import Int, Str, Float, Choices, Range, Bool, Threads
from ._blast import (_run_command)
from ._consensus_assignment import (DEFAULTUNASSIGNABLELABEL,
                                    min_consensus_param,
                                    min_consensus_param_description)
from ._taxonomic_classifier import TaxonomicClassifier
from .classifier import _classify_parameters, _parameter_descriptions

# Specify default settings for various functions
DEFAULTMAXACCEPTS = 10
DEFAULTPERCENTID = 0.8
DEFAULTQUERYCOV = 0.8
DEFAULTSTRAND = 'both'
DEFAULTSEARCHEXACT = False
DEFAULTTOPHITS = False
DEFAULTMAXHITS = 'all'
DEFAULTMAXREJECTS = 'all'
DEFAULTOUTPUTNOHITS = True
DEFAULTWEAKID = 0.
DEFAULTTHREADS = 1
DEFAULTMINCONSENSUS = 0.51


def vsearch_global(query: DNAFASTAFormat,
                   reference_reads: DNAFASTAFormat,
                   maxaccepts: int = DEFAULTMAXACCEPTS,
                   perc_identity: float = DEFAULTPERCENTID,
                   query_cov: float = DEFAULTQUERYCOV,
                   strand: str = DEFAULTSTRAND,
                   search_exact: bool = DEFAULTSEARCHEXACT,
                   top_hits_only: bool = DEFAULTTOPHITS,
                   maxhits: int = DEFAULTMAXHITS,
                   maxrejects: int = DEFAULTMAXREJECTS,
                   output_no_hits: bool = DEFAULTOUTPUTNOHITS,
                   weak_id: float = DEFAULTWEAKID,
                   threads: str = DEFAULTTHREADS) -> BLAST6Format:
    seqs_fp = str(query)
    ref_fp = str(reference_reads)
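    # vsearch interprets 0 as "unlimited" for --maxaccepts and --maxrejects,
    # so map the user-facing 'all' choice onto that sentinel value.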
    if maxaccepts == 'all':
        maxaccepts = 0
    if maxrejects == 'all':
        maxrejects = 0

    if search_exact:
        cmd = [
            'vsearch',
            '--search_exact', seqs_fp,
            '--strand', strand,
            '--db', ref_fp,
            '--threads', str(threads),
        ]
    else:
        cmd = [
            'vsearch',
            '--usearch_global', seqs_fp,
            '--id', str(perc_identity),
            '--query_cov', str(query_cov),
            '--strand', strand,
            '--maxaccepts', str(maxaccepts),
            '--maxrejects', str(maxrejects),
            '--db', ref_fp,
            '--threads', str(threads),
        ]

    if top_hits_only:
        cmd.append('--top_hits_only')
    if output_no_hits:
        cmd.append('--output_no_hits')
    if weak_id > 0 and weak_id < perc_identity:
        cmd.extend(['--weak_id', str(weak_id)])
    if maxhits != 'all':
        cmd.extend(['--maxhits', str(maxhits)])
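    # Report hits as BLAST-like tab-separated columns (outfmt 6), the layout
    # that BLAST6Format stores.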
    output = BLAST6Format()
    cmd.extend(['--blast6out', str(output)])
    _run_command(cmd)
    return output
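
# Example (not executed): a minimal sketch of calling this action through the
# QIIME 2 Artifact API, assuming the plugin is installed in the active QIIME 2
# environment; the .qza file names are hypothetical placeholders.
#
#   import qiime2
#   from qiime2.plugins import feature_classifier
#
#   query = qiime2.Artifact.load('query-seqs.qza')   # FeatureData[Sequence]
#   refs = qiime2.Artifact.load('ref-seqs.qza')      # FeatureData[Sequence]
#   hits, = feature_classifier.actions.vsearch_global(
#       query=query, reference_reads=refs, perc_identity=0.9)
#   hits.save('search-results.qza')                  # FeatureData[BLAST6]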


def classify_consensus_vsearch(ctx,
                               query,
                               reference_reads,
                               reference_taxonomy,
                               maxaccepts=DEFAULTMAXACCEPTS,
                               perc_identity=DEFAULTPERCENTID,
                               query_cov=DEFAULTQUERYCOV,
                               strand=DEFAULTSTRAND,
                               search_exact=DEFAULTSEARCHEXACT,
                               top_hits_only=DEFAULTTOPHITS,
                               maxhits=DEFAULTMAXHITS,
                               maxrejects=DEFAULTMAXREJECTS,
                               output_no_hits=DEFAULTOUTPUTNOHITS,
                               weak_id=DEFAULTWEAKID,
                               threads=DEFAULTTHREADS,
                               min_consensus=DEFAULTMINCONSENSUS,
                               unassignable_label=DEFAULTUNASSIGNABLELABEL):
    search_db = ctx.get_action('feature_classifier', 'vsearch_global')
    lca = ctx.get_action('feature_classifier', 'find_consensus_annotation')
    result, = search_db(query=query, reference_reads=reference_reads,
                        maxaccepts=maxaccepts, perc_identity=perc_identity,
                        query_cov=query_cov, strand=strand,
                        search_exact=search_exact, top_hits_only=top_hits_only,
                        maxhits=maxhits, maxrejects=maxrejects,
                        output_no_hits=output_no_hits, weak_id=weak_id,
                        threads=threads)
    consensus, = lca(search_results=result,
                     reference_taxonomy=reference_taxonomy,
                     min_consensus=min_consensus,
                     unassignable_label=unassignable_label)
    # New: add BLAST6Format result as an output. This could just as well be a
    # visualizer generated from these results (using q2-metadata tabulate).
    # Would that be more useful to the user than the QZA?
    return consensus, result
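
# Example (not executed): a minimal sketch of running this pipeline through
# the QIIME 2 Artifact API; the .qza file names are hypothetical placeholders.
#
#   import qiime2
#   from qiime2.plugins import feature_classifier
#
#   query = qiime2.Artifact.load('query-seqs.qza')       # FeatureData[Sequence]
#   refs = qiime2.Artifact.load('ref-seqs.qza')          # FeatureData[Sequence]
#   taxonomy = qiime2.Artifact.load('ref-taxonomy.qza')  # FeatureData[Taxonomy]
#   classification, search_results = (
#       feature_classifier.actions.classify_consensus_vsearch(
#           query=query, reference_reads=refs, reference_taxonomy=taxonomy))
#   classification.save('taxonomy.qza')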


def _annotate_method(taxa, method):
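    # View the Taxonomy artifact as a DataFrame, record which classification
    # method produced each assignment, and re-import it as
    # FeatureData[Taxonomy].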
    taxa = taxa.view(pd.DataFrame)
    taxa['Method'] = method
    return qiime2.Artifact.import_data('FeatureData[Taxonomy]', taxa)


def classify_hybrid_vsearch_sklearn(ctx,
                                    query,
                                    reference_reads,
                                    reference_taxonomy,
                                    classifier,
                                    maxaccepts=DEFAULTMAXACCEPTS,
                                    perc_identity=0.5,
                                    query_cov=DEFAULTQUERYCOV,
                                    strand=DEFAULTSTRAND,
                                    min_consensus=DEFAULTMINCONSENSUS,
                                    maxhits=DEFAULTMAXHITS,
                                    maxrejects=DEFAULTMAXREJECTS,
                                    reads_per_batch='auto',
                                    confidence=0.7,
                                    read_orientation='auto',
                                    threads=DEFAULTTHREADS,
                                    prefilter=True,
                                    sample_size=1000,
                                    randseed=0):
    exclude = ctx.get_action('quality_control', 'exclude_seqs')
    ccv = ctx.get_action('feature_classifier', 'classify_consensus_vsearch')
    cs = ctx.get_action('feature_classifier', 'classify_sklearn')
    filter_seqs = ctx.get_action('taxa', 'filter_seqs')
    merge = ctx.get_action('feature_table', 'merge_taxa')

    # randomly subsample reference sequences for rough positive filter
    if prefilter:
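        # Viewing the reference reads as DNAFASTAFormat yields a FASTA file
        # path on disk that vsearch can read directly.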
        ref = str(reference_reads.view(DNAFASTAFormat))
        with tempfile.NamedTemporaryFile() as output:
            cmd = ['vsearch', '--fastx_subsample', ref, '--sample_size',
                   str(sample_size), '--randseed', str(randseed),
                   '--fastaout', output.name]
            _run_command(cmd)
            sparse_reference = qiime2.Artifact.import_data(
                'FeatureData[Sequence]', output.name)

            # perform rough positive filter on query sequences
            query, misses, = exclude(
                query_sequences=query, reference_sequences=sparse_reference,
                method='vsearch', perc_identity=perc_identity,
                perc_query_aligned=query_cov, threads=threads)

    # find exact matches, perform LCA consensus classification
    # note: we only keep the taxonomic assignments, not the search report
    taxa1, _, = ccv(
        query=query, reference_reads=reference_reads,
        reference_taxonomy=reference_taxonomy, maxaccepts=maxaccepts,
        strand=strand, min_consensus=min_consensus, search_exact=True,
        threads=threads, maxhits=maxhits, maxrejects=maxrejects,
        output_no_hits=True)

    # Annotate taxonomic assignments with classification method
    taxa1 = _annotate_method(taxa1, 'VSEARCH')

    # perform second pass classification on unassigned taxa
    # filter out unassigned seqs
    try:
        query, = filter_seqs(sequences=query, taxonomy=taxa1,
                             include=DEFAULTUNASSIGNABLELABEL)
    except ValueError:
        # get ValueError if all sequences are filtered out.
        # so if no sequences are unassigned, return exact match results
        return taxa1

    # classify with sklearn classifier
    taxa2, = cs(reads=query, classifier=classifier,
                reads_per_batch=reads_per_batch, n_jobs=threads,
                confidence=confidence, read_orientation=read_orientation)

    # Annotate taxonomic assignments with classification method
    taxa2 = _annotate_method(taxa2, 'sklearn')

    # merge into one big happy result
    taxa, = merge(data=[taxa2, taxa1])
    return taxa
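
# Example (not executed): a minimal sketch of running the hybrid pipeline
# through the QIIME 2 Artifact API, assuming a pre-trained classifier artifact
# is available; file names are hypothetical placeholders.
#
#   import qiime2
#   from qiime2.plugins import feature_classifier
#
#   query = qiime2.Artifact.load('query-seqs.qza')
#   refs = qiime2.Artifact.load('ref-seqs.qza')
#   taxonomy = qiime2.Artifact.load('ref-taxonomy.qza')
#   clf = qiime2.Artifact.load('classifier.qza')         # TaxonomicClassifier
#   classification, = (
#       feature_classifier.actions.classify_hybrid_vsearch_sklearn(
#           query=query, reference_reads=refs, reference_taxonomy=taxonomy,
#           classifier=clf))
#   classification.save('hybrid-taxonomy.qza')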


parameters = {'maxaccepts': Int % Range(1, None) | Str % Choices(['all']),
              'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True),
              'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
              'strand': Str % Choices(['both', 'plus']),
              'threads': Threads,
              'maxhits': Int % Range(1, None) | Str % Choices(['all']),
              'maxrejects': Int % Range(1, None) | Str % Choices(['all'])}

extra_params = {'search_exact': Bool,
                'top_hits_only': Bool,
                'output_no_hits': Bool,
                'weak_id': Float % Range(0.0, 1.0, inclusive_end=True)}

inputs = {'query': FeatureData[Sequence],
          'reference_reads': FeatureData[Sequence]}

input_descriptions = {'query': 'Query sequences.',
                      'reference_reads': 'Reference sequences.'}

parameter_descriptions = {
    'strand': 'Align against reference sequences in forward ("plus") '
              'or both directions ("both").',
    'maxaccepts': 'Maximum number of hits to keep for each query. Set to '
                  '"all" to keep all hits > perc_identity similarity. Note '
                  'that if strand=both, maxaccepts will keep N hits for each '
                  'direction (if searches in the opposite direction yield '
                  'results that exceed the minimum perc_identity). In those '
                  'cases use maxhits to control the total number of hits '
                  'returned. This option works in tandem with maxrejects. '
                  'The search process sorts target sequences by decreasing '
                  'number of k-mers they have in common with the query '
                  'sequence, using that information as a proxy for sequence '
                  'similarity. After pairwise alignments, if the first target '
                  'sequence passes the acceptance criteria, it is accepted '
                  'as best hit and the search process stops for that query. '
                  'If maxaccepts is set to a higher value, more hits are '
                  'accepted. If maxaccepts and maxrejects are both set to '
                  '"all", the complete database is searched.',
    'perc_identity': 'Reject match if percent identity to query is '
                     'lower.',
    'query_cov': 'Reject match if query alignment coverage per high-'
                 'scoring pair is lower.',
    'threads': 'Number of threads to use for job parallelization. Pass 0 to '
               'use one per available CPU.',
    'maxhits': 'Maximum number of hits to show once the search is terminated.',
    'maxrejects': 'Maximum number of non-matching target sequences to '
                  'consider before stopping the search. This option works in '
                  'tandem with maxaccepts (see maxaccepts description for '
                  'details).'}

extra_param_descriptions = {
    'search_exact': 'Search for exact full-length matches to the query '
                    'sequences. Only 100% exact matches are reported and this '
                    'command is much faster than the default. If True, the '
                    'perc_identity, query_cov, maxaccepts, and maxrejects '
                    'settings are ignored. Note: query and reference reads '
                    'must be trimmed to the exact same DNA locus (e.g., '
                    'primer site) because only exact matches will be '
                    'reported.',
    'top_hits_only': 'Only the top hits between the query and reference '
                     'sequence sets are reported. For each query, the top '
                     'hit is the one presenting the highest percentage of '
                     'identity. Multiple equally scored top hits will be '
                     'used for consensus taxonomic assignment if '
                     'maxaccepts is greater than 1.',
    'output_no_hits': 'Report both matching and non-matching queries. '
                      'WARNING: always use the default setting for this '
                      'option unless you know what you are doing! If '
                      'you set this option to False, your sequences and '
                      'feature table will need to be filtered to exclude '
                      'unclassified sequences, otherwise you may run into '
                      'errors downstream from missing feature IDs.',
    'weak_id': 'Report hits with a percent identity of at least this value, '
               'without terminating the search. A normal search stops as '
               'soon as enough hits are found (as defined by maxaccepts, '
               'maxrejects, and perc_identity). Because weak_id reports weak '
               'hits without counting them against maxaccepts, high '
               'perc_identity values can be used, preserving both '
               'speed and sensitivity. Logically, weak_id must be smaller '
               'than the value indicated by perc_identity, otherwise this '
               'option will be ignored.',
}

classification_output = ('classification', FeatureData[Taxonomy])

classification_output_description = {
    'classification': 'Taxonomy classifications of query sequences.'}

blast6_output = ('search_results', FeatureData[BLAST6])

blast6_output_description = {'search_results': 'Top hits for each query.'}

ignore_prefilter = ' This parameter is ignored if `prefilter` is disabled.'


plugin.methods.register_function(
    function=vsearch_global,
    inputs=inputs,
    parameters={**parameters,
                **extra_params},
    outputs=[blast6_output],
    input_descriptions=input_descriptions,
    parameter_descriptions={
        **parameter_descriptions,
        **extra_param_descriptions,
    },
    output_descriptions=blast6_output_description,
    name='VSEARCH global alignment search',
    description=('Search for top hits in a reference database via global '
                 'alignment between the query sequences and reference '
                 'database sequences using VSEARCH. Returns a report of the '
                 'top M hits for each query (where M=maxaccepts or maxhits).'),
    citations=[citations['rognes2016vsearch']]
)


plugin.pipelines.register_function(
    function=classify_consensus_vsearch,
    inputs={**inputs,
            'reference_taxonomy': FeatureData[Taxonomy]},
    parameters={**parameters,
                **extra_params,
                **min_consensus_param,
                'unassignable_label': Str,
                },
    outputs=[classification_output, blast6_output],
    input_descriptions={**input_descriptions,
                        'reference_taxonomy': 'Reference taxonomy labels.'},
    parameter_descriptions={
        **parameter_descriptions,
        **extra_param_descriptions,
        **min_consensus_param_description,
        'unassignable_label': 'Annotation given to sequences without any hits.'
    },
    output_descriptions={**classification_output_description,
                         **blast6_output_description},
    name='VSEARCH-based consensus taxonomy classifier',
    description=('Assign taxonomy to query sequences using VSEARCH. Performs '
                 'VSEARCH global alignment between query and reference_reads, '
                 'then assigns consensus taxonomy to each query sequence from '
                 'among maxaccepts top hits, min_consensus of which share '
                 'that taxonomic assignment. Unlike classify-consensus-blast, '
                 'this method searches the entire reference database before '
                 'choosing the top N hits, not the first N hits.'),
    citations=[citations['rognes2016vsearch']]
)


plugin.pipelines.register_function(
    function=classify_hybrid_vsearch_sklearn,
    inputs={**inputs,
            'reference_taxonomy': FeatureData[Taxonomy],
            'classifier': TaxonomicClassifier},
    parameters={**parameters,
                **min_consensus_param,
                'reads_per_batch': _classify_parameters['reads_per_batch'],
                'confidence': _classify_parameters['confidence'],
                'read_orientation': _classify_parameters['read_orientation'],
                'prefilter': Bool,
                'sample_size': Int % Range(1, None),
                'randseed': Int % Range(0, None)},
    outputs=[classification_output],
    input_descriptions={**input_descriptions,
                        'reference_taxonomy': 'Reference taxonomy labels.',
                        'classifier': 'Pre-trained sklearn taxonomic '
                                      'classifier for classifying the reads.'},
    parameter_descriptions={
        **{k: parameter_descriptions[k] for k in [
            'strand', 'maxaccepts', 'threads']},
        **min_consensus_param_description,
        'perc_identity': 'Percent sequence similarity to use for PREFILTER. ' +
                         parameter_descriptions['perc_identity'] + ' Set to a '
                         'lower value to perform a rough pre-filter.' +
                         ignore_prefilter,
        'query_cov': 'Query coverage threshold to use for PREFILTER. ' +
                     parameter_descriptions['query_cov'] + ' Set to a '
                     'lower value to perform a rough pre-filter.' +
                     ignore_prefilter,
        'confidence': _parameter_descriptions['confidence'],
        'read_orientation': 'Direction of reads with respect to reference '
                            'sequences in pre-trained sklearn classifier. '
                            '"same" will cause reads to be classified '
                            'unchanged; "reverse-complement" will cause reads '
                            'to be reversed and complemented prior to '
                            'classification. "auto" will autodetect '
                            'orientation based on the confidence estimates '
                            'for the first 100 reads.',
        'reads_per_batch': 'Number of reads to process in each batch for '
                           'sklearn classification. If "auto", this parameter '
                           'is autoscaled to min(number of query sequences / '
                           'threads, 20000).',
        'prefilter': 'Toggle positive filter of query sequences on or off.',
        'sample_size': 'Randomly extract the given number of sequences from '
                       'the reference database to use for prefiltering.' +
                       ignore_prefilter,
        'randseed': 'Use integer as a seed for the pseudo-random generator '
                    'used during prefiltering. A given seed always produces '
                    'the same output, which is useful for replicability. Set '
                    'to 0 to use a pseudo-random seed.' + ignore_prefilter,
    },
    output_descriptions=classification_output_description,
    name='ALPHA Hybrid classifier: VSEARCH exact match + sklearn classifier',
    description=('NOTE: THIS PIPELINE IS AN ALPHA RELEASE. Please report bugs '
                 'to https://forum.qiime2.org!\n'
                 'Assign taxonomy to query sequences using a hybrid '
                 'classifier. First performs a rough positive filter to '
                 'remove artifact and low-coverage sequences (use the '
                 '"prefilter" parameter to toggle this step on or off). '
                 'Second, performs a VSEARCH exact search between the query '
                 'and reference_reads to find exact matches, '
                 'followed by least common ancestor consensus taxonomy '
                 'assignment from among maxaccepts top hits, min_consensus of '
                 'which share that taxonomic assignment. Query sequences '
                 'without an exact match are then classified with a pre-'
                 'trained sklearn taxonomy classifier to predict the most '
                 'likely taxonomic lineage.'),
)