File: obiaddtaxids.py

package info (click to toggle)
obitools 1.2.13%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 4,652 kB
  • sloc: python: 18,199; ansic: 1,542; makefile: 98
file content (424 lines) | stat: -rw-r--r-- 16,686 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
#!/usr/bin/python3
'''
:py:mod:`obiaddtaxids`: adds *taxids* to sequence records using an ecopcr database
==================================================================================

.. codeauthor:: Celine Mercier <celine.mercier@metabarcoding.org>

The :py:mod:`obiaddtaxids` command annotates sequence records with a *taxid* based on 
a taxon scientific name stored in the sequence record header.

Taxonomic information linking a *taxid* to a taxon scientific name is stored in a 
database formatted as an ecoPCR database (see :doc:`obitaxonomy <obitaxonomy>`) or 
a NCBI taxdump (see NCBI ftp site).

The way to extract the taxon scientific name from the sequence record header can be
specified by two options:

    - By default, the sequence identifier is used. Underscore characters (``_``) are substituted
      by spaces before looking for the taxon scientific name into the taxonomic
      database.

    - If the input file is an ``OBITools`` extended :doc:`fasta <../fasta>` format, the ``-k`` option
      specifies the attribute containing the taxon scientific name.

    - If the input file is a :doc:`fasta <../fasta>` file imported from the UNITE or from the SILVA web sites,
      the ``-f`` option allows specifying this source and parsing correctly the associated 
      taxonomic information.
      
  
For each sequence record, :py:mod:`obiaddtaxids` tries to match the extracted taxon scientific name 
with those stored in the taxonomic database.

    - If a match is found, the sequence record is annotated with the corresponding *taxid*.

Otherwise,
    
    - If the ``-g`` option is set and the taxon name is composed of two words and only the 
      first one is found in the taxonomic database at the 'genus' rank, :py:mod:`obiaddtaxids` 
      considers that it found the genus associated with this sequence record and it stores this 
      sequence record in the file specified by the ``-g`` option.
    
    - If the ``-u`` option is set and no taxonomic information is retrieved from the 
      scientific taxon name, the sequence record is stored in the file specified by the 
      ``-u`` option.

    *Example*
    
    
    .. code-block:: bash
    
       > obiaddtaxids -k species_name -g genus_identified.fasta \\
                      -u unidentified.fasta -d my_ecopcr_database \\
                      my_sequences.fasta > identified.fasta

    Tries to match the value associated with the ``species_name`` key of each sequence record 
    from the ``my_sequences.fasta`` file with a taxon name from the ecoPCR database ``my_ecopcr_database``. 
        
            - If there is an exact match, the sequence record is stored in the ``identified.fasta`` file. 
        
            - If not and the ``species_name`` value is composed of two words, :py:mod:`obiaddtaxids` 
              considers the first word as a genus name and tries to find it into the taxonomic database. 
        
                - If a genus is found, the sequence record is stored in the ``genus_identified.fasta``
                  file. 
                  
                - Otherwise the sequence record is stored in the ``unidentified.fasta`` file.

'''


import re

from obitools.fasta import fastaIterator,formatFasta
from obitools.options import getOptionManager
from obitools.options.taxonomyfilter import addTaxonomyDBOptions
from obitools.options.taxonomyfilter import loadTaxonomyDatabase
from obitools.format.genericparser import genericEntryIteratorGenerator
from obitools import NucSequence


def addObiaddtaxidsOptions(optionManager):
    
    optionManager.add_option('-g','--genus_found',
                             action="store", dest="genus_found",
                             metavar="<FILENAME>",
                             type="string",
                             default=None,
                             help="(not with UNITE databases) file used to store sequences with the genus found.")

    optionManager.add_option('-u','--unidentified',
                             action="store", dest="unidentified",
                             metavar="<FILENAME>",
                             type="string",
                             default=None,
                             help="file used to store completely unidentified sequences.")

    optionManager.add_option('-s','--dirty',
                             action='store', dest="dirty",
                             metavar="<FILENAME>",
                             type="str",
                             default=None,
                             help="(not with UNITE databases) if chosen, ALL the words in the name used to identify the sequences will be searched"
                                  " when neither the exact name nor the genus have been found."
                                  " Only use if the sequences in your database are badly named with useless words or numbers"
                                  " in the name etc."
                                  " The sequences identified this way will be written in <FILENAME>.")
    
    optionManager.add_option('-f','--format',
                             action="store", dest="db_type",
                             metavar="<FORMAT>",
                             type="string",
                             default='raw',
                             help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE_FULL', 'UNITE_GENERAL' or 'SILVA'."
                                  "The UNITE_FULL format is the one used for the 'Full UNITE+INSD dataset', and the UNITE_GENERAL format is the "
                                  "one used for the 'General FASTA release'."
                                  " Default : raw.")
    
    optionManager.add_option('-k','--key-name',
                             action="store", dest="tagname",
                             metavar="<KEYNAME>",
                             type="string",
                             default='',
                             help="name of the key attribute containing the taxon name in databases of 'raw' type. Default : the taxon name is the id "
                             "of the sequence. The taxon name MUST have '_' between the words of the name when it's the id, and "
                             "CAN be of this form when it's in a field.")

    optionManager.add_option('-a','--restricting_ancestor',
                             action="store", dest="res_anc",
                             type="str",
                             metavar="<ANCESTOR>",
                             default='',
                             help="can be a word or a taxid (number). Enables to restrict the search of taxids under a "
                                  "specified ancestor. If it's a word, it's the field containing the ancestor's taxid "
                                  "in each sequence's header (can be different for each sequence). If it's a number, "
                                  "it's the taxid of the ancestor (in which case it's the same for all the sequences)")



def numberInStr(s) :
    containsNumber = False
    for c in s :
        if c.isdigit() :
            containsNumber = True
    return containsNumber


def UNITEIterator_FULL(f):
    
    fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
    for entry in fastaEntryIterator(f) :
        all = entry.split('\n')
        header = all[0]
        fields = header.split('|')        
        seq_id = fields[0][1:]
        seq = all[1]
        s = NucSequence(seq_id, seq)
        
        path = fields[1]

        species_name_loc = path.index('s__')
        species_name_loc+=3
        s['species_name'] = path[species_name_loc:]
        
        genus_name_loc = path.index('g__')
        genus_name_loc+=3
        s['genus_name'] = path[genus_name_loc:species_name_loc-4]
        
        path = re.sub('[a-z]__', '', path)
        s['path'] = path.replace(';', ',')

        yield s


def UNITEIterator_GENERAL(f):
    
    fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
    for entry in fastaEntryIterator(f) :
        all = entry.split('\n')
        header = all[0]
        fields = header.split('|')  
        
        seq_id = fields[0][1:]
        seq = all[1]
        s = NucSequence(seq_id, seq)
        
        s['species_name'] = seq_id.replace("_", " ")
        
        path = fields[4]
        path = re.sub('[a-z]__', '', path)
        path = path.replace(';', ',')
        s['path'] = path.replace(',,', ',')
        
        yield s


def SILVAIterator(f, tax):
    
    fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
    for entry in fastaEntryIterator(f) :
        all = entry.split('\n')
        header = all[0]
        fields = header.split(' | ')
        id = fields[0][1:]
        seq = all[1]
        s = NucSequence(id, seq)
        
        if (
            '(' in fields[1] 
            and len(fields[1].split('(')[1][:-1]) > 2 
            and ')' not in fields[1].split('(')[1][:-1] 
            and not numberInStr(fields[1].split('(')[1][:-1])
            ) :
            species_name = fields[1].split('(')[0][:-1]
            other_name = fields[1].split('(')[1][:-1]
            
            ancestor = None
            notAnAncestor = False
            
            if (len(other_name.split(' ')) == 1 and other_name[0].isupper()):
                
                try:
                    ancestor = tax.findTaxonByName(other_name)
                except KeyError :
                    notAnAncestor = True
            
            if (ancestor == None and notAnAncestor == False):
                s['common_name'] = other_name
                s['original_silva_name'] = fields[1]
                s['species_name'] = species_name
            
            elif (ancestor != None and notAnAncestor == False) :
                s['ancestor_name'] = other_name
                s['ancestor'] = ancestor[0]
                s['original_silva_name'] = fields[1]
                s['species_name'] = species_name
                
            elif notAnAncestor == True :
                s['species_name'] = fields[1]
                        
        else :
            s['species_name'] = fields[1]
        
        yield s

    
def dirtyLookForSimilarNames(name, tax, ancestor):
    
    similar_name = ''
    taxid = None
    
    try :
        t = tax.findTaxonByName(name)
        taxid = t[0]
        similar_name = t[3]
    
    except KeyError :
        taxid = None
        
    if ancestor != None and not tax.isAncestor(ancestor, taxid) :
        taxid = None
    
    return similar_name, taxid
    

def getGenusTaxid(tax, species_name, ancestor):
    genus_sp = species_name.split(' ')
    genus_taxid = getTaxid(tax, genus_sp[0], ancestor)
    if tax.getRank(genus_taxid) != 'genus' :
        raise KeyError()
    return genus_taxid


def getTaxid(tax, name, ancestor):

    taxid = tax.findTaxonByName(name)[0][0]
    if ancestor != None and not tax.isAncestor(ancestor, taxid) :
        raise KeyError()
    return taxid


def get_species_name(s, options) :
    
    species_name = None
    if options.tagname == '' or options.tagname in s :
        if options.tagname == '' :
            species_name = s.id
        else :
            species_name = s[options.tagname]
                
        if "_" in species_name :
            species_name = species_name.replace('_', ' ')
          
        if len(species_name.split(' ')) == 2 and (species_name.split(' ')[1] == 'sp' or species_name.split(' ')[1] == 'sp.' or species_name.split(' ')[1] == 'unknown') :
            species_name = species_name.split(' ')[0]
          
        if options.tagname == '' :
            s['species_name'] = species_name
    
    return species_name


def getVaguelySimilarNames(species_name, tax, restricting_ancestor) :
    
    kindOfFound = False              
    uselessWords = ['sp', 'sp.', 'fungus', 'fungal', 'unknown', 'strain', 'associated', 'uncultured']
    for word in species_name.split(' ') :
        if word not in uselessWords :
            similar_name, taxid = dirtyLookForSimilarNames(word, tax, restricting_ancestor)
            if taxid != None :
                if len(similar_name) > len(s['species_name']) or kindOfFound == False :
                    s['species_name'] = similar_name
                    kindOfFound = True
    return kindOfFound


def openFiles(options) :
    
    if options.unidentified is not None:
        options.unidentified=open(options.unidentified,'w')
    
    if options.genus_found is not None:
        options.genus_found=open(options.genus_found,'w')
        
    if options.dirty is not None:
        options.dirty = open(options.dirty, 'w')


################################################################################################

if __name__=='__main__':

    optionParser = getOptionManager([addObiaddtaxidsOptions, addTaxonomyDBOptions], progdoc=__doc__)
    
    (options,entries) = optionParser()
    
    tax=loadTaxonomyDatabase(options)
    
    if options.db_type == 'raw' :
        entryIterator = fastaIterator
        entries = entryIterator(entries)
    elif options.db_type == 'UNITE_FULL' :
        entryIterator = UNITEIterator_FULL
        entries = entryIterator(entries)
    elif options.db_type == 'UNITE_GENERAL' :
        entryIterator = UNITEIterator_GENERAL
        entries = entryIterator(entries)
    elif options.db_type == 'SILVA' :
        entryIterator = SILVAIterator
        entries = entryIterator(entries, tax)
        options.tagname = 'species_name'

    openFiles(options)
    
    if (options.db_type == 'raw') or (options.db_type == 'SILVA') :
        
        if options.res_anc == '' :
            restricting_ancestor = None
        elif options.res_anc.isdigit() :
            restricting_ancestor = int(options.res_anc)
        
        for s in entries:
            
            if options.res_anc != '' and not options.res_anc.isdigit():
                restricting_ancestor = int(s[options.res_anc])
            
            species_name = get_species_name(s, options)
            
            if species_name != None :    
                try:
                    taxid = getTaxid(tax, species_name, restricting_ancestor)
                    s['taxid'] = taxid
                    print(formatFasta(s))
                
                except KeyError:
                    
                    genusFound = False
                    if options.genus_found is not None and len(species_name.split(' ')) >= 2 :
                        try:
                            genusTaxid = getGenusTaxid(tax, species_name, restricting_ancestor)
                            s['genus_taxid'] = genusTaxid
                            print(formatFasta(s), file=options.genus_found)
                            genusFound = True
                        except KeyError :
                            pass
                    
                    kindOfFound = False
                    if options.dirty is not None and not genusFound :
                        kindOfFound = getVaguelySimilarNames(species_name, tax, restricting_ancestor)
                        if kindOfFound == True :
                            print(formatFasta(s), file=options.dirty)
                    
                    if options.unidentified is not None and not genusFound and not kindOfFound :
                        print(formatFasta(s), file=options.unidentified)


    elif ((options.db_type =='UNITE_FULL') or (options.db_type =='UNITE_GENERAL')) :
        
        restricting_ancestor = tax.findTaxonByName('Fungi')[0][0]
                
        for s in entries :
            
            try :
                species_name = s['species_name']
                taxid = getTaxid(tax, species_name, restricting_ancestor)
                s['taxid'] = taxid
                s['rank'] = tax.getRank(taxid)
                print(formatFasta(s))
                
            
            except KeyError:
                
                genusFound = False
                if options.genus_found is not None :
                    try:
                        genusTaxid = getGenusTaxid(tax, species_name, restricting_ancestor)
                        s['genus_taxid'] = genusTaxid
                        print(formatFasta(s), file=options.genus_found)
                        genusFound = True
                    
                    except KeyError:
                        pass
                
                if options.unidentified is not None and not genusFound :
                    print(formatFasta(s), file=options.unidentified)