File: embl_genbank_features.pyx

package info (click to toggle)
obitools 3.0.1~b26%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 26,788 kB
  • sloc: ansic: 24,299; python: 657; sh: 27; makefile: 20
file content (148 lines) | stat: -rwxr-xr-x 4,120 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#cython: language_level=3

'''
Created on June 12th 2018

@author: coissac/mercier
'''


import logging
import re
from itertools import chain
from obitools3.utils cimport str2bytes

        
# One complete feature: a key line (a two-character 'FT' or blank prefix, three
# blanks, then a non-blank character) followed by one or more continuation
# lines that carry at least four blanks after the prefix.
_featureMatcher = re.compile(
    b'^(FT|  )   [^ ].+\n((FT|  )    .+\n)+',
    flags=re.MULTILINE,
)
# The 'FT' prefix found at the beginning of a line.
_featureCleaner = re.compile(b'^FT', flags=re.MULTILINE)


def textFeatureIterator(fttable):
    '''
    Split the textual feature table of a genbank or embl entry into the
    text of its individual features.

    Every chunk matched by C{_featureMatcher} is yielded after the 'FT'
    prefix of each of its lines has been replaced by two blanks. A key line
    that is not followed by any continuation line is not matched, and is
    therefore not yielded.

    @param fttable: the feature table of a genbank or an embl entry
    @type fttable: C{bytes} (the patterns used here are bytes patterns)

    @return: an iterator over the text of each feature
    @rtype: iterator on C{bytes}

    @see: L{ftParser}
    '''
    for match in _featureMatcher.finditer(fttable):
        yield _featureCleaner.sub(b'  ', match.group())
   
        
# The text of one qualifier, without its leading '/': the rest of the line
# that starts with 21 blanks and a '/', plus any following lines indented by
# 21 blanks whose first character is not '/'.
_qualifierMatcher = re.compile(b'(?<=^ {21}/).+(\n {21}[^/].+)*',re.M)
# The run of blanks at the beginning of each line of a qualifier.
_qualifierCleanner= re.compile(b"^ +",re.M)

def qualifierIterator(qualifiers):
    '''
    Parse the qualifier part of a feature in embl or genbank format, as
    returned by L{ftParser}, and iterate over its (key, value) pairs.

    The value of a qualifier without '=' is C{None}. Otherwise the lines of
    the value are joined (without separator for C{translation}, with a
    blank for every other key) and the result is evaluated as a Python
    expression: a quoted value becomes its content converted back to bytes
    with C{str2bytes}, a number becomes an C{int} or C{float}. When the
    evaluation fails the joined bytes are kept unchanged.

    The expression is evaluated in an empty namespace, so that an unquoted
    value which happens to be the name of a builtin, of a module global or
    of a local variable (C{/label=max} for instance) stays plain bytes
    instead of being replaced by the corresponding Python object.

    @param qualifiers: text containing the qualifiers of one feature
    @type qualifiers: C{bytes}

    @return: an iterator on tuple (key,value), where keys are C{bytes}
    @rtype: iterator
    '''
    for match in _qualifierMatcher.finditer(qualifiers):
        text = _qualifierCleanner.sub(b'', match.group())
        key, separator, raw = text.partition(b'=')

        if not separator:
            # Flag qualifier (no '=' sign): there is no value to decode.
            yield (key, None)
            continue

        # A translation is one sequence wrapped over several lines, every
        # other value is free text whose lines are separated by a blank.
        joiner = b'' if key == b'translation' else b' '
        value = raw.replace(b'\n', joiner)

        try:
            # SECURITY NOTE(review): eval() is applied to text read from an
            # external flat file. The empty namespace only prevents name
            # lookups in this module and in the builtins; it is NOT a
            # sandbox. Consider ast.literal_eval if feature tables can come
            # from untrusted sources.
            value = eval(value, {'__builtins__': {}}, {})
            if isinstance(value, str):
                value = str2bytes(value)
        except Exception:
            # Best effort: a value that is not a valid Python expression is
            # kept as it is.
            pass

        yield (key, value)
    
     
# The feature key: the first run of non-blank characters located right after
# five blanks at the very beginning of the feature text. Written as a raw
# literal so that '\S' is not an invalid string escape sequence.
_ftmatcher = re.compile(br'(?<=^ {5})\S+')
# Everything from the first line made of blanks followed by '/' up to the
# end of the text (DOTALL lets '.' run across line ends).
_qualifiersMatcher = re.compile(b'^ +/.+',re.M|re.DOTALL)

def ftParser(feature):
    '''
    Split the text of one feature into its type and its qualifier part.

    @param feature: text of one feature, as yielded by
                    L{textFeatureIterator}
    @type feature: C{bytes}

    @return: a tuple (feature type, qualifiers text); the qualifiers text
             is C{b""} when the feature has no qualifier line, in which
             case a debug message is logged
    @rtype: tuple of C{bytes}

    @raise AttributeError: if the text does not start with five blanks
                           followed by a feature key
    '''
    fttype = _ftmatcher.search(feature).group()

    found = _qualifiersMatcher.search(feature)
    if found is None:
        # Arguments are passed to logging instead of being formatted here,
        # so that nothing is rendered when debug logging is off.
        logging.debug("Qualifiers regex not matching on \n=====\n%s\n========", feature)
        return fttype,b""

    return fttype,found.group()


class Feature(dict):
    '''
    A dictionary tagged with a feature type.

    The dictionary content is not set by the constructor; in this file
    L{featureFactory} fills it with qualifier key -> list of values.
    '''

    def __init__(self,type):
        # Only the feature type is stored, the dictionary itself stays empty.
        self._fttype=type

    def getFttype(self):
        '''
        @return: the feature type given to the constructor
        '''
        return self._fttype

    # Read-only access to the feature type.
    ftType = property(fget=getFttype, doc="Feature type name")

    def __repr__(self):
        # Textual form of the pair (feature type, repr of the dictionary).
        content = dict.__repr__(self)
        return str((self.ftType, content))

    def __str__(self):
        return repr(self)
       

def featureFactory(featureDescription):
    '''
    Build a L{Feature} from the text of one feature.

    The feature type and the qualifier text are obtained from L{ftParser}.
    The original text is kept in the C{raw} attribute of the result, and
    each qualifier key is mapped to the list of its values, in the order
    L{qualifierIterator} yields them.

    @param featureDescription: text of one feature
    @type featureDescription: C{bytes}

    @return: the feature built from the text
    @rtype: L{Feature}
    '''
    kind, qualifierText = ftParser(featureDescription)

    built = Feature(kind)
    built.raw = featureDescription

    for key, value in qualifierIterator(qualifierText):
        # A key may occur several times: collect all its values.
        if key not in built:
            built[key] = []
        built[key].append(value)

    return built
       
        
def featureIterator(featureTable,skipError=False):
    '''
    Iterate over the features of a feature table as L{Feature} instances.

    @param featureTable: the feature table of a genbank or an embl entry
    @type featureTable: C{bytes}
    @param skipError: when true, a feature whose parsing raises an
                      C{AssertionError} is logged and skipped instead of
                      stopping the iteration
    @type skipError: C{bool}

    @return: an iterator on L{Feature}
    @rtype: iterator

    @raise AssertionError: raised by the parsing of a feature, unless
                           C{skipError} is true
    '''
    for tft in textFeatureIterator(featureTable):
        try:
            feature = featureFactory(tft)
        except AssertionError:
            logging.debug("Parsing error on feature :\n===============\n%s\n===============", tft)
            if not skipError:
                # Bare raise: propagate the error with its original traceback.
                raise
            logging.debug("\t===> Error skipped")
            continue

        yield feature

        
def extractTaxon(bytes text, dict tags):
    '''
    Fill C{tags} with the taxid and the organism name found in the first
    feature of a feature table.

    Only the first feature yielded by L{featureIterator} is inspected.
    C{tags[b'TAXID']} is set when its C{db_xref} qualifiers hold exactly one
    distinct, non negative C{taxon:} identifier. C{tags[b'organism']} is set
    when its C{organism} qualifiers hold exactly one distinct value. When no
    feature can be read from the table, C{tags} is left untouched.

    @param text: the feature table of a genbank or an embl entry
    @type text: C{bytes}
    @param tags: dictionary updated in place
    @type tags: C{dict}

    @raise ValueError: if what follows C{taxon:} is not an integer
    '''
    first = next(featureIterator(text), None)
    if first is None:
        # Empty or unparsable table: do not leak StopIteration to the caller.
        return

    taxids = {int(xref[6:]) for xref in first.get(b'db_xref', ())
              if xref[0:6]==b'taxon:'}
    if len(taxids)==1:
        taxid = taxids.pop()
        if taxid >= 0:
            tags[b'TAXID'] = taxid

    organisms = set(first.get(b'organism', ()))
    if len(organisms)==1:
        tags[b'organism'] = organisms.pop()