1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
|
#cython: language_level=3
'''
Created on June 12th 2018
@author: coissac/mercier
'''
import logging
import re
from itertools import chain
from obitools3.utils cimport str2bytes
# Column layout of a feature table line (EMBL lines carry an 'FT' tag in the
# first two columns, GenBank lines carry two blanks instead):
#   - a feature starts on a line with 3 blanks after the tag, then the key;
#   - every following line of the same feature has at least 4 blanks there.
# The blank counts are written with explicit quantifiers so that they stay
# consistent with ftParser (5 leading blanks before the key) and
# qualifierIterator (21 leading blanks before '/').
_featureMatcher = re.compile(b'^(FT| {2}) {3}[^ ].+\n((FT| {2}) {4}.+\n)+',re.M)
_featureCleaner = re.compile(b'^FT',re.M)


def textFeatureIterator(fttable):
    '''
    Iterate through a textual description of a feature table in genbank
    or embl format. Return at each step the text of one individual
    feature composing the table.

    The leading 'FT' tag of embl lines is replaced by two blanks, so that
    embl and genbank features come out with the same column layout.

    @param fttable: the feature table of a genbank or an embl entry
    @type fttable: C{bytes}

    @return: an iterator on the text of each feature
    @rtype: iterator on C{bytes}

    @see: L{ftParser}
    '''
    blank_tag = b' ' * 2    # same width as the 'FT' tag it replaces
    for m in _featureMatcher.finditer(fttable):
        yield _featureCleaner.sub(blank_tag, m.group())
_qualifierMatcher = re.compile(b'(?<=^ {21}/).+(\n {21}[^/].+)*',re.M)
_qualifierCleanner= re.compile(b"^ +",re.M)


def qualifierIterator(qualifiers):
    '''
    Parse the qualifier part of a feature in embl or genbank format
    and iterate through its (key, value) pairs.

    A qualifier without '=' yields C{(key, None)}. Otherwise the lines of
    the value are joined (without separator for C{translation}, with one
    blank for any other key) and the value is interpreted as a Python
    literal when possible: quoted text is returned as bytes, numbers as
    numbers. A value that is not a literal is returned as raw bytes.

    @param qualifiers: the text containing the qualifiers
    @type qualifiers: C{bytes}

    @return: an iterator on tuple (key,value), where keys are C{bytes}
    @rtype: iterator
    '''
    # Function-scope import: keeps this block independent from the
    # import section of the file.
    import ast

    for m in _qualifierMatcher.finditer(qualifiers):
        # Drop the indentation of every line, then split key from value
        entry = _qualifierCleanner.sub(b'', m.group()).split(b'=', 1)
        key = entry[0]

        if len(entry) == 1:
            yield (key, None)
            continue

        # Protein translations are wrapped without blanks, free text
        # is wrapped on blanks.
        joiner = b'' if key == b'translation' else b' '
        value = entry[1].replace(b'\n', joiner)

        # SECURITY NOTE: the value comes straight from the parsed file.
        # It used to be passed to eval(), which executes any expression
        # found in the data and also turns bare words that happen to be
        # Python names (len, id, ...) into arbitrary objects.
        # ast.literal_eval only accepts literals, so quoted strings and
        # numbers are still converted while everything else is kept as
        # raw bytes.
        try:
            value = ast.literal_eval(value.decode('utf-8'))
            if isinstance(value, str):
                value = str2bytes(value)
        except (ValueError, SyntaxError, TypeError,
                MemoryError, RecursionError):
            # Not a literal (or not convertible): keep what we have
            pass

        yield (key, value)
# Raw bytes literal: '\S' is a regex class, not a string escape.
_ftmatcher = re.compile(br'(?<=^ {5})\S+')
_qualifiersMatcher = re.compile(b'^ +/.+', re.M | re.DOTALL)


def ftParser(feature):
    '''
    Split the text of one feature into its type and its qualifier part.

    The type is the first word found after the 5 leading blanks of the
    text. The qualifier part runs from the first line made of blanks
    followed by '/' down to the end of the text; when no such line exists
    an empty bytes string is returned and a debug message is logged.

    @param feature: the text of one feature, as produced by
                    L{textFeatureIterator}
    @type feature: C{bytes}

    @return: the tuple (feature type, qualifiers text)
    @rtype: tuple of C{bytes}

    @raise AttributeError: when the text does not start with 5 blanks
                           followed by a feature type
    '''
    fttype = _ftmatcher.search(feature).group()

    found = _qualifiersMatcher.search(feature)
    if found is None:
        # Let logging do the formatting only if debug is enabled
        logging.debug("Qualifiers regex not matching on \n=====\n%s\n========", feature)
        return fttype, b""

    return fttype, found.group()
class Feature(dict):
    '''
    A dictionary carrying, on top of its items, the type name of the
    feature it describes. The type is given at construction time and is
    readable through L{getFttype} or the C{ftType} property.
    '''

    def __init__(self,type):
        # Only the type is recorded; the dictionary itself starts empty
        self._fttype = type

    def getFttype(self):
        '''
        @return: the type given when the feature was built
        '''
        return self._fttype

    ftType = property(fget=getFttype, doc="Feature type name")

    def __repr__(self):
        # Text of the pair (type, text of the dictionary content)
        content = dict.__repr__(self)
        return str((self.ftType, content))

    def __str__(self):
        return repr(self)
def featureFactory(featureDescription):
    '''
    Build a L{Feature} from the text of one feature.

    The text is split by L{ftParser}; the resulting type becomes the type
    of the feature and every qualifier returned by L{qualifierIterator}
    is stored under its key, in a list collecting all the values met for
    that key. The unparsed text is kept in the C{raw} attribute.

    @param featureDescription: the text of one feature

    @return: the feature built from the text
    @rtype: L{Feature}
    '''
    parsed = ftParser(featureDescription)
    built = Feature(parsed[0])
    built.raw = featureDescription

    for key, value in qualifierIterator(parsed[1]):
        if key not in built:
            built[key] = []
        built[key].append(value)

    return built
def featureIterator(featureTable,skipError=False):
    '''
    Iterate through the features of a feature table, each one being
    built by L{featureFactory} from the text returned by
    L{textFeatureIterator}.

    @param featureTable: the feature table of a genbank or an embl entry
    @param skipError: when true, a feature whose parsing raises an
                      C{AssertionError} is logged and skipped; when
                      false the error is raised again
    @type skipError: C{bool}

    @return: an iterator on the parsed features
    @rtype: iterator

    @raise AssertionError: on a parsing error when skipError is false
    '''
    for tft in textFeatureIterator(featureTable):
        try:
            feature = featureFactory(tft)
        except AssertionError:
            logging.debug("Parsing error on feature :\n===============\n%s\n===============", tft)
            if not skipError:
                # Bare raise keeps the original traceback
                raise
            logging.debug("\t===> Error skipped")
            continue
        yield feature
def extractTaxon(bytes text, dict tags):
    '''
    Fill C{tags} with the taxid and the organism name read from the
    first feature of a feature table.

    Only the first feature returned by L{featureIterator} is looked at.
    C{tags[b'TAXID']} is set when its C{db_xref} values starting with
    C{taxon:} give exactly one number and that number is not negative.
    C{tags[b'organism']} is set when its C{organism} values reduce to
    exactly one distinct value.

    @param text: the feature table
    @type text: C{bytes}
    @param tags: the dictionary to update in place
    @type tags: C{dict}

    @raise StopIteration: when the table contains no feature
    '''
    first = next(featureIterator(text))

    # Distinct taxids declared by the feature
    taxids = set()
    if b'db_xref' in first:
        for xref in first[b'db_xref']:
            if xref[0:6] == b'taxon:':
                taxids.add(int(xref[6:]))

    if len(taxids) == 1:
        taxid = taxids.pop()
        if taxid >= 0:
            tags[b'TAXID'] = taxid

    # Distinct organism names declared by the feature
    if b'organism' in first:
        organisms = set(first[b'organism'])
    else:
        organisms = set()

    if len(organisms) == 1:
        tags[b'organism'] = organisms.pop()
|