1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
|
# Copyright 2012 by Wibowo Arindrarto. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Bio.SearchIO parser for Exonerate cigar output format."""
import re
from Bio._py3k import _as_bytes, _bytes_to_string
from ._base import _BaseExonerateParser, _STRAND_MAP
from .exonerate_vulgar import ExonerateVulgarIndexer
__all__ = ['ExonerateCigarParser', 'ExonerateCigarIndexer']
# precompile regex
_RE_CIGAR = re.compile(r"""^cigar:\s+
(\S+)\s+(\d+)\s+(\d+)\s+([\+-\.])\s+ # query: ID, start, end, strand
(\S+)\s+(\d+)\s+(\d+)\s+([\+-\.])\s+ # hit: ID, start, end, strand
(\d+)(\s+.*)$ # score, vulgar components
""", re.VERBOSE)
class ExonerateCigarParser(_BaseExonerateParser):
"""Parser for Exonerate cigar strings."""
_ALN_MARK = 'cigar'
def parse_alignment_block(self, header):
qresult = header['qresult']
hit = header['hit']
hsp = header['hsp']
self.read_until(lambda line: line.startswith('cigar'))
cigars = re.search(_RE_CIGAR, self.line)
# if the file has c4 alignments
# check if cigar values match our previously parsed header values
if self.has_c4_alignment:
assert qresult['id'] == cigars.group(1)
assert hsp['query_start'] == cigars.group(2)
assert hsp['query_end'] == cigars.group(3)
assert hsp['query_strand'] == cigars.group(4)
assert hit['id'] == cigars.group(5)
assert hsp['hit_start'] == cigars.group(6)
assert hsp['hit_end'] == cigars.group(7)
assert hsp['hit_strand'] == cigars.group(8)
assert hsp['score'] == cigars.group(9)
else:
qresult['id'] = cigars.group(1)
hsp['query_start'] = cigars.group(2)
hsp['query_end'] = cigars.group(3)
hsp['query_strand'] = cigars.group(4)
hit['id'] = cigars.group(5)
hsp['hit_start'] = cigars.group(6)
hsp['hit_end'] = cigars.group(7)
hsp['hit_strand'] = cigars.group(8)
hsp['score'] = cigars.group(9)
# adjust strands
hsp['query_strand'] = _STRAND_MAP[hsp['query_strand']]
hsp['hit_strand'] = _STRAND_MAP[hsp['hit_strand']]
# cast coords into ints
qstart = int(hsp['query_start'])
qend = int(hsp['query_end'])
hstart = int(hsp['hit_start'])
hend = int(hsp['hit_end'])
# set coords (start <= end)
hsp['query_start'] = min(qstart, qend)
hsp['query_end'] = max(qstart, qend)
hsp['hit_start'] = min(hstart, hend)
hsp['hit_end'] = max(hstart, hend)
# cast score into int
hsp['score'] = int(hsp['score'])
# store cigar components
hsp['cigar_comp'] = cigars.group(10)
# HACK: since we can't really figure out exactly when a
# HSP starts or ends, we set the entire alignment as one HSP
hsp['query_ranges'] = [(hsp['query_start'], hsp['query_end'])]
hsp['hit_ranges'] = [(hsp['hit_start'], hsp['hit_end'])]
return {'qresult': qresult, 'hit': hit, 'hsp': hsp}
class ExonerateCigarIndexer(ExonerateVulgarIndexer):
"""Indexer class for exonerate cigar lines."""
_parser = ExonerateCigarParser
_query_mark = _as_bytes('cigar')
def get_qresult_id(self, pos):
"""Returns the query ID of the nearest cigar line."""
handle = self._handle
handle.seek(pos)
# get line, check if it's a vulgar line, and get query ID
line = handle.readline()
assert line.startswith(self._query_mark), line
id = re.search(_RE_CIGAR, _bytes_to_string(line))
return id.group(1)
# if not used as a module, run the doctest
if __name__ == "__main__":
from Bio._utils import run_doctest
run_doctest()
|