File: exonerate_cigar.py

package info (click to toggle)
python-biopython 1.68%2Bdfsg-3
links: PTS, VCS
area: main
in suites: stretch
size: 46,860 kB
ctags: 13,237
sloc: python: 160,306; xml: 93,216; ansic: 9,118; sql: 1,208; makefile: 155; sh: 63
file content (108 lines) | stat: -rw-r--r-- 3,995 bytes
parent folder | download | duplicates (3)
# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Bio.SearchIO parser for Exonerate cigar output format."""

import re

from Bio._py3k import _as_bytes, _bytes_to_string

from ._base import _BaseExonerateParser, _STRAND_MAP
from .exonerate_vulgar import ExonerateVulgarIndexer


__all__ = ['ExonerateCigarParser', 'ExonerateCigarIndexer']


# precompile regex
_RE_CIGAR = re.compile(r"""^cigar:\s+
        (\S+)\s+(\d+)\s+(\d+)\s+([\+-\.])\s+  # query: ID, start, end, strand
        (\S+)\s+(\d+)\s+(\d+)\s+([\+-\.])\s+  # hit: ID, start, end, strand
        (\d+)(\s+.*)$                         # score, vulgar components
        """, re.VERBOSE)


class ExonerateCigarParser(_BaseExonerateParser):

    """Parser for Exonerate cigar strings."""

    _ALN_MARK = 'cigar'

    def parse_alignment_block(self, header):
        qresult = header['qresult']
        hit = header['hit']
        hsp = header['hsp']
        self.read_until(lambda line: line.startswith('cigar'))
        cigars = re.search(_RE_CIGAR, self.line)
        # if the file has c4 alignments
        # check if cigar values match our previously parsed header values
        if self.has_c4_alignment:
            assert qresult['id'] == cigars.group(1)
            assert hsp['query_start'] == cigars.group(2)
            assert hsp['query_end'] == cigars.group(3)
            assert hsp['query_strand'] == cigars.group(4)
            assert hit['id'] == cigars.group(5)
            assert hsp['hit_start'] == cigars.group(6)
            assert hsp['hit_end'] == cigars.group(7)
            assert hsp['hit_strand'] == cigars.group(8)
            assert hsp['score'] == cigars.group(9)
        else:
            qresult['id'] = cigars.group(1)
            hsp['query_start'] = cigars.group(2)
            hsp['query_end'] = cigars.group(3)
            hsp['query_strand'] = cigars.group(4)
            hit['id'] = cigars.group(5)
            hsp['hit_start'] = cigars.group(6)
            hsp['hit_end'] = cigars.group(7)
            hsp['hit_strand'] = cigars.group(8)
            hsp['score'] = cigars.group(9)

        # adjust strands
        hsp['query_strand'] = _STRAND_MAP[hsp['query_strand']]
        hsp['hit_strand'] = _STRAND_MAP[hsp['hit_strand']]
        # cast coords into ints
        qstart = int(hsp['query_start'])
        qend = int(hsp['query_end'])
        hstart = int(hsp['hit_start'])
        hend = int(hsp['hit_end'])
        # set coords (start <= end)
        hsp['query_start'] = min(qstart, qend)
        hsp['query_end'] = max(qstart, qend)
        hsp['hit_start'] = min(hstart, hend)
        hsp['hit_end'] = max(hstart, hend)
        # cast score into int
        hsp['score'] = int(hsp['score'])
        # store cigar components
        hsp['cigar_comp'] = cigars.group(10)
        # HACK: since we can't really figure out exactly when a
        # HSP starts or ends, we set the entire alignment as one HSP
        hsp['query_ranges'] = [(hsp['query_start'], hsp['query_end'])]
        hsp['hit_ranges'] = [(hsp['hit_start'], hsp['hit_end'])]

        return {'qresult': qresult, 'hit': hit, 'hsp': hsp}


class ExonerateCigarIndexer(ExonerateVulgarIndexer):

    """Indexer class for exonerate cigar lines."""

    _parser = ExonerateCigarParser
    _query_mark = _as_bytes('cigar')

    def get_qresult_id(self, pos):
        """Returns the query ID of the nearest cigar line."""
        handle = self._handle
        handle.seek(pos)
        # get line, check if it's a vulgar line, and get query ID
        line = handle.readline()
        assert line.startswith(self._query_mark), line
        id = re.search(_RE_CIGAR, _bytes_to_string(line))
        return id.group(1)


# if not used as a module, run the doctest
if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest()