File: exonerate_cigar.py

package info (click to toggle)
python-biopython 1.78%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 65,756 kB
  • sloc: python: 221,141; xml: 178,777; ansic: 13,369; sql: 1,208; makefile: 131; sh: 70
file content (109 lines) | stat: -rw-r--r-- 4,089 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SearchIO parser for Exonerate cigar output format."""

import re

from ._base import _BaseExonerateParser, _STRAND_MAP
from .exonerate_vulgar import ExonerateVulgarIndexer


__all__ = ("ExonerateCigarParser", "ExonerateCigarIndexer")


# precompile regex
_RE_CIGAR = re.compile(
    r"""^cigar:\s+
        (\S+)\s+(\d+)\s+(\d+)\s+([\+-\.])\s+  # query: ID, start, end, strand
        (\S+)\s+(\d+)\s+(\d+)\s+([\+-\.])\s+  # hit: ID, start, end, strand
        (\d+)(\s+.*)$                         # score, vulgar components
        """,
    re.VERBOSE,
)


class ExonerateCigarParser(_BaseExonerateParser):
    """Parser for Exonerate cigar strings."""

    _ALN_MARK = "cigar"

    def parse_alignment_block(self, header):
        """Parse alignment block for cigar format, return query results, hits, hsps."""
        qresult = header["qresult"]
        hit = header["hit"]
        hsp = header["hsp"]
        self.read_until(lambda line: line.startswith("cigar"))
        cigars = re.search(_RE_CIGAR, self.line)
        # if the file has c4 alignments
        # check if cigar values match our previously parsed header values
        if self.has_c4_alignment:
            assert qresult["id"] == cigars.group(1)
            assert hsp["query_start"] == cigars.group(2)
            assert hsp["query_end"] == cigars.group(3)
            assert hsp["query_strand"] == cigars.group(4)
            assert hit["id"] == cigars.group(5)
            assert hsp["hit_start"] == cigars.group(6)
            assert hsp["hit_end"] == cigars.group(7)
            assert hsp["hit_strand"] == cigars.group(8)
            assert hsp["score"] == cigars.group(9)
        else:
            qresult["id"] = cigars.group(1)
            hsp["query_start"] = cigars.group(2)
            hsp["query_end"] = cigars.group(3)
            hsp["query_strand"] = cigars.group(4)
            hit["id"] = cigars.group(5)
            hsp["hit_start"] = cigars.group(6)
            hsp["hit_end"] = cigars.group(7)
            hsp["hit_strand"] = cigars.group(8)
            hsp["score"] = cigars.group(9)

        # adjust strands
        hsp["query_strand"] = _STRAND_MAP[hsp["query_strand"]]
        hsp["hit_strand"] = _STRAND_MAP[hsp["hit_strand"]]
        # cast coords into ints
        qstart = int(hsp["query_start"])
        qend = int(hsp["query_end"])
        hstart = int(hsp["hit_start"])
        hend = int(hsp["hit_end"])
        # set coords (start <= end)
        hsp["query_start"] = min(qstart, qend)
        hsp["query_end"] = max(qstart, qend)
        hsp["hit_start"] = min(hstart, hend)
        hsp["hit_end"] = max(hstart, hend)
        # cast score into int
        hsp["score"] = int(hsp["score"])
        # store cigar components
        hsp["cigar_comp"] = cigars.group(10)
        # HACK: since we can't really figure out exactly when a
        # HSP starts or ends, we set the entire alignment as one HSP
        hsp["query_ranges"] = [(hsp["query_start"], hsp["query_end"])]
        hsp["hit_ranges"] = [(hsp["hit_start"], hsp["hit_end"])]

        return {"qresult": qresult, "hit": hit, "hsp": hsp}


class ExonerateCigarIndexer(ExonerateVulgarIndexer):
    """Indexer class for exonerate cigar lines."""

    _parser = ExonerateCigarParser
    _query_mark = b"cigar"

    def get_qresult_id(self, pos):
        """Return the query ID of the nearest cigar line."""
        handle = self._handle
        handle.seek(pos)
        # get line, check if it's a vulgar line, and get query ID
        line = handle.readline()
        assert line.startswith(self._query_mark), line
        id = re.search(_RE_CIGAR, line.decode())
        return id.group(1)


# if not used as a module, run the doctest
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()