File: maf.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (263 lines) | stat: -rw-r--r-- 8,444 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
"""
Support for the `MAF`_ multiple sequence alignment format used by `multiz`_.

.. _MAF: http://genome.ucsc.edu/FAQ/FAQformat.html#format5
.. _multiz: http://www.bx.psu.edu/miller_lab/
"""

from io import (
    StringIO,
    TextIOWrapper,
)

from bx import interval_index_file
from bx.align import (
    Alignment,
    Component,
)

MAF_INVERSE_STATUS = "V"
MAF_INSERT_STATUS = "I"
MAF_CONTIG_STATUS = "C"
MAF_CONTIG_NESTED_STATUS = "c"
MAF_NEW_STATUS = "N"
MAF_NEW_NESTED_STATUS = "n"
MAF_MAYBE_NEW_STATUS = "S"
MAF_MAYBE_NEW_NESTED_STATUS = "s"
MAF_MISSING_STATUS = "M"


class MAFIndexedAccess(interval_index_file.AbstractIndexedAccess):
    """
    Indexed access to a MAF file.
    """

    def read_at_current_offset(self, file, **kwargs):
        """
        Read the MAF block at the current position in `file` and return an
        instance of `Alignment`.
        """
        return read_next_maf(file, **kwargs)

    def open_data(self):
        data = super().open_data()
        return TextIOWrapper(data, encoding="ascii")


class MAFMultiIndexedAccess(interval_index_file.AbstractMultiIndexedAccess):
    """
    Indexed access to multiple MAF files.
    """

    indexed_access_class = MAFIndexedAccess


Indexed = MAFIndexedAccess
"""Deprecated: `MAFIndexedAccess` is also available under the name `Indexed`."""

MultiIndexed = MAFMultiIndexedAccess
"""Deprecated: `MAFMultiIndexedAccess` is also available under the name `MultiIndexed`."""


class Reader:
    """
    Iterate over all maf blocks in a file in order
    """

    def __init__(self, file, **kwargs):
        self.file = file
        self.maf_kwargs = kwargs
        # Read and verify maf header, store any attributes
        fields = self.file.readline().split()
        if fields[0] != "##maf":
            raise Exception("File does not have MAF header")
        self.attributes = parse_attributes(fields[1:])

    def __next__(self):
        return read_next_maf(self.file, **self.maf_kwargs)

    def __iter__(self):
        return ReaderIter(self)

    def close(self):
        self.file.close()


class ReaderIter:
    """
    Adapts a `Reader` to the iterator protocol.
    """

    def __init__(self, reader):
        self.reader = reader

    def __iter__(self):
        return self

    def __next__(self):
        v = next(self.reader)
        if not v:
            raise StopIteration
        return v


class Writer:
    def __init__(self, file, attributes=None):
        if attributes is None:
            attributes = {}
        self.file = file
        # Write header, Webb's maf code wants version first, we accomodate
        if "version" not in attributes:
            attributes["version"] = 1
        self.file.write("##maf version={}".format(attributes["version"]))
        for key in attributes:
            if key == "version":
                continue
            self.file.writelines(f" {key}={attributes[key]}")
        self.file.write("\n")

    def write(self, alignment):
        self.file.write("a score=" + str(alignment.score))
        for key in alignment.attributes:
            self.file.write(f" {key}={alignment.attributes[key]}")
        self.file.write("\n")
        # Components
        rows = []
        for c in alignment.components:
            # "Empty component" generates an 'e' row
            if c.empty:
                rows.append(("e", c.src, str(c.start), str(c.size), c.strand, str(c.src_size), c.synteny_empty))
                continue
            # Regular component
            rows.append(("s", c.src, str(c.start), str(c.size), c.strand, str(c.src_size), c.text))
            # If component has quality, write a q row
            if c.quality is not None:
                rows.append(("q", c.src, "", "", "", "", c.quality))
            # If component has synteny follow up with an 'i' row
            if c.synteny_left and c.synteny_right:
                rows.append(("i", c.src, "", "", "", "", " ".join(map(str, c.synteny_left + c.synteny_right))))
        self.file.write(format_tabular(rows, "llrrrrl"))
        self.file.write("\n")

    def close(self):
        self.file.close()


# ---- Helper methods -------------------------------------------------------


def from_string(string, **kwargs):
    return read_next_maf(StringIO(string), **kwargs)


def read_next_maf(file, species_to_lengths=None, parse_e_rows=False):
    """
    Read the next MAF block from `file` and return as an `Alignment`
    instance. If `parse_e_rows` is true, empty components will be created
    when e rows are encountered.
    """
    alignment = Alignment(species_to_lengths=species_to_lengths)
    # Attributes line
    line = readline(file, skip_blank=True)
    if not line:
        return None
    fields = line.split()
    if fields[0] != "a":
        raise Exception("Expected 'a ...' line")
    alignment.attributes = parse_attributes(fields[1:])
    if "score" in alignment.attributes:
        alignment.score = alignment.attributes["score"]
        del alignment.attributes["score"]
    else:
        alignment.score = 0
    # Sequence lines
    last_component = None
    while True:
        line = readline(file)
        # EOF or Blank line terminates alignment components
        if not line or line.isspace():
            break
        if line.isspace():
            break
        # Parse row
        fields = line.split()
        if fields[0] == "s":
            # An 's' row contains sequence for a component
            component = Component()
            component.src = fields[1]
            component.start = int(fields[2])
            component.size = int(fields[3])
            component.strand = fields[4]
            component.src_size = int(fields[5])
            if len(fields) > 6:
                component.text = fields[6].strip()
            # Add to set
            alignment.add_component(component)
            last_component = component
        elif fields[0] == "e":
            # An 'e' row, when no bases align for a given species this tells
            # us something about the synteny
            if parse_e_rows:
                component = Component()
                component.empty = True
                component.src = fields[1]
                component.start = int(fields[2])
                component.size = int(fields[3])
                component.strand = fields[4]
                component.src_size = int(fields[5])
                component.text = None
                synteny = fields[6].strip()
                assert len(synteny) == 1, "Synteny status in 'e' rows should be denoted with a single character code"
                component.synteny_empty = synteny
                alignment.add_component(component)
                last_component = component
        elif fields[0] == "i":
            # An 'i' row, indicates left and right synteny status for the
            # previous component, we hope ;)
            assert fields[1] == last_component.src, "'i' row does not follow matching 's' row"
            last_component.synteny_left = (fields[2], int(fields[3]))
            last_component.synteny_right = (fields[4], int(fields[5]))
        elif fields[0] == "q":
            assert fields[1] == last_component.src, "'q' row does not follow matching 's' row"
            # TODO: Should convert this to an integer array?
            last_component.quality = fields[2]

    return alignment


def readline(file, skip_blank=False):
    """Read a line from provided file, skipping any blank or comment lines"""
    while True:
        line = file.readline()
        if not line:
            return None
        if line[0] != "#" and not (skip_blank and line.isspace()):
            return line


def parse_attributes(fields):
    """Parse list of key=value strings into a dict"""
    attributes = {}
    for field in fields:
        pair = field.split("=")
        attributes[pair[0]] = pair[1]
    return attributes


def format_tabular(rows, align=None):
    if len(rows) == 0:
        return ""
    lengths = [len(col) for col in rows[0]]
    for row in rows[1:]:
        for i in range(0, len(row)):
            lengths[i] = max(lengths[i], len(row[i]))
    rval = ""
    for row in rows:
        for i in range(0, len(row)):
            if align and align[i] == "l":
                rval += row[i].ljust(lengths[i])
            else:
                rval += row[i].rjust(lengths[i])
            rval += " "
        rval += "\n"
    return rval