File: parser.py

package info (click to toggle)
python-gffutils 0.13-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 10,164 kB
  • sloc: python: 5,557; makefile: 57; sh: 13
file content (375 lines) | stat: -rw-r--r-- 12,248 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# Portions copied over from BCBio.GFF.GFFParser

import collections
import copy
import logging
import re
import urllib
import urllib.parse

from gffutils import constants
from gffutils.exceptions import AttributeStringError

# Module-level logger.  A dedicated stream handler is attached at import time
# so that INFO-and-above records from this module are emitted with a
# timestamp and level prefix.
logger = logging.getLogger(__name__)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
ch = logging.StreamHandler()
ch.setFormatter(formatter)
ch.setLevel(logging.INFO)
logger.addHandler(ch)

# Matches a leading "key=" token; used to recognize GFF3-style attributes.
gff3_kw_pat = re.compile(r"\w+=")

# Encoding/decoding notes
# -----------------------
# From
# https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#description-of-the-format:
#
#       GFF3 files are nine-column, tab-delimited, plain text files.
#       Literal use of tab, newline, carriage return, the percent (%) sign,
#       and control characters must be encoded using RFC 3986
#       Percent-Encoding; no other characters may be encoded. Backslash and
#       other ad-hoc escaping conventions that have been added to the GFF
#       format are not allowed. The file contents may include any character
#       in the set supported by the operating environment, although for
#       portability with other systems, use of Latin-1 or Unicode are
#       recommended.
#
#           tab (%09)
#           newline (%0A)
#           carriage return (%0D)
#           % percent (%25)
#           control characters (%00 through %1F, %7F)
#
#       In addition, the following characters have reserved meanings in
#       column 9 and must be escaped when used in other contexts:
#
#           ; semicolon (%3B)
#           = equals (%3D)
#           & ampersand (%26)
#           , comma (%2C)
#
#
# See also issue #98.
#
# Note that spaces are NOT encoded. Some GFF files have spaces encoded; in
# these cases round-trip invariance will not hold since the %20 will be decoded
# but not re-encoded.
# Every character that gets percent-encoded when GFF3 attribute values are
# written back out: tab, newline, carriage return, percent, the reserved
# column-9 characters (; = & ,), the control characters 0x00-0x1F, and DEL
# (0x7F).  The space character is intentionally not part of this set.
_to_quote = "\n\t\r%;=&," + "".join(map(chr, range(32))) + chr(127)


# Caching idea from urllib.parse.Quoter, which uses a defaultdict for
# efficiency. Here we're sort of doing the reverse of the "reserved" idea used
# there: instead of listing the characters that are safe, we list the ones
# that must be escaped.
class Quoter(collections.defaultdict):
    """
    Memoizing lookup table from a character to its encoded form.

    Looking up a character that is in `_to_quote` yields its "%XX"
    percent-encoding (two uppercase hex digits); any other key, including the
    empty string, maps to itself.  Each answer is stored in the mapping on
    first lookup so later lookups are plain dictionary hits.
    """

    def __missing__(self, b):
        # The empty string is a substring of every string, so it has to be
        # excluded explicitly before the membership test.
        if b == "" or b not in _to_quote:
            encoded = b
        else:
            encoded = "%{:02X}".format(ord(b))
        self[b] = encoded
        return encoded


# Shared, module-wide cache instance.
quoter = Quoter()


def _reconstruct(keyvals, dialect, keep_order=False, sort_attribute_values=False):
    """
    Render an attributes mapping back into a string, formatted per `dialect`.

    Parameters
    ==========
    keyvals : dict
        Attributes from a GFF/GTF feature.  Each value is iterated and its
        items are joined, so values are expected to be lists of strings.

    dialect : dict
        Dialect describing how to build the string version of the attributes
        (separators, quoting, repeated keys, trailing semicolon, key order).

    keep_order : bool
        If True, sort the attribute keys by their position in
        ``dialect["order"]``; keys not listed there sort after all listed
        ones.  Default is False, which skips the sort.

    sort_attribute_values : bool
        If True, sort the values of each key before joining them so the
        output is deterministic.  Mostly only useful for testing; default is
        False.

    Returns
    =======
    The attributes string, or "" when `keyvals` is empty.

    Raises
    ======
    AttributeStringError
        If `dialect` is empty or None.
    """
    if not dialect:
        raise AttributeStringError()
    if not keyvals:
        return ""

    # Values are percent-encoded on the way out only for GFF3, and only when
    # escaping has not been disabled globally.
    skip_encoding = (
        constants.ignore_url_escape_characters or dialect["fmt"] != "gff3"
    )
    if skip_encoding:
        attributes = keyvals
    else:
        attributes = {
            name: ["".join(quoter[char] for char in value) for value in values]
            for name, values in keyvals.items()
        }

    # With repeated keys, a multi-valued attribute is written as several
    # single-valued key/val pairs rather than one separator-joined value.
    pairs = []
    if dialect["repeated keys"]:
        for name, values in attributes.items():
            if len(values) > 1:
                pairs.extend((name, [single]) for single in values)
            else:
                pairs.append((name, values))
    else:
        pairs.extend(attributes.items())

    if keep_order:
        known_order = dialect["order"]

        def _rank(pair):
            # Position of the key in the dialect's order; unknown keys are
            # pushed to the end (the sort is stable, so they keep their
            # relative order).
            name = pair[0]
            return known_order.index(name) if name in known_order else 1e6

        pairs = sorted(pairs, key=_rank)

    rendered = []
    for name, values in pairs:
        if not values:
            # Key without any value: GTF writes an explicit empty string,
            # every other format writes the bare key.
            if dialect["fmt"] == "gtf":
                rendered.append(dialect["keyval separator"].join([name, '""']))
            else:
                rendered.append(name)
            continue

        if sort_attribute_values:
            values = sorted(values)

        # Multival sep is usually a comma
        joined = dialect["multival separator"].join(values)
        if not joined:
            rendered.append(name)
            continue

        # Surround with quotes if needed
        if dialect["quoted GFF2 values"]:
            joined = '"%s"' % joined

        # Typically "=" for GFF3 or " " otherwise
        rendered.append(dialect["keyval separator"].join([name, joined]))

    # Typically ";" or "; "
    result = dialect["field separator"].join(rendered)

    # Sometimes need to add this
    if dialect["trailing semicolon"]:
        result += ";"

    return result


# TODO:
# Cythonize -- profiling shows that the bulk of the time is spent on this
# function...
def _split_keyvals(keyval_str, dialect=None):
    """
    Given the string attributes field of a GFF-like line, split it into an
    attributes dictionary and a "dialect" dictionary which contains information
    needed to reconstruct the original string.

    Lots of logic here to handle all the corner cases.

    Parameters
    ==========
    keyval_str : str
        The attributes field of a GFF/GTF line.  May be empty.

    dialect : dict or None
        If None, the dialect is inferred from `keyval_str`, starting from a
        copy of ``constants.dialect``.  Otherwise the provided dialect is used
        to drive the parsing and is returned as-is.

    Returns
    =======
    (quals, dialect) where `quals` is a ``feature.dict_class`` instance
    mapping each attribute key to a list of string values, and `dialect` is
    the provided or inferred dialect.  For GFF3 (and unless
    ``constants.ignore_url_escape_characters`` is set) the values are
    percent-decoded.
    """

    def _unquote_quals(quals, dialect):
        """
        Handles the unquoting (decoding) of percent-encoded characters.

        See notes on encoding/decoding above.
        """
        if not constants.ignore_url_escape_characters and dialect["fmt"] == "gff3":
            # Only existing keys are reassigned, so the dict does not change
            # size while it is being iterated.
            for key, vals in quals.items():
                quals[key] = [urllib.parse.unquote(v) for v in vals]
        return quals

    def _key_and_val(item, kvsep):
        """
        Turn one split key/val item into a (key, value-string) pair.

        The first element is the key.  Whatever follows belongs to the value
        and is glued back together with the key-val separator, which covers
        all three shapes: key only (value is ""), the regular key + value
        pair, and the pathological case where the value itself contains the
        separator, e.g.
          Note=marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126
        """
        return item[0], kvsep.join(item[1:])

    infer_dialect = dialect is None
    if infer_dialect:
        # Make a copy of default dialect so it can be modified as needed
        dialect = copy.copy(constants.dialect)

    # Imported here rather than at module level (presumably to avoid a
    # circular import between the two modules -- confirm before moving).
    from gffutils import feature

    quals = feature.dict_class()
    if not keyval_str:
        return quals, dialect

    # If a dialect was provided, then use that directly.
    if not infer_dialect:
        if dialect["trailing semicolon"]:
            keyval_str = keyval_str.rstrip(";")

        parts = keyval_str.split(dialect["field separator"])
        kvsep = dialect["keyval separator"]

        if dialect["fmt"] == "gff3":
            key_vals = [p.split(kvsep) for p in parts]
        else:
            leading_semicolon = dialect["leading semicolon"]
            key_vals = []
            for p in parts:
                # Only drop a character when it really is a misplaced
                # semicolon; blindly slicing would eat the first letter of a
                # well-formed key.
                if leading_semicolon and p.startswith(";"):
                    p = p[1:]
                pieces = p.strip().split(kvsep)
                key_vals.append((pieces[0], " ".join(pieces[1:])))

        quoted = dialect["quoted GFF2 values"]
        for item in key_vals:
            key, val = _key_and_val(item, kvsep)

            if key not in quals:
                quals[key] = []

            if quoted and val.startswith('"') and val.endswith('"'):
                val = val[1:-1]

            if val:
                # TODO: if there are extra commas for a value, just use empty
                # strings
                # quals[key].extend([v for v in val.split(',') if v])
                quals[key].extend(val.split(","))

        return _unquote_quals(quals, dialect), dialect

    # If we got here, then we need to infer the dialect....
    #
    # Reset the order to an empty list so that it will only be populated with
    # keys that are found in the file.
    dialect["order"] = []

    # ensembl GTF has trailing semicolon
    if keyval_str[-1] == ";":
        keyval_str = keyval_str[:-1]
        dialect["trailing semicolon"] = True

    # GFF2/GTF has a semicolon with at least one space after it.
    # Spaces can be on both sides (e.g. wormbase)
    # GFF3 works with no spaces.
    # So split on the first one we can recognize...
    for sep in (" ; ", "; ", ";"):
        parts = keyval_str.split(sep)
        if len(parts) > 1:
            dialect["field separator"] = sep
            break

    # Is it GFF3?  They have key-vals separated by "="
    if gff3_kw_pat.match(parts[0]):
        key_vals = [p.split("=") for p in parts]
        dialect["fmt"] = "gff3"
        dialect["keyval separator"] = "="

    # Otherwise, key-vals separated by space.  Key is first item.
    else:
        dialect["keyval separator"] = " "
        key_vals = []
        for p in parts:
            # Fix misplaced semicolons in keys in some GFF2 files
            if p and p[0] == ";":
                p = p[1:]
                dialect["leading semicolon"] = True
            pieces = p.strip().split(" ")
            key_vals.append((pieces[0], " ".join(pieces[1:])))

    kvsep = dialect["keyval separator"]
    for item in key_vals:
        key, val = _key_and_val(item, kvsep)

        # Is the key already in there?
        if key in quals:
            dialect["repeated keys"] = True
        else:
            quals[key] = []

        # Remove quotes in GFF2
        if len(val) > 0 and val[0] == '"' and val[-1] == '"':
            val = val[1:-1]
            dialect["quoted GFF2 values"] = True

        if val:
            # TODO: if there are extra commas for a value, just use empty
            # strings
            # quals[key].extend([v for v in val.split(',') if v])

            # See issue #198, where commas within a description can incorrectly
            # cause the dialect inference to conclude that there are not
            # repeated keys.
            #
            # More description in PR #208.
            if dialect["repeated keys"]:
                quals[key].append(val)
            else:
                vals = val.split(",")

                # If anything starts with a leading space, then we infer that
                # it was part of a description or some other typographical
                # interpretation, not a character to split multiple vals on --
                # and append the original val rather than the split vals.
                if any(v[0] == " " for v in vals if v):
                    quals[key].append(val)
                else:
                    quals[key].extend(vals)

        # keep track of the order of keys
        dialect["order"].append(key)

    if (dialect["keyval separator"] == " ") and (dialect["quoted GFF2 values"]):
        dialect["fmt"] = "gtf"

    return _unquote_quals(quals, dialect), dialect