# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from collections import OrderedDict
from sys import intern

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def expand_attribute_strings(
        attribute_strings,
        quote_char='\"',
        missing_value="",
        usecols=None):
    """
    The last column of a GTF file has a variable number of key-value pairs
    of the format: "key1 value1; key2 value2;"

    Parse these into a dictionary mapping each key onto a list of values,
    where the value is `missing_value` for any row where the key was missing.

    Parameters
    ----------
    attribute_strings : list of str

    quote_char : str
        Quote character to remove from values

    missing_value : any
        If an attribute is missing from a row, give it this value.

    usecols : list of str or None
        If not None, then only expand columns included in this set,
        otherwise use all columns.

    Returns OrderedDict of column->value list mappings, in the order they
    appeared in the attribute strings.
    """
    n = len(attribute_strings)

    extra_columns = {}
    column_order = []

    #
    # SOME NOTES ABOUT THE BIZARRE STRING INTERNING GOING ON BELOW
    #
    # While parsing millions of repeated strings (e.g. "gene_id" and "TP53"),
    # we can save a lot of memory by making sure there's only one string
    # object per unique string. The canonical way to do this is using
    # the 'intern' function. One problem is that Py2 won't let you intern
    # unicode objects, so to get around this we call intern(str(...)).
    #
    # It also turns out to be faster to check interned strings ourselves
    # using a local dictionary, hence the two dictionaries below
    # and pair of try/except blocks in the loop.
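    #
    # Illustration (not executed here): sys.intern returns one canonical
    # object per distinct string value, so equal strings share memory:
    #     intern(str("gene_id")) is intern(str("gene_id"))  # -> True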
    column_interned_strings = {}
    value_interned_strings = {}

    for (i, attribute_string) in enumerate(attribute_strings):
        for kv in attribute_string.split(";"):
            # We're slicing the first two elements out of split() because
            # Ensembl release 79 added values like:
            #   transcript_support_level "1 (assigned to previous version 5)";
            # ...which gets mangled by splitting on spaces.
            parts = kv.strip().split(" ", 2)[:2]
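            # For example:
            #   'gene_name "TP53"'.strip().split(" ", 2)[:2]
            #     -> ['gene_name', '"TP53"']
            # while the long Ensembl 79 value above yields
            #   ['transcript_support_level', '"1'], keeping only the leading
            # token of the value.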
            if len(parts) != 2:
                continue
            column_name, value = parts
            try:
                column_name = column_interned_strings[column_name]
            except KeyError:
                column_name = intern(str(column_name))
                column_interned_strings[column_name] = column_name
            if usecols is not None and column_name not in usecols:
                continue
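            # Columns are created lazily: the first time an attribute name is
            # seen, allocate a list of n copies of missing_value so that every
            # row has a slot, and record the name in column_order.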
            try:
                column = extra_columns[column_name]
            except KeyError:
                column = [missing_value] * n
                extra_columns[column_name] = column
                column_order.append(column_name)
            value = (
                value.replace(quote_char, "")
                if value.startswith(quote_char)
                else value)
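            # For example, '"TP53"' becomes 'TP53': every quote character is
            # stripped, but only when the value starts with quote_char.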
            try:
                value = value_interned_strings[value]
            except KeyError:
                value = intern(str(value))
                value_interned_strings[value] = value
            # If an attribute key appears more than once in the same row,
            # keep all of its values by joining them into a single
            # comma-separated string.
            old_value = column[i]
            if old_value is missing_value:
                column[i] = value
            else:
                column[i] = "%s,%s" % (old_value, value)
    logger.info("Extracted GTF attributes: %s", column_order)
    return OrderedDict(
        (column_name, extra_columns[column_name])
        for column_name in column_order)
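

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only; the attribute strings below
    # are shortened, made-up examples of the key-value text found in the
    # ninth column of a GTF file).
    example_rows = [
        'gene_id "ENSG00000141510"; gene_name "TP53"; tag "basic";',
        'gene_id "ENSG00000141510"; transcript_id "ENST00000269305";',
    ]
    columns = expand_attribute_strings(example_rows)
    # Each attribute maps to a list with one entry per input row, using ""
    # (the default missing_value) where the attribute was absent, e.g.
    #   columns["transcript_id"] == ["", "ENST00000269305"]
    for name, values in columns.items():
        print(name, values)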