1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
|
# Copyright 2017 by Kozo Nishida. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code to work with the KEGG Gene database.
Functions:
- parse - Returns an iterator giving Record objects.
Classes:
- Record - A representation of a KEGG Gene.
"""
from Bio.KEGG import _default_wrap, _wrap_kegg, _write_kegg
# Set up line wrapping rules (see Bio.KEGG._wrap_kegg for the rule format).
# name_wrap is the rule Record._name passes to _wrap_kegg for each NAME line.
name_wrap = [0, "", (" ", "$", 1, 1), ("-", "$", 1, 1)]
# id_wrap is an alias of Bio.KEGG._default_wrap; Record._dblinks calls it as
# id_wrap(9) to obtain the rule used for each DBLINKS line.
id_wrap = _default_wrap
class Record:
    """Container for the fields of one KEGG Gene entry.

    Attributes:
     - entry       The entry identifier.
     - name        A list of the gene names.
     - definition  The definition for the gene.
     - orthology   A list of 2-tuples: (orthology id, role)
     - organism    A tuple: (organism id, organism); an empty string
                   on a freshly created record.
     - position    The position for the gene
     - motif       A list of 2-tuples: (database, list of link ids)
     - dblinks     A list of 2-tuples: (database, list of link ids)

    Converting a record to a string writes only the ENTRY, NAME and
    DBLINKS sections, followed by the "///" terminator.
    """

    def __init__(self):
        """Create an empty record with every field at its blank default."""
        self.entry = ""
        self.name = []
        self.definition = ""
        self.orthology = []
        self.organism = ""
        self.position = ""
        self.motif = []
        self.dblinks = []

    def __str__(self):
        """Return the ENTRY, NAME and DBLINKS sections plus "///"."""
        sections = (self._entry(), self._name(), self._dblinks())
        return "".join(sections) + "///"

    def _entry(self):
        # Single-item ENTRY section holding the identifier.
        items = [self.entry]
        return _write_kegg("ENTRY", items)

    def _name(self):
        # Every gene name is wrapped with the module-level NAME rule.
        wrapped = [
            _wrap_kegg(gene_name, wrap_rule=name_wrap) for gene_name in self.name
        ]
        return _write_kegg("NAME", wrapped)

    def _definition(self):
        # Single-item DEFINITION section (not used by __str__).
        items = [self.definition]
        return _write_kegg("DEFINITION", items)

    def _dblinks(self):
        # Render each (database, ids) row as "database: id id ...".
        rows = [row[0] + ": " + " ".join(row[1]) for row in self.dblinks]
        # id_wrap(9) is evaluated once per row, as each row gets its own rule.
        wrapped = [_wrap_kegg(text, wrap_rule=id_wrap(9)) for text in rows]
        return _write_kegg("DBLINKS", wrapped)
def _split_id_and_name(data):
    """Split "<id> <description>" at the first run of whitespace.

    The description may itself contain spaces.  Raises ValueError when
    the line holds fewer than two whitespace-separated fields.
    """
    identifier, description = data.split(None, 1)
    return identifier, description


def _add_link_row(rows, data):
    """Add one MOTIF/DBLINKS line to rows, a list of (database, ids) tuples.

    A line containing ":" starts a new "database: id id ..." row; any
    other line is a continuation whose ids extend the latest row.
    """
    if ":" in data:
        # Split once only, so the database key is everything before the
        # first ": " separator.
        database, ids = data.split(": ", 1)
        rows.append((database, ids.split()))
    else:
        # The id list inside the tuple is mutable, so extend it in place.
        rows[-1][1].extend(data.split())


def parse(handle):
    """Parse a KEGG Gene file, returning Record objects.

    Each line is read as a 12-column keyword field followed by data.  A
    line whose keyword field is blank continues the previous keyword, a
    line starting with "///" ends the current record, and completely
    blank lines are skipped.  Keywords other than ENTRY, NAME,
    DEFINITION, ORTHOLOGY, ORGANISM, POSITION, MOTIF and DBLINKS are
    ignored.

    Raises ValueError if an ORTHOLOGY or ORGANISM line has no description
    or a MOTIF/DBLINKS line containing ":" lacks the ": " separator, and
    IndexError if a MOTIF/DBLINKS continuation line has no row to extend.

    This is an iterator function, typically used in a for loop.  For
    example, using one of the example KEGG files in the Biopython
    test suite,

    >>> with open("KEGG/gene.sample") as handle:
    ...     for record in parse(handle):
    ...         print("%s %s" % (record.entry, record.name[0]))
    ...
    b1174 minE
    b1175 minD

    """
    record = Record()
    # Start with no keyword so a leading continuation line is ignored
    # instead of hitting an unbound local.
    keyword = ""
    for line in handle:
        if line.startswith("///"):
            yield record
            record = Record()
            # Do not let the previous record's keyword leak into the next.
            keyword = ""
            continue
        if not line.strip():
            continue
        # Compare on the stripped keyword so matching does not depend on
        # the exact amount of padding in the keyword column.  Only trailing
        # whitespace is removed, so indented sub-keywords stay distinct.
        field = line[:12].rstrip()
        if field:
            keyword = field
        data = line[12:].strip()
        if keyword == "ENTRY":
            words = data.split()
            if words:
                record.entry = words[0]
        elif keyword == "NAME":
            record.name.append(data.strip(";"))
        elif keyword == "DEFINITION":
            record.definition = data
        elif keyword == "ORTHOLOGY":
            record.orthology.append(_split_id_and_name(data))
        elif keyword == "ORGANISM":
            record.organism = _split_id_and_name(data)
        elif keyword == "POSITION":
            record.position = data
        elif keyword == "MOTIF":
            _add_link_row(record.motif, data)
        elif keyword == "DBLINKS":
            _add_link_row(record.dblinks, data)
if __name__ == "__main__":
    # Script entry point: hand over to Biopython's run_doctest helper
    # (presumably it runs the doctest examples in this module - confirm in
    # Bio._utils).  Imported here so a normal import of this module does not
    # load it.
    from Bio._utils import run_doctest
    run_doctest()
|