File: cellosaurus.py

package info (click to toggle)
python-biopython 1.78%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 65,756 kB
  • sloc: python: 221,141; xml: 178,777; ansic: 13,369; sql: 1,208; makefile: 131; sh: 70
file content (188 lines) | stat: -rw-r--r-- 5,598 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# Copyright 2016 by Stephen Marshall.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Parser for the cellosaurus.txt file from ExPASy.

See https://web.expasy.org/cellosaurus/

Tested with the release of Version 18 (July 2016).

Functions:
 - read       Reads a file containing one cell line entry
 - parse      Reads a file containing multiple cell line entries

Classes:
 - Record     Holds cell line data.

Examples
--------
You need to download the Cellosaurus database for this examples to
run, e.g. from ftp://ftp.expasy.org/databases/cellosaurus/cellosaurus.txt

    >> from Bio.ExPASy import cellosaurus
    >> with open('cellosaurus.txt') as handle:
    ...    records = cellosaurus.parse(handle)
    ...    for record in records:
    ...        if 'Homo sapiens' in record['OX'][0]:
    ...            print(record['ID'])
    ...
    #15310-LN
    #W7079
    (L)PC6
    00136
    ...

"""


def parse(handle):
    """Parse cell line records.

    This function is for parsing cell line files containing multiple
    records.

    Arguments:
     - handle   - handle to the file.

    """
    while True:
        record = __read(handle)
        if not record:
            break
        yield record


def read(handle):
    """Read one cell line record.

    This function is for parsing cell line files containing
    exactly one record.

    Arguments:
     - handle   - handle to the file.

    """
    record = __read(handle)
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one cell line record found")
    return record


class Record(dict):
    """Holds information from an ExPASy Cellosaurus record as a Python dictionary.

    Each record contains the following keys:

     ---------  ---------------------------     ----------------------
     Line code  Content                         Occurrence in an entry
     ---------  ---------------------------     ----------------------
     ID         Identifier (cell line name)     Once; starts an entry
     AC         Accession (CVCL_xxxx)           Once
     AS         Secondary accession number(s)   Optional; once
     SY         Synonyms                        Optional; once
     DR         Cross-references                Optional; once or more
     RX         References identifiers          Optional: once or more
     WW         Web pages                       Optional; once or more
     CC         Comments                        Optional; once or more
     ST         STR profile data                Optional; once or more
     DI         Diseases                        Optional; once or more
     OX         Species of origin               Once or more
     HI         Hierarchy                       Optional; once or more
     OI         Originate from same individual  Optional; once or more
     SX         Sex (gender) of cell            Optional; once
     CA         Category                        Once
     //         Terminator                      Once; ends an entry

    """

    def __init__(self):
        """Initialize the class."""
        dict.__init__(self)
        self["ID"] = ""
        self["AC"] = ""
        self["AS"] = ""
        self["SY"] = ""
        self["DR"] = []
        self["RX"] = []
        self["WW"] = []
        self["CC"] = []
        self["ST"] = []
        self["DI"] = []
        self["OX"] = []
        self["HI"] = []
        self["OI"] = []
        self["SX"] = ""
        self["CA"] = ""

    def __repr__(self):
        if self["ID"]:
            if self["AC"]:
                return "%s (%s, %s)" % (self.__class__.__name__, self["ID"], self["AC"])
            else:
                return "%s (%s)" % (self.__class__.__name__, self["ID"])
        else:
            return "%s ( )" % (self.__class__.__name__)

    def __str__(self):
        output = "ID: " + self["ID"]
        output += " AC: " + self["AC"]
        output += " AS: " + self["AS"]
        output += " SY: " + self["SY"]
        output += " DR: " + repr(self["DR"])
        output += " RX: " + repr(self["RX"])
        output += " WW: " + repr(self["WW"])
        output += " CC: " + repr(self["CC"])
        output += " ST: " + repr(self["ST"])
        output += " DI: " + repr(self["DI"])
        output += " OX: " + repr(self["OX"])
        output += " HI: " + repr(self["HI"])
        output += " OI: " + repr(self["OI"])
        output += " SX: " + self["SX"]
        output += " CA: " + self["CA"]
        return output


# Everything below is private


def __read(handle):
    record = None

    for line in handle:

        key, value = line[:2], line[5:].rstrip()
        if key == "ID":
            record = Record()
            record["ID"] = value
        elif key in ["AC", "AS", "SY", "SX", "CA"]:
            record[key] += value
        elif key in [
            "AC",
            "AS",
            "SY",
            "RX",
            "WW",
            "CC",
            "ST",
            "DI",
            "OX",
            "HI",
            "OI",
            "SX",
            "CA",
        ]:
            record[key].append(value)
        elif key == "DR":
            k, v = value.split(";")
            record["DR"].append((k.strip(), v.strip()))
        elif key == "//":
            if record:
                return record
            else:
                continue
    if record:
        raise ValueError("Unexpected end of stream")