File: cellosaurus.py

package info (click to toggle)
python-biopython 1.85%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 126,372 kB
  • sloc: xml: 1,047,995; python: 332,722; ansic: 16,944; sql: 1,208; makefile: 140; sh: 81
file content (208 lines) | stat: -rw-r--r-- 6,670 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# Copyright 2016 by Stephen Marshall.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Parser for the cellosaurus.txt file from ExPASy.

See https://web.expasy.org/cellosaurus/

Tested with the release of Version 18 (July 2016).

Functions:
 - read       Reads a file containing one cell line entry
 - parse      Reads a file containing multiple cell line entries

Classes:
 - Record     Holds cell line data.

Examples
--------
This example downloads the Cellosaurus database and parses it. Note that
urlopen returns a stream of bytes, while the parser expects a stream of text
(str), so we use TextIOWrapper to decode the bytes using the UTF-8 encoding.
This is not needed if you download the cellosaurus.txt file in advance and
open it in text mode (see the comment below).

    >>> from urllib.request import urlopen
    >>> from io import TextIOWrapper
    >>> from Bio.ExPASy import cellosaurus
    >>> url = "ftp://ftp.expasy.org/databases/cellosaurus/cellosaurus.txt"
    >>> bytestream = urlopen(url)
    >>> textstream = TextIOWrapper(bytestream, "UTF-8")
    >>> # alternatively, use
    >>> # textstream = open("cellosaurus.txt")
    >>> # if you downloaded the cellosaurus.txt file in advance.
    >>> records = cellosaurus.parse(textstream)
    >>> for record in records:
    ...     if 'Homo sapiens' in record['OX'][0]:
    ...         print(record['ID'])  # doctest:+ELLIPSIS
    ...
    #15310-LN
    #W7079
    (L)PC6
    0.5alpha
    ...

"""


def parse(handle):
    """Iterate over the cell line records of a multi-record file.

    Records are read from the handle one at a time and yielded lazily;
    iteration stops as soon as no further record can be read.

    Arguments:
     - handle   - handle to the file.

    """
    # Priming read, then keep yielding until the reader comes back empty.
    entry = __read(handle)
    while entry:
        yield entry
        entry = __read(handle)


def read(handle):
    """Read one cell line record.

    This function is for parsing cell line files containing
    exactly one record.

    Arguments:
     - handle   - handle to the file.

    Returns the parsed record.

    Raises ValueError if the handle contains no record at all, if any
    data is left in the handle after the first record, or if the stream
    ends before the record is terminated.

    """
    record = __read(handle)
    # The reader returns None when the stream holds no entry; a file that
    # is supposed to contain exactly one record must not be silently empty.
    if record is None:
        raise ValueError("No cell line record found")
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one cell line record found")
    return record


class Record(dict):
    """Dictionary holding the fields of one ExPASy Cellosaurus entry.

    The keys are the two-letter line codes of the Cellosaurus flat file:

    =========  ==============================  =======================
    Line code  Content                         Occurrence in an entry
    =========  ==============================  =======================
    ID         Identifier (cell line name)     Once; starts an entry
    AC         Accession (CVCL_xxxx)           Once
    AS         Secondary accession number(s)   Optional; once
    SY         Synonyms                        Optional; once
    DR         Cross-references                Optional; once or more
    RX         References identifiers          Optional; once or more
    WW         Web pages                       Optional; once or more
    CC         Comments                        Optional; once or more
    ST         STR profile data                Optional; twice or more
    DI         Diseases                        Optional; once or more
    OX         Species of origin               Once or more
    HI         Hierarchy                       Optional; once or more
    OI         Originate from same individual  Optional; once or more
    SX         Sex of cell                     Optional; once
    AG         Age of donor at sampling        Optional; once
    CA         Category                        Once
    DT         Date (entry history)            Once
    //         Terminator                      Once; ends an entry
    =========  ==============================  =======================

    A freshly created record already contains every key except the
    terminator: ID, AC, AS, SY, SX, AG, CA and DT start as empty strings,
    all other keys start as empty lists.

    """

    def __init__(self):
        """Create a record with every field present and empty."""
        # Keyword order defines the key order of the dictionary; the list
        # literals are evaluated per call, so instances never share them.
        super().__init__(
            ID="",
            AC="",
            AS="",
            SY="",
            DR=[],
            RX=[],
            WW=[],
            CC=[],
            ST=[],
            DI=[],
            OX=[],
            HI=[],
            OI=[],
            SX="",
            AG="",
            CA="",
            DT="",
        )

    def __repr__(self):
        """Return the class name followed by the ID and, if set, the accession."""
        label = self.__class__.__name__
        if not self["ID"]:
            return f"{label} ( )"
        if not self["AC"]:
            return f"{label} ({self['ID']})"
        return f"{label} ({self['ID']}, {self['AC']})"

    def __str__(self):
        """Return all fields on one line as space-separated ``KEY: value`` pairs."""
        multi_valued = ("DR", "RX", "WW", "CC", "ST", "DI", "OX", "HI", "OI")
        layout = (
            "ID",
            "AC",
            "AS",
            "SY",
            "DR",
            "RX",
            "WW",
            "CC",
            "ST",
            "DI",
            "OX",
            "HI",
            "OI",
            "SX",
            "AG",
            "CA",
            "DT",
        )
        pieces = []
        for code in layout:
            content = self[code]
            if code in multi_valued:
                # list-valued fields are shown through their repr
                content = repr(content)
            pieces.append(code + ": " + content)
        return " ".join(pieces)


# Everything below is private


# Line codes whose values are accumulated into a single string.
_TEXT_KEYS = frozenset(("AC", "AS", "SY", "SX", "AG", "CA", "DT"))
# Line codes whose values are collected into a list, one item per line.
_LIST_KEYS = frozenset(("RX", "WW", "CC", "ST", "DI", "OX", "HI", "OI"))


def __read(handle):
    """Read the next cell line record from the handle.

    Each line is split into a two-character line code (columns 1-2) and a
    value (column 6 onwards, trailing whitespace removed). An ``ID`` line
    starts a new record, ``//`` ends it. Lines with an unrecognized line
    code are ignored, as are ``//`` lines met before any ``ID`` line.

    Arguments:
     - handle   - iterable yielding the lines of the file.

    Returns the record, or None if the handle is exhausted before any
    record has started.

    Raises ValueError if a field line appears before the ``ID`` line of an
    entry, if a ``DR`` line has no ";" separating database and identifier,
    or if the stream ends in the middle of a record.

    """
    record = None

    for line in handle:
        key, value = line[:2], line[5:].rstrip()
        if key == "ID":
            record = Record()
            record["ID"] = value
        elif key == "//":
            if record is not None:
                return record
            # A terminator with no open record: nothing to return yet.
        elif key == "DR" or key in _TEXT_KEYS or key in _LIST_KEYS:
            if record is None:
                # Without this check the code below fails with an opaque
                # "'NoneType' object is not subscriptable" TypeError.
                raise ValueError(
                    f"Found {key!r} line before the ID line of an entry: "
                    f"{line.rstrip()!r}"
                )
            if key in _TEXT_KEYS:
                record[key] += value
            elif key in _LIST_KEYS:
                # just append to the fields defined as lists, not to strings
                record[key].append(value)
            else:
                # DR lines read "database; identifier". Split on the first
                # ";" only, so an identifier that itself contains ";" does
                # not break the parser.
                database, separator, identifier = value.partition(";")
                if not separator:
                    raise ValueError(
                        f"Malformed DR line, expected 'database; identifier': "
                        f"{value!r}"
                    )
                record["DR"].append((database.strip(), identifier.strip()))
    if record is not None:
        raise ValueError("Unexpected end of stream")
    return None


if __name__ == "__main__":
    # Script entry point: when this file is executed directly, hand off to
    # the run_doctest helper from Bio._utils (presumably it runs the doctest
    # examples embedded in this module - confirm against Bio._utils).
    from Bio._utils import run_doctest

    run_doctest()