File: cellosaurus.py

package info (click to toggle)
python-biopython 1.85+dfsg-4
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 126,372 kB
sloc: xml: 1,047,995; python: 332,722; ansic: 16,944; sql: 1,208; makefile: 140; sh: 81
file content (208 lines) | stat: -rw-r--r-- 6,670 bytes
# Copyright 2016 by Stephen Marshall.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Parser for the cellosaurus.txt file from ExPASy.

See https://web.expasy.org/cellosaurus/

Tested with the release of Version 18 (July 2016).

Functions:
 - read       Reads a file containing one cell line entry
 - parse      Reads a file containing multiple cell line entries

Classes:
 - Record     Holds cell line data.

Examples
--------
This example downloads the Cellosaurus database and parses it. Note that
urlopen returns a stream of bytes, while the parser expects a stream of text
(str), so we use TextIOWrapper to decode the bytes using the UTF-8
encoding. This is not needed if you download the cellosaurus.txt file in
advance and open it in text mode (see the comment below).

    >>> from urllib.request import urlopen
    >>> from io import TextIOWrapper
    >>> from Bio.ExPASy import cellosaurus
    >>> url = "ftp://ftp.expasy.org/databases/cellosaurus/cellosaurus.txt"
    >>> bytestream = urlopen(url)
    >>> textstream = TextIOWrapper(bytestream, "UTF-8")
    >>> # alternatively, use
    >>> # textstream = open("cellosaurus.txt")
    >>> # if you downloaded the cellosaurus.txt file in advance.
    >>> records = cellosaurus.parse(textstream)
    >>> for record in records:
    ...     if 'Homo sapiens' in record['OX'][0]:
    ...         print(record['ID'])  # doctest:+ELLIPSIS
    ...
    #15310-LN
    #W7079
    (L)PC6
    0.5alpha
    ...

"""


def parse(handle):
    """Iterate over the cell line records found in a handle.

    Use this generator for Cellosaurus files that hold several entries.
    Records are read lazily, one per iteration, and iteration stops as
    soon as no further record can be read from the handle.

    Arguments:
     - handle   - handle to the file.

    """
    entry = __read(handle)
    while entry:
        yield entry
        entry = __read(handle)


def read(handle):
    """Read one cell line record.

    This function is for parsing cell line files containing
    exactly one record.

    Arguments:
     - handle   - handle to the file.

    Returns the parsed record, or None if the handle contains no entry
    at all.

    Raises ValueError if any non-blank content follows the first record.

    """
    record = __read(handle)
    # We should have reached the end of the record by now. Trailing blank
    # lines or whitespace after the terminator are harmless and must not be
    # reported as a second record, so only non-blank leftovers are an error.
    remainder = handle.read()
    if remainder.strip():
        raise ValueError("More than one cell line record found")
    return record


class Record(dict):
    """Dictionary holding the fields of one ExPASy Cellosaurus entry.

    A new record is pre-populated with every line code listed below.
    Codes that may occur on several lines (DR, RX, WW, CC, ST, DI, OX, HI,
    OI) start out as empty lists; all other codes start out as empty
    strings.

    =========  ==============================  =======================
    Line code  Content                         Occurrence in an entry
    =========  ==============================  =======================
    ID         Identifier (cell line name)     Once; starts an entry
    AC         Accession (CVCL_xxxx)           Once
    AS         Secondary accession number(s)   Optional; once
    SY         Synonyms                        Optional; once
    DR         Cross-references                Optional; once or more
    RX         References identifiers          Optional; once or more
    WW         Web pages                       Optional; once or more
    CC         Comments                        Optional; once or more
    ST         STR profile data                Optional; twice or more
    DI         Diseases                        Optional; once or more
    OX         Species of origin               Once or more
    HI         Hierarchy                       Optional; once or more
    OI         Originate from same individual  Optional; once or more
    SX         Sex of cell                     Optional; once
    AG         Age of donor at sampling        Optional; once
    CA         Category                        Once
    DT         Date (entry history)            Once
    //         Terminator                      Once; ends an entry
    =========  ==============================  =======================

    """

    # Line codes in the order they are stored and printed, each paired with
    # a flag telling whether the field is a list (True) or a string (False).
    _FIELDS = (
        ("ID", False),
        ("AC", False),
        ("AS", False),
        ("SY", False),
        ("DR", True),
        ("RX", True),
        ("WW", True),
        ("CC", True),
        ("ST", True),
        ("DI", True),
        ("OX", True),
        ("HI", True),
        ("OI", True),
        ("SX", False),
        ("AG", False),
        ("CA", False),
        ("DT", False),
    )

    def __init__(self):
        """Create an empty record with every known line code present."""
        super().__init__()
        for code, is_list in self._FIELDS:
            # A fresh list per field, so records never share storage.
            self[code] = [] if is_list else ""

    def __repr__(self):
        """Return the class name followed by the ID and, if set, the AC."""
        name = self.__class__.__name__
        if not self["ID"]:
            return f"{name} ( )"
        if not self["AC"]:
            return f"{name} ({self['ID']})"
        return f"{name} ({self['ID']}, {self['AC']})"

    def __str__(self):
        """Return all fields on one line as space-separated ``CODE: value`` pairs."""
        pieces = []
        for code, is_list in self._FIELDS:
            content = self[code]
            # List fields are shown via repr(); string fields are used as-is.
            shown = repr(content) if is_list else content
            pieces.append(code + ": " + shown)
        return " ".join(pieces)


# Everything below is private


def __read(handle):
    """Read the next cell line record from the handle (private).

    Lines are consumed from ``handle`` until a ``//`` terminator closes an
    entry that was opened by an ``ID`` line. For each line, the first two
    characters are the line code and everything from the sixth character
    onwards (with trailing whitespace removed) is the value.

    Any line that appears before the first ``ID`` line, such as a file
    header or a stray terminator, is skipped. Lines with an unrecognized
    line code are ignored.

    Arguments:
     - handle   - iterable yielding the lines of the file as strings.

    Returns the parsed Record, or None if the handle is exhausted before
    any entry starts.

    Raises ValueError if the stream ends in the middle of an entry, or if
    a DR line inside an entry contains no ";" separator.

    """
    # Line codes whose values are kept as one string; if such a code occurs
    # on several lines, the values are concatenated.
    text_codes = ("AC", "AS", "SY", "SX", "AG", "CA", "DT")
    # Line codes whose values are collected in a list, one item per line.
    list_codes = ("RX", "WW", "CC", "ST", "DI", "OX", "HI", "OI")

    record = None

    for line in handle:
        key, value = line[:2], line[5:].rstrip()
        if key == "ID":
            record = Record()
            record["ID"] = value
        elif record is None:
            # No entry has been opened yet: this is header text or a stray
            # terminator, so there is nothing to attach the line to.
            continue
        elif key in text_codes:
            record[key] += value
        elif key in list_codes:
            record[key].append(value)
        elif key == "DR":
            # Split on the first ";" only, so that an identifier which
            # itself contains ";" does not break the two-way unpacking.
            database, identifier = value.split(";", 1)
            record["DR"].append((database.strip(), identifier.strip()))
        elif key == "//":
            return record
    if record is not None:
        # An entry was opened by an ID line but never closed by "//".
        raise ValueError("Unexpected end of stream")
    return None


# When this file is executed as a script, hand over to Biopython's
# run_doctest helper (presumably runs the doctests in this module's
# docstrings -- confirm against Bio._utils).
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()