File: Prosite.py

package info (click to toggle)
python-biopython 1.68%2Bdfsg-3~bpo8%2B1
links: PTS, VCS
area: main
in suites: jessie-backports
size: 46,856 kB
sloc: python: 160,306; xml: 93,216; ansic: 9,118; sql: 1,208; makefile: 155; sh: 63
file content (300 lines) | stat: -rw-r--r-- 11,114 bytes
parent folder | download | duplicates (2)
# Copyright 1999 by Jeffrey Chang.  All rights reserved.
# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# Revisions Copyright 2007 by Peter Cock.  All rights reserved.
# Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""
This module provides code to work with the prosite dat file from
Prosite.
http://www.expasy.ch/prosite/

Tested with:
Release 20.43, 10-Feb-2009


Functions:

    - read                  Reads a Prosite file containing one Prosite record
    - parse                 Iterates over records in a Prosite file.

Classes:

    - Record                Holds Prosite data.
"""


def parse(handle):
    """Iterate over the records of a multi-record Prosite file.

    Records are read lazily, one at a time, and yielded as Record
    objects.  Iteration stops as soon as no further record can be read
    from the handle.

    handle   - handle to the file."""
    record = __read(handle)
    while record:
        yield record
        # Only fetch the next record once the caller asks for it.
        record = __read(handle)


def read(handle):
    """Return the single record stored in a Prosite file.

    Use this for files that hold exactly one record.  Anything left in
    the handle after that record is treated as an error: a ValueError is
    raised.

    handle   - handle to the file."""
    record = __read(handle)
    # After one complete record the handle must be exhausted.
    if handle.read():
        raise ValueError("More than one Prosite record found")
    return record


class Record(object):
    """Holds information from a Prosite record.

    Every member listed here is initialised by the constructor, so it can
    always be read, even when the corresponding line or qualifier was
    missing from the parsed entry.

    Members:

        - name           ID of the record.  e.g. ADH_ZINC
        - type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
        - accession      e.g. PS00387
        - created        Date the entry was created.  (MMM-YYYY)
        - data_update    Date the 'primary' data was last updated.
        - info_update    Date data other than 'primary' data was last updated.
        - pdoc           ID of the PROSITE DOCumentation.

        - description    Free-format description.
        - pattern        The PROSITE pattern.  See docs.
        - matrix         List of strings that describes a matrix entry.
        - rules          List of rule definitions (from RU lines).  (strings)
        - prorules       List of prorules (from PR lines). (strings)
        - postprocessing List of items from PP lines. (strings)

    NUMERICAL RESULTS

        - nr_sp_release  SwissProt release.
        - nr_sp_seqs     Number of seqs in that release of Swiss-Prot. (int)
        - nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
        - nr_positive    True positives.  tuple of (hits, seqs)
        - nr_unknown     Could be positives.  tuple of (hits, seqs)
        - nr_false_pos   False positives.  tuple of (hits, seqs)
        - nr_false_neg   False negatives.  (int)
        - nr_partial     False negatives, because they are fragments. (int)

    COMMENTS

        - cc_taxo_range  Taxonomic range.  See docs for format
        - cc_max_repeat  Maximum number of repetitions in a protein
        - cc_site        Interesting site.  list of tuples (pattern pos, desc.)
        - cc_skip_flag   Can this entry be ignored?
        - cc_matrix_type
        - cc_scaling_db
        - cc_author
        - cc_ft_key
        - cc_ft_desc
        - cc_version     version number (introduced in release 19.0)

    The following are all lists of tuples (swiss-prot accession, swiss-prot name).

    DATA BANK REFERENCES

        - dr_positive
        - dr_false_neg
        - dr_false_pos
        - dr_potential   Potential hits, but fingerprint region not yet available.
        - dr_unknown     Could possibly belong
        - pdb_structs    List of PDB entries.

    """
    def __init__(self):
        """Create an empty record with every documented member set."""
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These qualifiers are optional in CC lines; give them the same
        # empty-string default as the other CC members so that reading
        # them never raises AttributeError.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []


# Everything below this line is private to this module.

def __read(handle):
    """Read the next Prosite record from a handle.

    Lines are consumed from ``handle`` up to and including the '//' line
    that terminates a record.  A '//' line seen before any 'ID' line (the
    end of the copyright statement) is skipped.

    handle   - iterable yielding the lines of a Prosite file.

    Returns the populated Record, or None when the handle is exhausted
    before a '//'-terminated record has been read.

    Raises ValueError if a line starts with an unknown keyword, if an
    NR or CC line carries an unknown qualifier, or if an ID, DT, NR or
    DR line cannot be understood.
    """
    import re

    # Suffixes expected, in this order, on the three fields of a DT line.
    date_suffixes = ('(CREATED)', '(DATA UPDATE)', '(INFO UPDATE)')
    # NR qualifiers whose value has the form "hits(seqs)".
    nr_hit_fields = {
        '/TOTAL': 'nr_total',
        '/POSITIVE': 'nr_positive',
        '/UNKNOWN': 'nr_unknown',
        '/FALSE_POS': 'nr_false_pos',
    }
    # CC qualifiers whose value is stored unchanged as a string.
    cc_text_fields = {
        '/TAXO-RANGE': 'cc_taxo_range',
        '/MAX-REPEAT': 'cc_max_repeat',
        '/SKIP-FLAG': 'cc_skip_flag',
        '/MATRIX_TYPE': 'cc_matrix_type',
        '/SCALING_DB': 'cc_scaling_db',
        '/AUTHOR': 'cc_author',
        '/FT_KEY': 'cc_ft_key',
        '/FT_DESC': 'cc_ft_desc',
        '/VERSION': 'cc_version',
    }
    # DR type flag -> name of the Record list receiving the reference.
    dr_lists = {
        'T': 'dr_positive',
        'F': 'dr_false_pos',
        'N': 'dr_false_neg',
        'P': 'dr_potential',
        '?': 'dr_unknown',
    }

    record = None
    for line in handle:
        # Two-letter line code, three spaces, then the payload.
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')    # don't want '.'
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            dates = value.rstrip('.').split("; ")
            # Check the field count first so that a short line is reported
            # as a parse error rather than as an IndexError.
            if len(dates) < 3 or not all(
                    date.endswith(suffix)
                    for date, suffix in zip(dates, date_suffixes)):
                raise ValueError("I don't understand date line\n%s" % line)
            # Cut the suffix off by length: str.rstrip(chars) removes a
            # *set* of characters and could eat into the date itself.
            created, data_update, info_update = [
                date[:-len(suffix)].rstrip()
                for date, suffix in zip(dates, date_suffixes)]
            record.created = created
            record.data_update = data_update
            record.info_update = info_update
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            # Patterns may continue over several PA lines.
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            for col in value.split(";"):
                if not col:
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/RELEASE':
                    release, seqs = data.split(",")
                    record.nr_sp_release = release
                    record.nr_sp_seqs = int(seqs)
                elif qual == '/FALSE_NEG':
                    record.nr_false_neg = int(data)
                elif qual == '/PARTIAL':
                    record.nr_partial = int(data)
                elif qual in nr_hit_fields:
                    m = re.match(r'(\d+)\((\d+)\)', data)
                    if not m:
                        # ValueError, like every other parse failure here.
                        raise ValueError("Broken data %s in comment line\n%s"
                                         % (repr(data), line))
                    hits = tuple(map(int, m.groups()))
                    setattr(record, nr_hit_fields[qual], hits)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'CC':
            # Expect CC lines like this:
            # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
            # Can (normally) split on ";" and then on "="
            for col in value.split(";"):
                if not col or col[:17] == 'Automatic scaling':
                    # DNAJ_2 in Release 15 has a non-standard comment line:
                    # CC   Automatic scaling using reversed database
                    # Throw it away.  (Should I keep it?)
                    continue
                if col.count("=") == 0:
                    # Missing qualifier!  Can we recover gracefully?
                    # For example, from Bug 2403, in PS50293 have:
                    # CC /AUTHOR=K_Hofmann; N_Hulo
                    # This also lets the free-text CC lines of the
                    # copyright statement pass through untouched.
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/SITE':
                    pos, desc = data.split(",")
                    record.cc_site.append((int(pos), desc))
                elif qual in cc_text_fields:
                    setattr(record, cc_text_fields[qual], data)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'DR':
            for ref in value.split(";"):
                if not ref:
                    continue
                acc, name, flag = [word.strip() for word in ref.split(",")]
                if flag not in dr_lists:
                    raise ValueError("I don't understand type flag %s" % flag)
                getattr(record, dr_lists[flag]).append((acc, name))
        elif keyword == '3D':
            record.pdb_structs.extend(
                pdb_id.rstrip(';') for pdb_id in value.split())
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        elif keyword == '//':
            if not record:
                # Then this was the copyright statement
                continue
            break
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    else:
        # Input ran out without reaching a record terminator.
        # NOTE(review): a final record that lacks its '//' line is
        # discarded here rather than returned or reported -- confirm
        # that this is the intended handling of truncated files.
        return None
    # The loop is only left via ``break``, which requires a record.
    return record