File: __init__.py | Debian Sources

File: init.py

package info (click to toggle)
python-biopython 1.78%2Bdfsg-4
links: PTS, VCS
area: main
in suites: bullseye
size: 65,756 kB
sloc: python: 221,141; xml: 178,777; ansic: 13,369; sql: 1,208; makefile: 131; sh: 70
file content (334 lines) | stat: -rw-r--r-- 11,230 bytes
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#

"""Parser for FSSP files, used in a database of protein fold classifications (DEPRECATED).

This is a module to handle FSSP files. For now it parses only the header,
summary and alignment sections.

See: Holm and Sander (1996) The FSSP database: fold classification based on
structure-structure alignment of proteins.

Functions
---------
    :read_fssp(file_handle): reads an fssp file into the records. Returns
                            a tuple of three objects: the header, the
                            summary dictionary and the alignment dictionary.

    :mult_align: returns a Biopython alignment object.

"""

import re
import warnings

from . import fssp_rec

from Bio import BiopythonDeprecationWarning

# This call sits at module level, so the deprecation warning is issued when
# the module is imported rather than when any particular function is called.
warnings.warn(
    "Bio.FSSP has been deprecated, and we intend to remove it"
    " in a future release of Biopython. Please contact the Biopython"
    " developers if you need this module.",
    BiopythonDeprecationWarning,
)


# Alias for fssp_rec.fff_rec; read_fssp passes each alignment line through it
# before constructing an FSSPAlignRec.
fff_rec = fssp_rec.fff_rec
# Maps an FSSPHeader attribute name to the pattern recognising the header line
# that carries its value (the keyword must start the line).
# FSSPHeader.fill_header iterates over this table.
header_records = {
    "database": re.compile("^DATABASE"),
    "pdbid": re.compile("^PDBID"),
    "header": re.compile("^HEADER"),
    "compnd": re.compile("^COMPND"),
    "author": re.compile("^AUTHOR"),
    "source": re.compile("^SOURCE"),
    "seqlength": re.compile("^SEQLENGTH"),
    "nalign": re.compile("^NALIGN"),
}

# Patterns used by read_fssp to walk the file. They are always applied with
# .match(), so each one is tested against the beginning of the line.
# Title line of the SUMMARY section.
summary_title = re.compile("## +SUMMARY")
# A summary record: optional spaces, a number followed by ":", spaces, then a
# four-character token made of a digit 1-9 and three characters from [0-9a-z].
summary_rec = re.compile(" *[0-9]+: +[1-9][0-9a-z]{3,3}")
# Title line of an ALIGNMENTS section.
alignments_title = re.compile("## +ALIGNMENTS")
# An alignment record: optional spaces, a number, spaces, then a second number
# that may carry a leading minus sign.
alignments_rec = re.compile(" *[0-9]+ +-{0,1}[0-9]+")
# Title line of the EQUIVALENCES section; read_fssp stops parsing there.
equiv_title = re.compile("## +EQUIVALENCES")


class FSSPHeader:
    """Hold the values parsed from the header lines of an FSSP file."""

    def __init__(self):
        """Create a header whose fields all carry their empty defaults."""
        self.database = None
        self.pdbid = ""
        self.header = ""
        self.compnd = ""
        self.source = ""
        self.author = []
        self.seqlength = 0
        self.nalign = 0

    def fill_header(self, inline):
        """Store the value carried by one header line.

        The line is tested against every pattern in ``header_records``; for
        each pattern that matches, the attribute of the same name is set:

        - ``database``, ``seqlength``, ``nalign``: second whitespace-separated
          token, converted to ``int``.
        - ``compnd``, ``author``: list of all tokens after the first.
        - ``source``, ``header``: text after the first space, stripped.
        - anything else (``pdbid``): second token, kept as a string.

        Lines matching no pattern leave the object unchanged.
        """
        for field, pattern in header_records.items():
            if not pattern.match(inline):
                continue
            if field in ("database", "seqlength", "nalign"):
                value = int(inline.split()[1])
            elif field in ("compnd", "author"):
                value = inline.split()[1:]
            elif field in ("source", "header"):
                # Keep everything after the keyword, inner spacing included.
                value = inline[inline.find(" ") + 1 :].strip()
            else:
                value = inline.split()[1]
            setattr(self, field, value)


class PosAlign:
    """One aligned position: an amino acid letter plus a structure letter."""

    def __init__(self, inStr):
        """Parse a one- or two-character alignment token.

        Surrounding whitespace is ignored. The token ``".."`` is a gap: ``aa``
        becomes ``"-"``, ``gap`` is 1 and no ``ss`` attribute is set.
        Otherwise ``gap`` is 0, ``aa`` is the first character (replaced by
        ``"C"`` when that character equals its own lowercase form) and ``ss``
        is the upper-cased second character, or ``"0"`` when there is none.

        Raises:
            ValueError: if the stripped token is not 1 or 2 characters long.
        """
        token = inStr.strip()
        if len(token) not in (1, 2):
            raise ValueError("PosAlign: length not 2 chars" + token)

        if token == "..":
            self.aa = "-"
            self.gap = 1
            return

        self.gap = 0
        residue = token[0]
        self.aa = "C" if residue == residue.lower() else residue
        self.ss = token[1].upper() if len(token) == 2 else "0"

    def __repr__(self):
        """Return ``".."`` for a gap, else the residue plus lower-cased structure."""
        return ".." if self.gap else self.aa + self.ss.lower()


class FSSPSumRec:
    """One record of the SUMMARY section of an FSSP file."""

    # (attribute name, column index, converter) for the typed columns that
    # follow the two PDB identifiers; processed in this order.
    _TYPED_COLUMNS = (
        ("zscore", 3, float),
        ("rmsd", 4, float),
        ("lali", 5, float),
        ("lseq2", 6, float),
        ("pID", 7, float),
        ("revers", 8, int),
        ("permut", 9, int),
        ("nfrag", 10, int),
    )

    @staticmethod
    def _chain_from_id(pdb_field, error_text):
        """Return the chain of a 4- or 5-character PDB field ("0" if absent)."""
        size = len(pdb_field)
        if size == 4:
            return "0"
        if size == 5:
            return pdb_field[4]
        raise ValueError(error_text)

    def __init__(self, in_str):
        """Split a summary line on whitespace and store its columns.

        The untouched line is kept in ``raw``. Column 0 (minus its trailing
        character) becomes ``nr``; columns 1 and 2 give ``pdb1``/``chain1``
        and ``pdb2``/``chain2``; columns 3-10 are converted to numbers;
        column 11 is ``topo``; all remaining columns are joined with single
        spaces into ``doc``, which always ends with a newline.

        Raises:
            ValueError: if a PDB field is not 4 or 5 characters long, or a
                numeric column cannot be converted.
            IndexError: if the line has fewer than 12 columns.
        """
        self.raw = in_str
        fields = in_str.split()

        self.nr = int(fields[0][:-1])

        self.pdb1 = fields[1][:4]
        self.chain1 = self._chain_from_id(fields[1], "Bad PDB ID 1")
        self.pdb2 = fields[2][:4]
        self.chain2 = self._chain_from_id(fields[2], "Bad PDB ID 2")

        for attribute, column, convert in self._TYPED_COLUMNS:
            setattr(self, attribute, convert(fields[column]))

        self.topo = fields[11]
        self.doc = " ".join(fields[12:]) + "\n"

    def __repr__(self):
        """Return the summary line exactly as it was given."""
        return self.raw


class FSSPAlignRec:
    """One record of the ALIGNMENTS section of an FSSP file."""

    def __init__(self, in_fff_rec):
        """Copy the fixed fields out of an alignment record.

        ``in_fff_rec`` is indexed with the field locations published by
        ``fssp_rec.align``. A chain id of a single space is stored as ``"0"``
        and a residue name that equals its own lowercase form is stored as
        ``"C"``. The per-position alignment containers start out empty.
        """
        fields = fssp_rec.align
        self.abs_res_num = int(in_fff_rec[fields.abs_res_num])
        self.pdb_res_num = in_fff_rec[fields.pdb_res_num].strip()

        chain = in_fff_rec[fields.chain_id]
        self.chain_id = "0" if chain == " " else chain

        residue = in_fff_rec[fields.res_name]
        self.res_name = "C" if residue == residue.lower() else residue

        self.ss1 = in_fff_rec[fields.ss1]
        self.turn3 = in_fff_rec[fields.turn3]
        self.turn4 = in_fff_rec[fields.turn4]
        self.turn5 = in_fff_rec[fields.turn5]
        self.pos_align_dict = {}
        self.PosAlignList = []

    def add_align_list(self, align_list):
        """Append a PosAlign built from every token of ``align_list``."""
        self.PosAlignList.extend(PosAlign(token) for token in align_list)

    def pos_align_list2dict(self):
        """Copy the position alignment list into ``pos_align_dict``.

        Entries are keyed by their position in the list, counting from 1.
        """
        self.pos_align_dict.update(enumerate(self.PosAlignList, start=1))


class FSSPAlignDict(dict):
    """Dictionary of alignment records (FSSPAlignRec).

    read_fssp builds each key from the record's chain id, residue name and
    PDB residue number:

    key = align_rec.chain_id + align_rec.res_name + str(align_rec.pdb_res_num)

    Two secondary indexes map residue numbers back to those keys, so a record
    can be reached by either numbering:
    pdb_res_dict: PDB residue number -> key
    abs_res_dict: absolute residue number -> key

    """

    def __init__(self):
        """Create an empty dictionary with empty secondary indexes."""
        # Both indexes hold keys of self, not the records themselves.
        self.pdb_res_dict = {}
        self.abs_res_dict = {}
        self.data = {}

    def build_resnum_list(self):
        """Fill both residue-number indexes from the stored records."""
        for key, record in self.items():
            self.abs_res_dict[record.abs_res_num] = key
            self.pdb_res_dict[record.pdb_res_num] = key

    def abs(self, num):
        """Return the record whose absolute residue number is ``num``."""
        key = self.abs_res_dict[num]
        return self[key]

    def pdb(self, num):
        """Return the record whose PDB residue number is ``num``."""
        key = self.pdb_res_dict[num]
        return self[key]

    def sequence(self, num):
        """Return aligned sequence ``num`` as a string.

        The residues are taken from every record, visited in ascending order
        of absolute residue number.
        """
        return "".join(
            self.abs(res_num).pos_align_dict[num].aa
            for res_num in sorted(self.abs_res_dict)
        )

    def fasta_mult_align(self):
        """Return all aligned sequences as FASTA-style text.

        Each sequence is introduced by a ``"> <number>"`` line. A line break
        is written before every 72nd residue, so the first line of a sequence
        holds 71 residues.
        """
        # One string per aligned sequence; the set of sequences is taken from
        # the record with absolute residue number 1.
        columns = {column: "" for column in self.abs(1).pos_align_dict}
        for record in self.values():
            for column, position in record.pos_align_dict.items():
                columns[column] += position.aa

        pieces = []
        for column in sorted(columns):
            pieces.append("> %d\n" % column)
            for count, residue in enumerate(columns[column], start=1):
                if count % 72 == 0:
                    pieces.append("\n")
                pieces.append(residue)
            pieces.append("\n")
        return "".join(pieces)


class FSSPSumDict(dict):
    """Dictionary of summary records (FSSPSumRec).

    read_fssp stores each record under its record number (NR).
    """


#
# Process a fssp file into its constituents. Return a 3-tuple containing
# the file header, a dictionary of FSSPSumRecs and a dictionary of
# alignment records.
#
def read_fssp(fssp_handle):
    """Process a FSSP file and create the classes containing its parts.

    The handle is consumed line by line with ``readline()``: header lines up
    to the SUMMARY title, then the summary records, then every ALIGNMENTS
    section until the EQUIVALENCES title is reached.

    Args:
        :fssp_handle: object whose ``readline()`` returns the next text line,
                      and an empty string once the input is exhausted.

    Returns:
        :header: Contains the file header and its properties.
        :sum_dict: Contains the summary section.
        :align_dict: Contains the alignments.

    Raises:
        :ValueError: the input ends before a SUMMARY title is found, or
                     before an ALIGNMENTS or EQUIVALENCES title is found.
        :EOFError: the input ends directly after an alignment record.

    """
    header = FSSPHeader()
    sum_dict = FSSPSumDict()
    align_dict = FSSPAlignDict()

    # Header section: everything before the SUMMARY title. readline() keeps
    # returning "" at end of file, so the loop must also stop on an empty
    # line or it would never terminate for input without a SUMMARY title.
    curline = fssp_handle.readline()
    while curline and not summary_title.match(curline):
        header.fill_header(curline)
        curline = fssp_handle.readline()

    if not summary_title.match(curline):
        raise ValueError("Bad FSSP file: no summary record found")
    curline = fssp_handle.readline()  # Line after the title, discard
    curline = fssp_handle.readline()  # Read the next line
    # Process the summary records into a dictionary keyed by record number
    while summary_rec.match(curline):
        cur_sum_rec = FSSPSumRec(curline)
        sum_dict[cur_sum_rec.nr] = cur_sum_rec
        curline = fssp_handle.readline()

    # Outer loop: process everything up to the EQUIVALENCES title record
    while not equiv_title.match(curline):
        # Skip forward to the next section title, giving up at end of file.
        while (
            curline
            and not alignments_title.match(curline)
            and not equiv_title.match(curline)
        ):
            curline = fssp_handle.readline()
        if equiv_title.match(curline):
            break
        if not alignments_title.match(curline):
            # End of file reached without finding either title.
            raise ValueError("Bad FSSP file: no alignments title record found")

        # If we got to this point, this means that we have matched an
        # alignments title. Parse the alignment records in a loop.
        curline = fssp_handle.readline()  # Line after the title, discard
        curline = fssp_handle.readline()  # Read the next line
        while alignments_rec.match(curline):
            align_rec = FSSPAlignRec(fff_rec(curline))
            key = align_rec.chain_id + align_rec.res_name + str(align_rec.pdb_res_num)
            align_list = curline[fssp_rec.align.start_aa_list :].strip().split()
            # The same residue shows up once per ALIGNMENTS section; later
            # occurrences only extend the alignment list of the first record.
            if key not in align_dict:
                align_dict[key] = align_rec
            align_dict[key].add_align_list(align_list)
            curline = fssp_handle.readline()
            if not curline:
                raise EOFError(
                    "Bad FSSP file: unexpected end of file in alignments section"
                )

    # Convert each record's alignment list into its 1-based dictionary and
    # drop the list, then build the residue-number indexes.
    for record in align_dict.values():
        record.pos_align_list2dict()
        del record.PosAlignList
    align_dict.build_resnum_list()
    return (header, sum_dict, align_dict)