File: nbrf_io.py

package info (click to toggle)
python-weblogo 3.8.0-2
links: PTS, VCS
area: main
in suites: sid, trixie
size: 3,700 kB
sloc: xml: 14,455; python: 10,384; sh: 140; makefile: 58
file content (178 lines) | stat: -rwxr-xr-x 6,006 bytes
#  Copyright (c) 2006, The Regents of the University of California, through
#  Lawrence Berkeley National Laboratory (subject to receipt of any required
#  approvals from the U.S. Dept. of Energy).  All rights reserved.

#  This software is distributed under the new BSD Open Source License.
#  <http://www.opensource.org/licenses/bsd-license.html>
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#
#  (1) Redistributions of source code must retain the above copyright notice,
#  this list of conditions and the following disclaimer.
#
#  (2) Redistributions in binary form must reproduce the above copyright
#  notice, this list of conditions and the following disclaimer in the
#  documentation and or other materials provided with the distribution.
#
#  (3) Neither the name of the University of California, Lawrence Berkeley
#  National Laboratory, U.S. Dept. of Energy nor the names of its contributors
#  may be used to endorse or promote products derived from this software
#  without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
#  POSSIBILITY OF SUCH DAMAGE.

"""Sequence IO for NBRF/PIR format.

The format is similar to fasta. The header line consistins of '>', a two-
letter sequence type (P1, F1, DL, DC, RL, RC, or XX), a semicolon, and a
sequence ID. The next line is a textual description of the sequence,
followed by one or more lines containing the sequence data. The end of
the sequence is marked by a "*" (asterisk) character.

type_code -- A map between NBRF two letter type codes and Alphabets.


see:  http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html

--- Example NBRF File ---

>P1;CRAB_ANAPL
ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
  MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
  SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
  GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
  SDVPERSIPI TREEKPAIAG AQRK*

>P1;CRAB_BOVIN
ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
  MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
  PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
  HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
  QASGPERTIP ITREEKPAVT AAPKK*

"""

from typing import Iterator, Optional, TextIO

from ..seq import (
    Alphabet,
    Seq,
    SeqList,
    dna_alphabet,
    generic_alphabet,
    protein_alphabet,
    rna_alphabet,
)

names = (
    "nbrf",
    "pir",
)
extensions = ("nbrf", "pir", "ali")

type_code = {
    "P1": protein_alphabet,  # Protein (complete)
    "F1": protein_alphabet,  # Protein (fragment)
    "DL": dna_alphabet,  # DNA (linear)
    "DC": dna_alphabet,  # DNA (circular)
    "RC": rna_alphabet,  # RNA (linear)
    "RL": rna_alphabet,  # RNA (circular)
    "N3": rna_alphabet,  # tRNA
    "N1": rna_alphabet,  # other functional RNA
    "XX": generic_alphabet,
}


def read(fin: TextIO, alphabet: Optional[Alphabet] = None) -> SeqList:
    """Read and parse a NBRF sequence file.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data. If not supplied, then
                an appropriate alphabet will be inferred from the data.
    Returns:
        SeqList -- A list of sequences
    Raises:
        ValueError -- If the file is unparsable
    """
    seqs = [s for s in iterseq(fin, alphabet)]
    return SeqList(seqs)


def iterseq(fin: TextIO, alphabet: Optional[Alphabet] = None) -> Iterator[Seq]:
    """Generate sequences from an NBRF file.

    arguments:
        fin -- A stream or file to read
        alphabet --
    yields :
        Seq
    raises :
        ValueError -- On a parse error.
    """

    body, header, sequence = range(3)  # Internal states

    state = body
    seq_id = None
    seq_desc = None
    seq_alpha = None
    seqs = []

    for lineno, line in enumerate(fin):
        if state == body:
            if line == "" or line.isspace():
                continue
            if line[0] == ">":
                seq_type, seq_id = line[1:].split(";")
                if alphabet:
                    seq_alpha = alphabet
                else:
                    seq_alpha = type_code[seq_type]
                state = header
                continue
            raise ValueError("Parse error on line: %d" % lineno)

        elif state == header:
            seq_desc = line.strip()
            state = sequence
            continue

        elif state == sequence:
            data = "".join(line.split())  # Strip out white space
            if data[-1] == "*":
                # End of sequence data
                seqs.append(data[:-1])

                seq = Seq(
                    "".join(seqs),
                    name=seq_id.strip(),  # type: ignore
                    description=seq_desc,
                    alphabet=seq_alpha,
                )

                yield seq
                state = body
                seq_id = None
                seq_desc = None
                seqs = []
                continue
            else:
                seqs.append(data)
                continue
        else:
            # If we ever get here something has gone terrible wrong
            assert False  # pragma: no cover

    # end for