File: genbank_io.py

package info (click to toggle)

python-weblogo 3.8.0-2

links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 3,700 kB
sloc: xml: 14,455; python: 10,384; sh: 140; makefile: 58

file content (70 lines) | stat: -rwxr-xr-x 1,782 bytes

#!/usr/bin/env python


"""Read GenBank flat files.

Currently only reads sequence data and not annotations.

"""

from typing import Iterator, Optional, TextIO

from ..seq import Alphabet, Seq, SeqList
from ..utils import isblank

# Format name(s) this module declares for the GenBank flat-file format.
# NOTE(review): presumably consumed by the package's format lookup — confirm.
names = ("genbank",)
# File extensions declared for this format, written without a leading dot.
extensions = ("gb", "genbank", "gbk")


def read(fin: TextIO, alphabet: Optional[Alphabet] = None) -> SeqList:
    """Parse every genbank record in a stream into a list of sequences.

    Args:
    fin -- A stream or file to read
    alphabet -- The expected alphabet of the data, if given

    Returns:
    SeqList -- The sequences of all records, in the order they were read

    Raises:
    ValueError -- If the file is unparsable
    """
    # iterseq is a lazy generator; list() drains it so that all parsing
    # (and any parse error) happens here, before the SeqList is built.
    records = list(iterseq(fin, alphabet))
    return SeqList(records)


def iterseq(fin: TextIO, alphabet: Optional[Alphabet] = None) -> Iterator[Seq]:
    """Iterate over genbank records

    Only sequence data is read. Every line between a record's LOCUS line
    and its ORIGIN line is skipped. A record that ends ("//") without an
    ORIGIN section yields nothing, and a trailing record that is not
    closed by "//" before the end of the input is not yielded.

    Args:
    fin -- A stream or file to read
    alphabet -- The expected alphabet of the data, if given

    Yields:
    Seq -- One alphabetic sequence at a time.

    Raises:
    ValueError -- If the file is unparsable (the first non-blank line
        does not start with "LOCUS")
    """
    alphabet = Alphabet(alphabet)

    # Parser states: before the first LOCUS line, inside a record's
    # annotation block, and inside a record's sequence (ORIGIN) section.
    header, block, data = range(3)
    state = header
    seq: list = []
    # Count from 1 so the reported position matches what an editor shows.
    for lineno, line in enumerate(fin, start=1):
        if isblank(line):
            continue
        if state == header:
            if not line.startswith("LOCUS"):
                raise ValueError("Cannot find start of record at line %d" % lineno)
            state = block
        elif state == block:
            # Only ORIGIN opens the sequence section. A "//" seen here ends
            # a record that has no sequence; staying in this state keeps the
            # next record's LOCUS/annotation lines from being read as data.
            if line.startswith("ORIGIN"):
                state = data
        elif state == data:
            if line.startswith("//"):
                # End of record: emit the accumulated sequence and reset.
                yield Seq("".join(seq), alphabet)
                seq = []
                state = block
            else:
                # Drop the first whitespace-separated field of the row
                # (presumably the position counter) and keep the rest.
                seq.extend(line.split()[1:])