File: embl.py

package info (click to toggle)

python-biopython 1.42-2

links: PTS
area: main
in suites: etch, etch-m68k
size: 17,584 kB
ctags: 12,272
sloc: python: 80,461; xml: 13,834; ansic: 7,902; cpp: 1,855; sql: 1,144; makefile: 203

file content (84 lines) | stat: -rw-r--r-- 2,911 bytes

parent folder | download | duplicates (2)

# Not clear on the distinction, if any, between 'embl' and 'embl/65'.  This
# code might apply to either or both.

# See 'http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html' for a
# definition of this file format.

# This code only makes a best effort--the output may not be strictly valid.
# So, for example, the EMBL ID is supposed to be alphanumeric, starting with a
# letter, but we don't check for this, etc.


# Example:
# ID   AA03518    standard; DNA; FUN; 237 BP.
# XX
# AC   U03518;
# XX
# DE   Aspergillus awamori internal transcribed spacer 1 (ITS1) and 18S
# DE   rRNA and 5.8S rRNA genes, partial sequence.
# XX
# SQ   Sequence 237 BP; 41 A; 77 C; 67 G; 52 T; 0 other;
#      aacctgcgga aggatcatta ccgagtgcgg gtcctttggg cccaacctcc catccgtgtc        60
#      tattgtaccc tgttgcttcg gcgggcccgc cgcttgtcgg ccgccggggg ggcgcctctg       120
#      ccccccgggc ccgtgcccgc cggagacccc aacacgaaca ctgtctgaaa gcgtgcagtc       180
#      tgagttgatt gaatgcaatc agttaaaact ttcaacaatg gatctcttgg ttccggc          237
# //


import textwrap

from Bio import Alphabet
from Bio import Writer

class WriteEmbl(Writer.Writer):
    """Write sequence records in a best-effort EMBL flat-file layout.

    Each call to write() emits an ID line, the description wrapped onto DE
    lines, an SQ summary line with per-letter counts, the sequence itself in
    rows of six 10-letter blocks, and a terminating "//" line.  Field values
    (such as the record id) are written as given, without validation.
    """

    def __init__(self, outfile):
        """Initialise the writer; `outfile` is passed to Writer.Writer.

        write() sends its output through self.outfile (presumably set by the
        base class from this argument) using only its write() method.
        """
        Writer.Writer.__init__(self, outfile)

    def write(self, record):
        """Write one record to self.outfile as a single EMBL entry.

        Reads record.seq (its .alphabet and .data), record.id and
        record.description.

        Raises AssertionError when the sequence alphabet's size is not 1.
        """
        seq = record.seq
        # Format the size with %s, not %d: if the size is not an integer
        # (e.g. None), %d would raise TypeError while building the message
        # and hide the AssertionError we actually want to report.
        assert seq.alphabet.size == 1, \
            "cannot handle alphabet of size %s" % (seq.alphabet.size,)
        data = seq.data
        total = len(data)
        upperdata = data.upper()

        # It'd be nice if the alphabet was usefully set, but for many
        # interesting cases (e.g., reading from FASTA files), it's not, so
        # anything that is not explicitly an RNA alphabet is written as DNA.
        if isinstance(seq.alphabet, Alphabet.RNAAlphabet):
            molecule = 'mRNA'
            letters = ['A', 'C', 'G', 'U']
        else:
            molecule = 'DNA'
            letters = ['A', 'C', 'G', 'T']

        division = 'UNC'                # unknown

        out = self.outfile
        out.write("ID   %s  standard; %s; %s; %d BP.\n"
                  % (record.id, molecule, division, total))

        # 74 columns of text plus the 5-character "DE   " prefix.
        for line in textwrap.wrap(record.description, 74):
            out.write("DE   %s\n" % line)

        # Counting is case-insensitive; everything that is not one of the
        # four expected letters is reported as "other".
        counts = [upperdata.count(letter) for letter in letters]
        othercount = len(upperdata) - sum(counts)
        countstring = ''.join([" %d %s;" % pair
                               for pair in zip(counts, letters)])
        out.write("SQ   Sequence %s BP;%s %d other;\n"
                  % (total, countstring, othercount))

        rowlength = 60
        blocklength = 10
        # Offsets of the blocks inside a row.  range() (not xrange) keeps
        # this working on both Python 2 and Python 3; the list is tiny.
        block_starts = list(range(0, rowlength, blocklength))

        # A while loop instead of xrange()/range() over the whole sequence:
        # it runs on Python 2 and 3 alike and never builds a large list.
        start = 0
        while start < total:
            row = data[start:start + rowlength]
            # Every block is left-justified in blocklength+1 columns, so a
            # short final row is padded and the trailing position number
            # stays right-aligned in the same columns as on full rows.
            blocks = ["%-*s" % (blocklength + 1, row[b:b + blocklength])
                      for b in block_starts]
            out.write("%s%s%9d\n" % (" " * 5, ''.join(blocks),
                                     min(start + rowlength, total)))
            start += rowlength

        out.write("//\n")


# Module-level alias exposing the writer class under the generic name
# `make_writer`.  NOTE(review): presumably looked up by format-dispatch code
# to construct a writer for this format -- confirm against callers.
make_writer = WriteEmbl