File: kabat_format.py

package info (click to toggle)
python-biopython 1.42-2
links: PTS
area: main
in suites: etch, etch-m68k
size: 17,584 kB
ctags: 12,272
sloc: python: 80,461; xml: 13,834; ansic: 7,902; cpp: 1,855; sql: 1,144; makefile: 203
file content (139 lines) | stat: -rw-r--r-- 5,473 bytes
parent folder | download | duplicates (2)
# Copyright 2001 by Katharine Lindner.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Martel based parser to read Kabat formatted files.

This is a huge regular regular expression for Kabat, built using
the 'regular expressiona on steroids' capabilities of Martel.

http://immuno.bme.nwu.edu/

Notes:
Just so I remember -- the new end of line syntax is:
  New regexp syntax - \R
     \R    means "\n|\r\n?"
     [\R]  means "[\n\r]"

This helps us have endlines be consistent across platforms.

"""
# standard library
#http://immuno.bme.nwu.edu/seqhunt.html
import string

# Martel
import Martel
from Martel import RecordReader

# - useful constants for dealing with the blank space in GenBank documents
# this is useful since blank space can be significant in GenBank flat files.
INDENT = 12
FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21

# --- first set up some helper constants and functions
blank_space = Martel.MaxRepeat(Martel.Str(" "), 1, 80 )
lower_alpha = Martel.Any( "abcdefghijklmnopqrstuvwxyz" )
alpha = Martel.NoCase( lower_alpha )
date = Martel.Group("date",
                    Martel.Re("[-\w]+"))
amino_3_letter_codes = [ 'ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'PHE', 'GLY',\
                'HIS', 'ILE', 'LYS', 'LEU', 'MET', 'ASN', 'PRO', 'GLN', \
                'ARG', 'SER','THR', 'VAL', 'TRP', 'TYR', '---' ]
# from http://www.lmb.uni-muenchen.de/groups/bioinformatics/02/ch_02.html

amino_1_letter_codes = 'ACDEFGHIKLMNPQRSTVWY-'

nucleotides = 'gcat-'

amino_alts = map( Martel.Str, amino_3_letter_codes )

codon = Martel.Group( "codon", Martel.MaxRepeat( Martel.Any( nucleotides ), 1, 3 ) )
amino_3_letter_code = reduce( Martel.Alt, amino_alts )
amino_1_letter_code = Martel.Group( "amino_1_letter_code", \
    Martel.Any( amino_1_letter_codes ) )
amino_codes = Martel.Opt( amino_3_letter_code + blank_space + amino_1_letter_code )

residue =   blank_space + codon + blank_space + amino_codes


kabatid = Martel.Group("kabatid",
                    Martel.Rep1(Martel.Integer()))
pubmed_num = Martel.Rep1(Martel.Integer())
residue_num = Martel.Rep1(Martel.Integer())
kabat_num = Martel.Rep1(Martel.Integer()) + Martel.Opt( alpha )
id_line = Martel.Str("KADBID") + \
          blank_space + \
          kabatid + \
          Martel.ToEol()
creation_date_line =  Martel.Str( "CREATD" ) + \
                      blank_space + \
                      Martel.ToEol("creation_date")
last_mod_date_line =  Martel.Str( "LMODIF" ) + \
                      blank_space + \
                      Martel.ToEol("last_mod_date")
definition_line = Martel.Str( "DEFINI" ) + \
                  blank_space + \
                  Martel.ToEol( "definition" )
species_line = Martel.Str( "SPECIE" ) + \
               blank_space + \
               Martel.ToEol( "species" )
amino_acid_sequence_name_line = Martel.Str( "AANAME" ) + \
                                blank_space + \
                                Martel.ToEol( "amino_acid_sequence_name" )

nucleotide_sequence_name_line = Martel.Str( "NNNAME" ) + \
                                blank_space + \
                                Martel.ToEol( "nucleotide_sequence_name" )
amino_acid_ref_author_line = Martel.Str( "AAREFA" ) + \
                             Martel.ToEol( "amino_acid_ref_author" )
amino_acid_ref_journal_line = Martel.Str( "AAREFJ" ) + \
                              Martel.ToEol( "amino_acid_ref_journal" )
amino_acid_ref_pubmed_line = Martel.Str( "AAPUBM" ) +  \
                             Martel.ToEol( "amino_acid_ref_pubmed" )
nucleotide_ref_author_line = Martel.Str( "NNREFA" ) +  \
                             Martel.ToEol( "nucleotide_ref_author" )
nucleotide_ref_journal_line = Martel.Str( "NNREFJ" ) + \
                              Martel.ToEol( "nucleotide_ref_journal" )
nucleotide_ref_pubmed_line = Martel.Str( "NNPUBM" ) +  \
                             Martel.ToEol("nucleotide_ref_pubmed")
annotation_key = Martel.Group( "annotation_key", Martel.RepN( Martel.Re( "\w" ), 4 ) )
annotation_val = Martel.Group( "annotation_val", Martel.ToEol() )
annotation_item = annotation_key + \
                  blank_space + \
                  annotation_val

annotation_line = Martel.Str( "ANNOTA" ) + \
                  blank_space + annotation_item
residue_line = Martel.Str( "SEQTPA" ) + \
               blank_space + \
               residue_num + \
               blank_space + \
               kabat_num + \
               Martel.Opt( residue ) + \
               Martel.ToEol( )
kabat_record_end_line = Martel.Str( "RECEND" ) + \
                        Martel.ToEol()
amino_acid_ref = amino_acid_ref_author_line + \
                 amino_acid_ref_journal_line + \
                 amino_acid_ref_pubmed_line
nucleotide_ref = nucleotide_ref_author_line + \
                 nucleotide_ref_journal_line + \
                 nucleotide_ref_pubmed_line
residue_lines = Martel.Rep( residue_line )
annotation = Martel.Group( "annotation", Martel.Rep( annotation_line ) )
refs = Martel.Rep1( amino_acid_ref | nucleotide_ref )
kabat_record = id_line + creation_date_line + last_mod_date_line + \
               definition_line + species_line + amino_acid_sequence_name_line + \
               nucleotide_sequence_name_line + refs + annotation +  \
               residue_lines + kabat_record_end_line