File: IUPAC.py

package info (click to toggle)
python-biopython 1.73%2Bdfsg-1
links: PTS, VCS
area: main
in suites: buster
size: 57,852 kB
sloc: python: 169,977; xml: 97,539; ansic: 15,653; sql: 1,208; makefile: 159; sh: 63
file content (125 lines) | stat: -rw-r--r-- 3,255 bytes
# Copyright 2000-2001 by Andrew Dalke.
# Revisions copyright 2008 by Peter Cock.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Standard nucleotide and protein alphabets defined by IUPAC."""

from Bio import Alphabet
from Bio.Data import IUPACData


# #################### Protein

# From the IUPAC definition at:
# http://www.chem.qmw.ac.uk/iupac/AminoAcid/A2021.html#AA21

# Sanity check: the extended protein letters must already be uppercase.
assert (IUPACData.extended_protein_letters ==
        IUPACData.extended_protein_letters.upper())


class ExtendedIUPACProtein(Alphabet.ProteinAlphabet):
    """Extended uppercase IUPAC protein single letter alphabet including X etc.

    In addition to the standard 20 single letter protein codes, this includes:

     - `B` = "Asx";  Aspartic acid (D) or Asparagine (N)
     - `X` = "Xxx";  Unknown or 'other' amino acid
     - `Z` = "Glx";  Glutamic acid (E) or Glutamine (Q)
     - `J` = "Xle";  Leucine (L) or Isoleucine (I), used in mass-spec (NMR)
     - `U` = "Sec";  Selenocysteine
     - `O` = "Pyl";  Pyrrolysine

    This alphabet is not intended to be used with `X` for Selenocysteine
    (an ad-hoc standard prior to the IUPAC adoption of `U` instead).
    """

    # Valid symbols for this alphabet, defined in Bio.Data.IUPACData.
    letters = IUPACData.extended_protein_letters


# Module-level instance of the extended protein alphabet.
extended_protein = ExtendedIUPACProtein()

# The standard protein letters are expected to be all uppercase already.
assert IUPACData.protein_letters.upper() == IUPACData.protein_letters


class IUPACProtein(ExtendedIUPACProtein):
    """Uppercase single letter IUPAC alphabet of the 20 standard amino acids.

    Derived from ExtendedIUPACProtein; only the ``letters`` attribute is
    overridden.
    """

    # Valid symbols for this alphabet, defined in Bio.Data.IUPACData.
    letters = IUPACData.protein_letters


# Module-level instance of the standard protein alphabet.
protein = IUPACProtein()

# #################### DNA


# The next two are the IUPAC definitions, from:
# http://www.chem.qmw.ac.uk/iubmb/misc/naseq.html
class IUPACAmbiguousDNA(Alphabet.DNAAlphabet):
    """IUPAC DNA alphabet including the ambiguity codes (uppercase)."""

    # Valid symbols for this alphabet, defined in Bio.Data.IUPACData.
    letters = IUPACData.ambiguous_dna_letters


# Module-level instance of the ambiguous DNA alphabet.
ambiguous_dna = IUPACAmbiguousDNA()


class IUPACUnambiguousDNA(IUPACAmbiguousDNA):
    """IUPAC DNA alphabet restricted to the letters GATC (uppercase).

    Derived from IUPACAmbiguousDNA; only the ``letters`` attribute is
    overridden.
    """

    # Valid symbols for this alphabet, defined in Bio.Data.IUPACData.
    letters = IUPACData.unambiguous_dna_letters


# Module-level instance of the unambiguous DNA alphabet.
unambiguous_dna = IUPACUnambiguousDNA()


# Also from the URL, but not part of the standard
class ExtendedIUPACDNA(Alphabet.DNAAlphabet):
    """IUPAC DNA alphabet extended with four additional letter codes.

    Besides the standard letter codes GATC, the following are included:

     - `B` = 5-bromouridine
     - `D` = 5,6-dihydrouridine
     - `S` = thiouridine
     - `W` = wyosine
    """

    # Valid symbols for this alphabet, defined in Bio.Data.IUPACData.
    letters = IUPACData.extended_dna_letters


# Module-level instance of the extended DNA alphabet.
extended_dna = ExtendedIUPACDNA()

# #################### RNA


class IUPACAmbiguousRNA(Alphabet.RNAAlphabet):
    """IUPAC RNA alphabet including the ambiguity codes (uppercase)."""

    # Valid symbols for this alphabet, defined in Bio.Data.IUPACData.
    letters = IUPACData.ambiguous_rna_letters


# Module-level instance of the ambiguous RNA alphabet.
ambiguous_rna = IUPACAmbiguousRNA()


class IUPACUnambiguousRNA(IUPACAmbiguousRNA):
    """IUPAC RNA alphabet restricted to the letters GAUC (uppercase).

    Derived from IUPACAmbiguousRNA; only the ``letters`` attribute is
    overridden.
    """

    # Valid symbols for this alphabet, defined in Bio.Data.IUPACData.
    letters = IUPACData.unambiguous_rna_letters


# Module-level instance of the unambiguous RNA alphabet.
unambiguous_rna = IUPACUnambiguousRNA()

# Are there extended forms for RNA? A possible definition is sketched below:
# class ExtendedIUPACRNA(Alphabet.RNAAlphabet):
#    letters = extended_rna_letters
#    #   B == 5-bromouridine
#    #   D == 5,6-dihydrouridine
#    #   S == thiouridine
#    #   W == wyosine