File: IUPAC.py

package info (click to toggle)
python-biopython 1.64%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 44,416 kB
  • ctags: 12,472
  • sloc: python: 153,759; xml: 67,286; ansic: 9,003; sql: 1,488; makefile: 144; sh: 59
file content (105 lines) | stat: -rw-r--r-- 3,114 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Copyright 2000-2001 by Andrew Dalke.
# Revisions copyright 2008 by Peter Cock.
# All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Standard nucleotide and protein alphabets defined by IUPAC."""

from Bio import Alphabet
from Bio.Data import IUPACData

##################### Protein

# From the IUPAC definition at:
#   http://www.chem.qmw.ac.uk/iupac/AminoAcid/A2021.html#AA21

assert IUPACData.extended_protein_letters == IUPACData.extended_protein_letters.upper()


class ExtendedIUPACProtein(Alphabet.ProteinAlphabet):
    """Extended uppercase IUPAC protein single letter alphabet including X etc.

    In addition to the standard 20 single letter protein codes, this includes:

    B = "Asx";  Aspartic acid (R) or Asparagine (N)
    X = "Xxx";  Unknown or 'other' amino acid
    Z = "Glx";  Glutamic acid (E) or Glutamine (Q)
    J = "Xle";  Leucine (L) or Isoleucine (I), used in mass-spec (NMR)
    U = "Sec";  Selenocysteine
    O = "Pyl";  Pyrrolysine

    This alphabet is not intended to be used with X for Selenocysteine
    (an ad-hoc standard prior to the IUPAC adoption of U instead).
    """
    letters = IUPACData.extended_protein_letters

extended_protein = ExtendedIUPACProtein()

assert IUPACData.protein_letters == IUPACData.protein_letters.upper()


class IUPACProtein(ExtendedIUPACProtein):
    """Uppercase IUPAC protein single letter alphabet of the 20 standard amino acids."""
    letters = IUPACData.protein_letters

protein = IUPACProtein()

##################### DNA


# The next two are the IUPAC definitions, from:
#   http://www.chem.qmw.ac.uk/iubmb/misc/naseq.html
class IUPACAmbiguousDNA(Alphabet.DNAAlphabet):
    """Uppercase IUPAC ambiguous DNA."""
    letters = IUPACData.ambiguous_dna_letters

ambiguous_dna = IUPACAmbiguousDNA()


class IUPACUnambiguousDNA(IUPACAmbiguousDNA):
    """Uppercase IUPAC unambiguous DNA (letters GATC only)."""
    letters = IUPACData.unambiguous_dna_letters

unambiguous_dna = IUPACUnambiguousDNA()


# Also from the URL, but not part of the standard
class ExtendedIUPACDNA(Alphabet.DNAAlphabet):
    """Extended IUPAC DNA alphabet.

    In addition to the standard letter codes GATC, this includes:

    B = 5-bromouridine
    D = 5,6-dihydrouridine
    S = thiouridine
    W = wyosine
    """
    letters = IUPACData.extended_dna_letters

extended_dna = ExtendedIUPACDNA()

##################### RNA


class IUPACAmbiguousRNA(Alphabet.RNAAlphabet):
    """Uppercase IUPAC ambiguous RNA."""
    letters = IUPACData.ambiguous_rna_letters

ambiguous_rna = IUPACAmbiguousRNA()


class IUPACUnambiguousRNA(IUPACAmbiguousRNA):
    """Uppercase IUPAC unambiguous RNA (letters GAUC only)."""
    letters = IUPACData.unambiguous_rna_letters

unambiguous_rna = IUPACUnambiguousRNA()

# are there extended forms?
#class ExtendedIUPACRNA(Alphabet.RNAAlphabet):
#    letters = extended_rna_letters
#    #   B == 5-bromouridine
#    #   D == 5,6-dihydrouridine
#    #   S == thiouridine
#    #   W == wyosine