File: StructureAlignment.py

package info (click to toggle)
python-biopython 1.68%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 46,860 kB
  • ctags: 13,237
  • sloc: python: 160,306; xml: 93,216; ansic: 9,118; sql: 1,208; makefile: 155; sh: 63
file content (136 lines) | stat: -rw-r--r-- 4,024 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk)
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Map the residues of two structures to each other based on a FASTA alignment
file.
"""

from __future__ import print_function

from Bio.Data import SCOPData

from Bio.PDB import Selection
from Bio.PDB.Polypeptide import is_aa


class StructureAlignment(object):
    """
    This class aligns two structures based on an alignment of their
    sequences.
    """
    def __init__(self, fasta_align, m1, m2, si=0, sj=1):
        """
        Attributes:

            - fasta_align --- Alignment object
            - m1, m2 --- two models
            - si, sj --- the sequences in the Alignment object that
              correspond to the structures
        """
        l = fasta_align.get_alignment_length()
        # Get the residues in the models
        rl1 = Selection.unfold_entities(m1, 'R')
        rl2 = Selection.unfold_entities(m2, 'R')
        # Residue positions
        p1 = 0
        p2 = 0
        # Map equivalent residues to each other
        map12 = {}
        map21 = {}
        # List of residue pairs (None if -)
        duos = []
        for i in range(0, l):
            column = fasta_align.get_column(i)
            aa1 = column[si]
            aa2 = column[sj]
            if aa1 != "-":
                # Position in seq1 is not -
                while True:
                    # Loop until an aa is found
                    r1 = rl1[p1]
                    p1 = p1 + 1
                    if is_aa(r1):
                        break
                self._test_equivalence(r1, aa1)
            else:
                r1 = None
            if aa2 != "-":
                # Position in seq2 is not -
                while True:
                    # Loop until an aa is found
                    r2 = rl2[p2]
                    p2 = p2 + 1
                    if is_aa(r2):
                        break
                self._test_equivalence(r2, aa2)
            else:
                r2 = None
            if r1:
                # Map residue in seq1 to its equivalent in seq2
                map12[r1] = r2
            if r2:
                # Map residue in seq2 to its equivalent in seq1
                map21[r2] = r1
            # Append aligned pair (r is None if gap)
            duos.append((r1, r2))
        self.map12 = map12
        self.map21 = map21
        self.duos = duos

    def _test_equivalence(self, r1, aa1):
        "Test if aa in sequence fits aa in structure."
        resname = r1.get_resname()
        resname = SCOPData.protein_letters_3to1[resname]
        assert(aa1 == resname)

    def get_maps(self):
        """
        Return two dictionaries that map a residue in one structure to
        the equivealent residue in the other structure.
        """
        return self.map12, self.map21

    def get_iterator(self):
        """
        Iterator over all residue pairs.
        """
        for i in range(0, len(self.duos)):
            yield self.duos[i]


if __name__ == "__main__":
    import sys
    from Bio.Alphabet import generic_protein
    from Bio import AlignIO
    from Bio.PDB import PDBParser

    if len(sys.argv) != 4:
        print("Expects three arguments,")
        print(" - FASTA alignment filename (expect two sequences)")
        print(" - PDB file one")
        print(" - PDB file two")
        sys.exit()

    # The alignment
    fa = AlignIO.read(open(sys.argv[1]), "fasta", generic_protein)

    pdb_file1 = sys.argv[2]
    pdb_file2 = sys.argv[3]

    # The structures
    p = PDBParser()
    s1 = p.get_structure('1', pdb_file1)
    p = PDBParser()
    s2 = p.get_structure('2', pdb_file2)

    # Get the models
    m1 = s1[0]
    m2 = s2[0]

    al = StructureAlignment(fa, m1, m2)

    # Print aligned pairs (r is None if gap)
    for (r1, r2) in al.get_iterator():
        print("%s %s" % (r1, r2))