File: cealign.py

package info (click to toggle)
python-biopython 1.85%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 126,372 kB
  • sloc: xml: 1,047,995; python: 332,722; ansic: 16,944; sql: 1,208; makefile: 140; sh: 81
file content (193 lines) | stat: -rw-r--r-- 7,535 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# Copyright (C) 2022, Joao Rodrigues (j.p.g.l.m.rodrigues@gmail.com)
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""Protein Structural Alignment using Combinatorial Extension.

Python code written by Joao Rodrigues. C++ code and Python/C++ interface
adapted from open-source Pymol and originally written by Jason Vertrees.
The original license and notices are available in `cealign` folder.

Reference
---------
Shindyalov, I.N., Bourne P.E. (1998).
"Protein structure alignment by incremental combinatorial extension (CE)
of the optimal path". Protein Engineering. 11 (9): 739–747. PMID 9796821.
"""

import numpy as np

from Bio.PDB.ccealign import run_cealign
from Bio.PDB.PDBExceptions import PDBException
from Bio.PDB.qcprot import QCPSuperimposer

_RESID_SORTER = lambda r: r.id[1]  # noqa: E731


class CEAligner:
    """Protein Structure Alignment by Combinatorial Extension."""

    def __init__(self, window_size=8, max_gap=30):
        """Superimpose one set of atoms onto another using structural data.

        Structures are superimposed using guide atoms, CA and C4', for protein
        and nucleic acid molecules respectively.

        Parameters
        ----------
        window_size : float, optional
            CE algorithm parameter. Used to define paths when building the
            CE similarity matrix. Default is 8.
        max_gap : float, optional
            CE algorithm parameter. Maximum gap size. Default is 30.
        """
        assert window_size > 0, "window_size must be greater than 0"
        assert max_gap >= 0, "max_gap must be positive (or zero)"

        self.window_size = window_size
        self.max_gap = max_gap

        self.rms = None
        self._rigid_motion = None
        self.refcoord = None
        self._coord = None
        self._superimposer = QCPSuperimposer()

    def get_guide_coord_from_structure(self, structure):
        """Return the coordinates of guide atoms in the structure.

        We use guide atoms (C-alpha and C4' atoms) since it is much faster than
        using all atoms in the calculation without a significant loss in
        accuracy.
        """
        coords = []
        # CE algorithm is sensitive to atom ordering. To reproduce Pymol
        # results, sort atoms by chain and then residue number.
        for chain in sorted(structure.get_chains()):
            for resid in sorted(chain, key=_RESID_SORTER):
                if "CA" in resid:
                    coords.append(resid["CA"].coord.tolist())
                elif "C4'" in resid:
                    coords.append(resid["C4'"].coord.tolist())
        if not coords:
            msg = f"Structure {structure.id} does not have any guide atoms."
            raise PDBException(msg)
        return coords

    def set_reference(self, structure):
        """Define a reference structure onto which all others will be aligned."""
        self.refcoord = self.get_guide_coord_from_structure(structure)

        if len(self.refcoord) < self.window_size * 2:
            n_atoms = len(self.refcoord)
            msg = (
                f"Too few atoms in the reference structure ({n_atoms}). "
                "Try reducing the window_size parameter."
            )
            raise PDBException(msg)

    def align(self, structure, transform=True, *, final_optimization=True):
        """Align the input structure onto the reference structure.

        Parameters
        ----------
        transform: bool, optional
            If True (default), apply the rotation/translation that minimizes
            the RMSD between the two structures to the input structure. If
            False, the structure is not modified but the optimal RMSD will
            still be calculated.
        final_optimization: bool, optional
            If True (default), apply additional optimization to statistically
            significant alignments.
        """
        self.rms = None  # clear before aligning
        self._rigid_motion = None

        coord = self.get_guide_coord_from_structure(structure)
        self._coord = coord

        if len(coord) < self.window_size * 2:
            n_atoms = len(coord)
            msg = (
                f"Too few atoms in the mobile structure ({n_atoms}). "
                "Try reducing the window_size parameter."
            )
            raise PDBException(msg)

        # Run CEAlign
        # CEAlign returns the best N paths, sorted descending by length,
        # where each path is a pair of lists with aligned atom indices.
        alignments = run_cealign(self.refcoord, coord, self.window_size, self.max_gap)
        longest_alignments = [
            alignment
            for alignment in alignments
            if alignment.length == alignments[0].length
        ]

        # Iterate over paths and find the one that gives the lowest
        # corresponding RMSD. Use QCP to align the molecules.
        self.rms = float("inf")
        superimposer = self._superimposer
        best_alignment = None
        for alignment in longest_alignments:
            idxA, idxB = alignment.path

            coordsA = np.array([self.refcoord[i] for i in idxA])
            coordsB = np.array([coord[i] for i in idxB])

            superimposer.set(coordsA, coordsB)
            superimposer.run()
            if superimposer.rms < self.rms:
                best_alignment = alignment
                self.rms = superimposer.rms
                self._rigid_motion = (superimposer.rot, superimposer.tran)

        if best_alignment is None:
            raise RuntimeError("Failed to find a suitable alignment.")

        # Gap optimization
        if final_optimization and best_alignment.z_score >= 3.5:
            self._optimize(best_alignment)

        if transform:
            # Transform all atoms
            rotmtx, trvec = self._rigid_motion
            for chain in structure.get_chains():
                for resid in chain.get_unpacked_list():
                    for atom in resid.get_unpacked_list():
                        atom.transform(rotmtx, trvec)

    def _optimize(self, alignment):
        best_path = alignment.path
        superimposer = self._superimposer

        for ab_index in [0, 1]:
            for index in range(1, len(best_path[ab_index]) - 1):
                best_shift = 0
                left = best_path[ab_index][index - 1]
                center = best_path[ab_index][index]
                right = best_path[ab_index][index + 1]

                for shift in range(
                    max(-self.window_size // 2, left - center + 1),
                    min(self.window_size // 2 + 1, right - center),
                ):
                    best_path[ab_index][index] += shift
                    idxA, idxB = best_path

                    coordsA = np.array([self.refcoord[i] for i in idxA])
                    coordsB = np.array([self._coord[i] for i in idxB])

                    superimposer.set(coordsA, coordsB)
                    superimposer.run()
                    best_path[ab_index][index] -= shift

                    if superimposer.rms < self.rms:
                        best_shift = shift
                        self.rms = superimposer.rms
                        self._rigid_motion = (superimposer.rot, superimposer.tran)

                best_path[ab_index][index] += best_shift