File: proteindatabank.py

package info (click to toggle)
python-ase 3.26.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 15,484 kB
  • sloc: python: 148,112; xml: 2,728; makefile: 110; javascript: 47
file content (244 lines) | stat: -rw-r--r-- 8,288 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# fmt: off

"""Module to read and write atoms in PDB file format.

See::

    http://www.wwpdb.org/documentation/file-format

Note: The PDB format saves cell lengths and angles; hence the absolute
orientation is lost when saving.  Saving and loading a file will
conserve the scaled positions, not the absolute ones.
"""

import warnings

import numpy as np

from ase.atoms import Atoms
from ase.cell import Cell
from ase.io.espresso import label_to_symbol
from ase.utils import reader, writer


def read_atom_line(line_full):
    """
    Read atom line from pdb format
    HETATM    1  H14 ORTE    0       6.301   0.693   1.919  1.00  0.00        H
    """
    line = line_full.rstrip('\n')
    type_atm = line[0:6]
    if type_atm == "ATOM  " or type_atm == "HETATM":

        name = line[12:16].strip()

        altloc = line[16]
        resname = line[17:21].strip()
        # chainid = line[21]        # Not used

        seq = line[22:26].split()
        if len(seq) == 0:
            resseq = 1
        else:
            resseq = int(seq[0])  # sequence identifier
        # icode = line[26]          # insertion code, not used

        # atomic coordinates
        try:
            coord = np.array([float(line[30:38]),
                              float(line[38:46]),
                              float(line[46:54])], dtype=np.float64)
        except ValueError:
            raise ValueError("Invalid or missing coordinate(s)")

        # occupancy & B factor
        try:
            occupancy = float(line[54:60])
        except ValueError:
            occupancy = None  # Rather than arbitrary zero or one

        if occupancy is not None and occupancy < 0:
            warnings.warn("Negative occupancy in one or more atoms")

        try:
            bfactor = float(line[60:66])
        except ValueError:
            # The PDB use a default of zero if the data is missing
            bfactor = 0.0

        # segid = line[72:76] # not used
        symbol = line[76:78].strip().upper()

    else:
        raise ValueError("Only ATOM and HETATM supported")

    return symbol, name, altloc, resname, coord, occupancy, bfactor, resseq


@reader
def read_proteindatabank(fileobj, index=-1, read_arrays=True):
    """Read PDB files."""
    images = []
    orig = np.identity(3)
    trans = np.zeros(3)
    occ = []
    bfactor = []
    residuenames = []
    residuenumbers = []
    atomtypes = []

    symbols = []
    positions = []
    cell = None
    pbc = None

    def build_atoms():
        atoms = Atoms(symbols=symbols,
                      cell=cell, pbc=pbc,
                      positions=positions)

        if not read_arrays:
            return atoms

        info = {'occupancy': occ,
                'bfactor': bfactor,
                'residuenames': residuenames,
                'atomtypes': atomtypes,
                'residuenumbers': residuenumbers}
        for name, array in info.items():
            if len(array) == 0:
                pass
            elif len(array) != len(atoms):
                warnings.warn('Length of {} array, {}, '
                              'different from number of atoms {}'.
                              format(name, len(array), len(atoms)))
            else:
                atoms.set_array(name, np.array(array))
        return atoms

    for line in fileobj.readlines():
        if line.startswith('CRYST1'):
            cellpar = [float(line[6:15]),  # a
                       float(line[15:24]),  # b
                       float(line[24:33]),  # c
                       float(line[33:40]),  # alpha
                       float(line[40:47]),  # beta
                       float(line[47:54])]  # gamma
            cell = Cell.new(cellpar)
            pbc = True
        for c in range(3):
            if line.startswith('ORIGX' + '123'[c]):
                orig[c] = [float(line[10:20]),
                           float(line[20:30]),
                           float(line[30:40])]
                trans[c] = float(line[45:55])

        if line.startswith('ATOM') or line.startswith('HETATM'):
            # Atom name is arbitrary and does not necessarily
            # contain the element symbol.  The specification
            # requires the element symbol to be in columns 77+78.
            # Fall back to Atom name for files that do not follow
            # the spec, e.g. packmol.

            # line_info = symbol, name, altloc, resname, coord, occupancy,
            #             bfactor, resseq
            line_info = read_atom_line(line)

            try:
                symbol = label_to_symbol(line_info[0])
            except (KeyError, IndexError):
                symbol = label_to_symbol(line_info[1])

            position = np.dot(orig, line_info[4]) + trans
            atomtypes.append(line_info[1])
            residuenames.append(line_info[3])
            if line_info[5] is not None:
                occ.append(line_info[5])
            bfactor.append(line_info[6])
            residuenumbers.append(line_info[7])

            symbols.append(symbol)
            positions.append(position)

        if line.startswith('END'):
            # End of configuration reached
            # According to the latest PDB file format (v3.30),
            # this line should start with 'ENDMDL' (not 'END'),
            # but in this way PDB trajectories from e.g. CP2K
            # are supported (also VMD supports this format).
            atoms = build_atoms()
            images.append(atoms)
            occ = []
            bfactor = []
            residuenames = []
            residuenumbers = []
            atomtypes = []
            symbols = []
            positions = []
            cell = None
            pbc = None

    if len(images) == 0:
        atoms = build_atoms()
        images.append(atoms)
    return images[index]


@writer
def write_proteindatabank(fileobj, images, write_arrays=True):
    """Write images to PDB-file."""
    rot_t = None
    if hasattr(images, 'get_positions'):
        images = [images]

    #     1234567 123 6789012345678901   89   67   456789012345678901234567 890
    format = ('ATOM  %5d %4s %4s %4d    %8.3f%8.3f%8.3f%6.2f%6.2f'
              '          %2s  \n')

    # RasMol complains if the atom index exceeds 100000. There might
    # be a limit of 5 digit numbers in this field.
    MAXNUM = 100000

    symbols = images[0].get_chemical_symbols()
    natoms = len(symbols)

    for n, atoms in enumerate(images):
        if atoms.get_pbc().any():
            currentcell = atoms.get_cell()
            cellpar = currentcell.cellpar()
            _, rot_t = currentcell.standard_form()
            # ignoring Z-value, using P1 since we have all atoms defined
            # explicitly
            cellformat = 'CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1\n'
            fileobj.write(cellformat % (cellpar[0], cellpar[1], cellpar[2],
                                        cellpar[3], cellpar[4], cellpar[5]))
        fileobj.write('MODEL     ' + str(n + 1) + '\n')
        p = atoms.get_positions()
        if rot_t is not None:
            p = p.dot(rot_t.T)
        occupancy = np.ones(len(atoms))
        bfactor = np.zeros(len(atoms))
        residuenames = ['MOL '] * len(atoms)
        residuenumbers = np.ones(len(atoms))
        names = atoms.get_chemical_symbols()
        if write_arrays:
            if 'occupancy' in atoms.arrays:
                occupancy = atoms.get_array('occupancy')
            if 'bfactor' in atoms.arrays:
                bfactor = atoms.get_array('bfactor')
            if 'residuenames' in atoms.arrays:
                residuenames = atoms.get_array('residuenames')
            if 'residuenumbers' in atoms.arrays:
                residuenumbers = atoms.get_array('residuenumbers')
            if 'atomtypes' in atoms.arrays:
                names = atoms.get_array('atomtypes')
        for a in range(natoms):
            x, y, z = p[a]
            occ = occupancy[a]
            bf = bfactor[a]
            resname = residuenames[a].ljust(4)
            resseq = residuenumbers[a]
            name = names[a]
            fileobj.write(format % ((a + 1) % MAXNUM, name, resname, resseq,
                                    x, y, z, occ, bf, symbols[a].upper()))
        fileobj.write('ENDMDL\n')