File: MMCIFParser.py

package info (click to toggle)
python-biopython 1.64%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 44,416 kB
  • ctags: 12,472
  • sloc: python: 153,759; xml: 67,286; ansic: 9,003; sql: 1,488; makefile: 144; sh: 59
file content (184 lines) | stat: -rw-r--r-- 7,535 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk)
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""mmCIF parser"""

from __future__ import print_function

from string import ascii_letters

import numpy

from Bio._py3k import range

from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from Bio.PDB.StructureBuilder import StructureBuilder
from Bio.PDB.PDBExceptions import PDBConstructionException


class MMCIFParser(object):
    def get_structure(self, structure_id, filename):
        self._mmcif_dict=MMCIF2Dict(filename)
        self._structure_builder=StructureBuilder()
        self._build_structure(structure_id)
        return self._structure_builder.get_structure()

    def _build_structure(self, structure_id):
        mmcif_dict=self._mmcif_dict
        atom_id_list=mmcif_dict["_atom_site.label_atom_id"]
        residue_id_list=mmcif_dict["_atom_site.label_comp_id"]
        try:
            element_list = mmcif_dict["_atom_site.type_symbol"]
        except KeyError:
            element_list = None
        seq_id_list=mmcif_dict["_atom_site.label_seq_id"]
        chain_id_list=mmcif_dict["_atom_site.label_asym_id"]
        x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]]
        y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]]
        z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]]
        alt_list=mmcif_dict["_atom_site.label_alt_id"]
        b_factor_list=mmcif_dict["_atom_site.B_iso_or_equiv"]
        occupancy_list=mmcif_dict["_atom_site.occupancy"]
        fieldname_list=mmcif_dict["_atom_site.group_PDB"]
        try:
            serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]]
        except KeyError:
            # No model number column
            serial_list = None
        except ValueError:
            # Invalid model number (malformed file)
            raise PDBConstructionException("Invalid model number")
        try:
            aniso_u11=mmcif_dict["_atom_site.aniso_U[1][1]"]
            aniso_u12=mmcif_dict["_atom_site.aniso_U[1][2]"]
            aniso_u13=mmcif_dict["_atom_site.aniso_U[1][3]"]
            aniso_u22=mmcif_dict["_atom_site.aniso_U[2][2]"]
            aniso_u23=mmcif_dict["_atom_site.aniso_U[2][3]"]
            aniso_u33=mmcif_dict["_atom_site.aniso_U[3][3]"]
            aniso_flag=1
        except KeyError:
            # no anisotropic B factors
            aniso_flag=0
        # if auth_seq_id is present, we use this.
        # Otherwise label_seq_id is used.
        if "_atom_site.auth_seq_id" in mmcif_dict:
            seq_id_list=mmcif_dict["_atom_site.auth_seq_id"]
        else:
            seq_id_list=mmcif_dict["_atom_site.label_seq_id"]
        # Now loop over atoms and build the structure
        current_chain_id=None
        current_residue_id=None
        structure_builder=self._structure_builder
        structure_builder.init_structure(structure_id)
        structure_builder.init_seg(" ")
        # Historically, Biopython PDB parser uses model_id to mean array index
        # so serial_id means the Model ID specified in the file
        current_model_id = 0
        current_serial_id = 0
        for i in range(0, len(atom_id_list)):
            x=x_list[i]
            y=y_list[i]
            z=z_list[i]
            resname=residue_id_list[i]
            chainid=chain_id_list[i]
            altloc=alt_list[i]
            if altloc==".":
                altloc=" "
            resseq=seq_id_list[i]
            name=atom_id_list[i]
            # occupancy & B factor
            try:
                tempfactor=float(b_factor_list[i])
            except ValueError:
                raise PDBConstructionException("Invalid or missing B factor")
            try:
                occupancy=float(occupancy_list[i])
            except ValueError:
                raise PDBConstructionException("Invalid or missing occupancy")
            fieldname=fieldname_list[i]
            if fieldname=="HETATM":
                hetatm_flag="H"
            else:
                hetatm_flag=" "
            if serial_list is not None:
                # model column exists; use it
                serial_id = serial_list[i]
                if current_serial_id != serial_id:
                    # if serial changes, update it and start new model
                    current_serial_id = serial_id
                    structure_builder.init_model(current_model_id, current_serial_id)
                    current_model_id += 1
            else:
                # no explicit model column; initialize single model
                structure_builder.init_model(current_model_id)
            if current_chain_id!=chainid:
                current_chain_id=chainid
                structure_builder.init_chain(current_chain_id)
                current_residue_id=resseq
                icode, int_resseq=self._get_icode(resseq)
                structure_builder.init_residue(resname, hetatm_flag, int_resseq,
                    icode)
            elif current_residue_id!=resseq:
                current_residue_id=resseq
                icode, int_resseq=self._get_icode(resseq)
                structure_builder.init_residue(resname, hetatm_flag, int_resseq,
                    icode)
            coord=numpy.array((x, y, z), 'f')
            element = element_list[i] if element_list else None
            structure_builder.init_atom(name, coord, tempfactor, occupancy, altloc,
                name, element=element)
            if aniso_flag==1:
                u=(aniso_u11[i], aniso_u12[i], aniso_u13[i],
                    aniso_u22[i], aniso_u23[i], aniso_u33[i])
                mapped_anisou = [float(x) for x in u]
                anisou_array=numpy.array(mapped_anisou, 'f')
                structure_builder.set_anisou(anisou_array)
        # Now try to set the cell
        try:
            a=float(mmcif_dict["_cell.length_a"])
            b=float(mmcif_dict["_cell.length_b"])
            c=float(mmcif_dict["_cell.length_c"])
            alpha=float(mmcif_dict["_cell.angle_alpha"])
            beta=float(mmcif_dict["_cell.angle_beta"])
            gamma=float(mmcif_dict["_cell.angle_gamma"])
            cell=numpy.array((a, b, c, alpha, beta, gamma), 'f')
            spacegroup=mmcif_dict["_symmetry.space_group_name_H-M"]
            spacegroup=spacegroup[1:-1] # get rid of quotes!!
            if spacegroup is None:
                raise Exception
            structure_builder.set_symmetry(spacegroup, cell)
        except:
            pass    # no cell found, so just ignore

    def _get_icode(self, resseq):
        """Tries to return the icode. In MMCIF files this is just part of
        resseq! In PDB files, it's a separate field."""
        last_resseq_char=resseq[-1]
        if last_resseq_char in ascii_letters:
            icode=last_resseq_char
            int_resseq=int(resseq[0:-1])
        else:
            icode=" "
            int_resseq=int(resseq)
        return icode, int_resseq


if __name__=="__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: python MMCIFparser.py filename")
        raise SystemExit
    filename=sys.argv[1]

    p=MMCIFParser()

    structure=p.get_structure("test", filename)

    for model in structure.get_list():
        print(model)
        for chain in model.get_list():
            print(chain)
            print("Found %d residues." % len(chain.get_list()))