File: getresdata.py

package info (click to toggle)
avogadrolibs 1.100.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 77,996 kB
  • sloc: cpp: 127,202; ansic: 2,212; python: 1,433; perl: 321; sh: 83; makefile: 43
file content (216 lines) | stat: -rw-r--r-- 5,683 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#!/usr/bin/env python

from __future__ import print_function

import os
import sys

import requests

from openbabel import pybel
from openbabel import openbabel as ob

# TODO: process Open Babel resdata.txt
#   if we can find certain non-standard residues
mdLigands = [
"ASH", # Neutral ASP
"CYX", # SS-bonded CYS
"CYM", # Negative CYS
"GLH", # Neutral GLU
"HIP", # Positive HIS
"HID", # Neutral HIS, proton HD1 present
"HIE", # Neutral HIS, proton HE2 present
"LYN", # Neutral LYS
"TYM", # Negative TYR
]

# the location of the LigandExpo list by count
ligandURL = "http://ligand-expo.rcsb.org/dictionaries/cc-counts.tdd"
# URL for the ideal geometry
# e.g http://ligand-expo.rcsb.org/reports/H/HEM/HEM_ideal.pdb
sdfTemplate = "http://ligand-expo.rcsb.org/reports/{}/{}/{}_ideal.sdf"
# URL for the ideal geometry (PDB)
pdbTemplate = "http://ligand-expo.rcsb.org/reports/{}/{}/{}_ideal.pdb"
# save ligands with at least this # of occurrences

ligandThresh = 500

# default ligand list
ligands = [
# amino acids
"ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", "LEU",
"MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", "TRP", "TYR",
# DNA nucleic
"DA", "DC", "DG", "DT", "DI",
# RNA nucleic
"A", "C", "G", "U", "I",
# misc
"HEM", "HOH"
]

# okay, we build up the list of ligands to fetch
r = requests.get(ligandURL, stream=True)
for line in r.iter_lines(decode_unicode=True):
    if 'count' in str(line):
        continue # skip first line

    name, count = line.split()
    if (int(count) < ligandThresh):
        # too rare, we'll skip the rest of the list
        break
    if str(name) not in ligands:
        ligands.append(str(name))

print(
'''
#ifndef AVOGADRO_CORE_RESIDUE_DATA
#define AVOGADRO_CORE_RESIDUE_DATA

#include <map>
#include <string>
#include <vector>
namespace Avogadro {
namespace Core {

class ResidueData
{
private:
  std::string m_residueName;
  std::map<std::string, int> m_residueAtomNames;
  std::vector<std::pair<std::string, std::string>> m_residueSingleBonds;
  std::vector<std::pair<std::string, std::string>> m_residueDoubleBonds;

public:
  ResidueData() {}
  ResidueData(std::string name,
              std::map<std::string, int> atomNames,
              std::vector<std::pair<std::string, std::string>> singleBonds,
              std::vector<std::pair<std::string, std::string>> doubleBonds)
  {
    m_residueName = name;
    m_residueAtomNames = atomNames;
    m_residueSingleBonds = singleBonds;
    m_residueDoubleBonds = doubleBonds;
  }

  ResidueData(const ResidueData& other)
  {
    m_residueName = other.m_residueName;
    m_residueAtomNames = other.m_residueAtomNames;
    m_residueSingleBonds = other.m_residueSingleBonds;
    m_residueDoubleBonds = other.m_residueDoubleBonds;
  }

  ResidueData& operator=(ResidueData other)
  {
    using std::swap;
    swap(*this, other);
    return *this;
  }

  std::map<std::string, int> residueAtoms() {
    return m_residueAtomNames;
  }

  std::vector<std::pair<std::string, std::string>> residueSingleBonds()
  {
    return m_residueSingleBonds;
  }

  std::vector<std::pair<std::string, std::string>> residueDoubleBonds()
  {
    return m_residueDoubleBonds;
  }
};
'''
)

final_ligands = []
for ligand in ligands:
    sdf = requests.get(sdfTemplate.format(ligand[0], ligand, ligand))
    # there *must* be a way to do this from a requests buffer, but this works
    with open('temp.sdf', 'wb') as handle:
        for block in sdf.iter_content(1024):
            handle.write(block)

    try:
      mol_sdf = next(pybel.readfile("sdf", 'temp.sdf'))
    except StopIteration:
      continue

    if len(mol_sdf.atoms) < 2:
        continue
    final_ligands.append(ligand)

    pdb = requests.get(pdbTemplate.format(ligand[0], ligand, ligand))
    with open('temp.pdb', 'wb') as handle:
        for block in pdb.iter_content(1024):
            handle.write(block)

    try:
      mol_pdb = next(pybel.readfile("pdb", 'temp.pdb'))
    except StopIteration:
      continue
    
    atom_map = {}
    for i in range(len(mol_sdf.atoms)):
        idx = mol_sdf.atoms[i].idx
        atom = mol_pdb.atoms[i].OBAtom
        res = atom.GetResidue()
        # build up a map between atom index and atom ID
        atom_map[idx] = res.GetAtomID(atom).strip().rstrip(), atom.GetAtomicNum()

    # go through bonds
    single_bonds = []
    double_bonds = []
    for bond in ob.OBMolBondIter(mol_sdf.OBMol):
        begin = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()
        if bond.GetBondOrder() == 2:
            double_bonds.append((atom_map[begin][0], atom_map[end][0]))
        elif bond.GetBondOrder() == 1:
            single_bonds.append((atom_map[begin][0], atom_map[end][0]))

    # print out the residue data
    print('ResidueData %sData("%s",' % (ligand, ligand))
    print('// Atoms')
    print('{')
    for atom in list(atom_map.values())[:-1]:
        print('{ "%s", %d },' % (atom[0], atom[1]), end='')
    print('{"%s", %d }' % (atom[0], atom[1]))
    print('},')

    print('// Single Bonds')
    print('{')
    for bond in single_bonds[:-1]:
        print('{ "%s", "%s" },' % bond, end='')
    print('{ "%s", "%s" }' % single_bonds[-1])
    print('},')

    print('// Double Bonds')
    print('{')
    if len(double_bonds):
        for bond in double_bonds[:-1]:
            print('{ "%s", "%s" },' % bond, end='')
        print('{ "%s", "%s" }' % double_bonds[-1])

    print('}')

    print(');')

print('''std::map<std::string, ResidueData> residueDict = {''')

# print the list of ligands
for ligand in final_ligands:
    print('{ "%s", %sData },' % (ligand, ligand))

print('''
};
}
}

#endif
'''
)
os.remove("temp.sdf")
os.remove('temp.pdb')