File: disulf.cpp

package info (click to toggle)
gemmi 0.7.4%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 6,644 kB
  • sloc: cpp: 64,445; python: 5,425; ansic: 4,545; sh: 374; makefile: 112; javascript: 86; f90: 42
file content (156 lines) | stat: -rw-r--r-- 5,663 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
// This is a manually run test. It searches for disulfide (SG-SG only)
// bonds in two ways:
// - using a simple search - find_disulfide_bonds1()
// - using cell-linked lists method from neighbor.hpp - find_disulfide_bonds2()
// and compares the results.
// It was run for all PDB entries to test the cell lists implementation.

#include <gemmi/neighbor.hpp>
#include <gemmi/contact.hpp>
#include <gemmi/model.hpp>
#include <gemmi/mmread_gz.hpp>
#include <gemmi/dirwalk.hpp>
#include <stdexcept>  // for runtime_error
#include <chrono>

static int verbose = false;
using namespace gemmi;

struct BondInfo {
  const_CRA cra1, cra2;
  int image_idx;
  double dist_sq;

  void print(const UnitCell& cell) const {
    NearestImage im = cell.find_nearest_pbc_image(cra1.atom->pos, cra2.atom->pos, image_idx);
    assert(fabs(dist_sq - im.dist_sq) < 1e-3);
    std::printf("%s - %s  im:%s  %.3f\n",
                atom_str(cra1).c_str(), atom_str(cra2).c_str(),
                im.symmetry_code(true).c_str(), std::sqrt(dist_sq));
  }

};

// Number of SG atoms is relatively small; checking all pairs should be fast.
std::vector<BondInfo> find_disulfide_bonds1(const Model& model,
                                              const UnitCell& cell) {
  const double max_dist = 3.0;
  // Find all SG sulfur atoms.
  std::vector<const_CRA> atoms;
  const std::string sg = "SG";
  for (const Chain& chain : model.chains)
    for (const Residue& res : chain.residues)
      for (const Atom& atom : res.atoms)
        if (atom.element == El::S && atom.name == sg)
          atoms.push_back(const_CRA{&chain, &res, &atom});
  // Check distances.
  std::vector<BondInfo> ret;
  for (size_t i = 0; i < atoms.size(); ++i) {
    const Atom* a1 = atoms[i].atom;
    for (size_t j = i; j < atoms.size(); ++j) {
      const Atom* a2 = atoms[j].atom;
      if (a1->same_conformer(*a2)) {
        Asu asu = (i != j ? Asu::Any : Asu::Different);
        NearestImage im = cell.find_nearest_image(a1->pos, a2->pos, asu);
        // if i == j and the image is nearby the atom is on special position
        if (im.dist_sq < max_dist * max_dist && (i != j || im.dist_sq > 1.0))
          ret.push_back({atoms[i], atoms[j], im.sym_idx, im.dist_sq});
      }
    }
  }
  return ret;
}

// use NeighborSearch (cell list method) to find disulfide SG-SG bonds
static std::vector<BondInfo> find_disulfide_bonds2(Model& model,
                                                   const UnitCell& cell) {
  const double max_dist = 3.0;
  const std::string sg = "SG";
  NeighborSearch ns(model, cell, 5.0);
#if 0
  ns.populate();
#else
  for (const Chain& chain : model.chains)
    for (const Residue& res : chain.residues)
      for (const Atom& atom : res.atoms)
        if (atom.element == El::S && atom.name == sg) {
          auto indices = model.get_indices(&chain, &res, &atom);
          ns.add_atom(atom, indices[0], indices[1], indices[2]);
        }
#endif
  std::vector<BondInfo> ret;
#if 0  // faster, but requires more code
  for (const Chain& chain : model.chains)
    for (const Residue& res : chain.residues)
      for (const Atom& atom : res.atoms)
        if (atom.element == El::S && atom.name == sg) {
          auto indices = model.get_indices(&chain, &res, &atom);
          ns.for_each(atom.pos, atom.altloc, max_dist,
                      [&](const NeighborSearch::Mark& m, double dist_sq) {
              if (m.element != El::S)
                return;
              if (indices[0] > m.chain_idx || (indices[0] == m.chain_idx &&
                    (indices[1] > m.residue_idx ||
                     (indices[1] == m.residue_idx && dist_sq < 1))))
                return;
              const_CRA cra1 = {&chain, &res, &atom};
              const_CRA cra2 = m.to_cra(model);
              if (cra2.atom->name == sg)
                ret.push_back({cra1, cra2, m.image_idx, dist_sq});
          });
        }
#else  // slower, but simpler
  ContactSearch contacts(max_dist);
  contacts.for_each_contact(ns, [&](const CRA& cra1, const CRA& cra2,
                                    int image_idx, double dist_sq) {
      if (cra1.atom->element == El::S && cra1.atom->name == sg &&
          cra2.atom->element == El::S && cra2.atom->name == sg)
        ret.push_back({cra1, cra2, image_idx, dist_sq});
  });
#endif
  return ret;
}

static void check_disulf(const std::string& path) {
  if (verbose)
    printf("path: %s\n", path.c_str());
  Structure st = read_structure_gz(path);
  Model& model = st.first_model();
  using Clock = std::chrono::steady_clock;
  auto start = Clock::now();
  std::vector<BondInfo> c1 = find_disulfide_bonds1(model, st.cell);
  std::chrono::duration<double> elapsed1 = Clock::now() - start;
  start = Clock::now();
  std::vector<BondInfo> c2 = find_disulfide_bonds2(model, st.cell);
  std::chrono::duration<double> elapsed2 = Clock::now() - start;
  printf("%10s  %zu %zu  (%.3g %.3g s)\n", st.name.c_str(),
         c1.size(), c2.size(), elapsed1.count(), elapsed2.count());
  if (c1.size() != c2.size() || verbose) {
    for (const BondInfo& bi : c1)
      bi.print(st.cell);
    printf("---\n");
    for (const BondInfo& bi : c2)
      bi.print(st.cell);
  }
}

int main(int argc, char* argv[]) {
  int pos = 1;
  while (pos < argc && argv[pos] == std::string("-v")) {
    ++pos;
    verbose = true;
  }
  if (pos >= argc) {
    std::fprintf(stderr, "No paths given.\n");
    return 1;
  }
  try {
    for (; pos != argc; ++pos)
      for (std::string path : CoorFileWalk(argv[pos], 'M'))
        check_disulf(path);
  } catch (std::runtime_error& err) {
    std::fprintf(stderr, "Error: %s\n", err.what());
    return 1;
  }
  return 0;
}