File: rdRascalMCES.cpp

package info (click to toggle)
rdkit 202503.1-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 220,160 kB
  • sloc: cpp: 399,240; python: 77,453; ansic: 25,517; java: 8,173; javascript: 4,005; sql: 2,389; yacc: 1,565; lex: 1,263; cs: 1,081; makefile: 580; xml: 229; fortran: 183; sh: 105
file content (306 lines) | stat: -rw-r--r-- 14,239 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
//
// Copyright (C) David Cosgrove 2023
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//

#include <RDBoost/python.h>
#include <RDBoost/Wrap.h>

#include <GraphMol/ROMol.h>
#include <GraphMol/RascalMCES/RascalMCES.h>
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
#include <GraphMol/RascalMCES/RascalOptions.h>
#include <GraphMol/RascalMCES/RascalResult.h>

namespace python = boost::python;

namespace {

python::list convertVecPairInt(const std::vector<std::pair<int, int>> &vec) {
  python::list pyres;
  for (const auto &p : vec) {
    python::tuple tup = python::make_tuple(p.first, p.second);
    pyres.append(tup);
  }
  return pyres;
}

python::list bondMatches(const RDKit::RascalMCES::RascalResult &self) {
  return convertVecPairInt(self.getBondMatches());
}
python::list atomMatches(const RDKit::RascalMCES::RascalResult &self) {
  return convertVecPairInt(self.getAtomMatches());
}

void largestFragmentOnly(RDKit::RascalMCES::RascalResult &self) {
  self.largestFragOnly();
}

struct RascalResult_wrapper {
  static void wrap() {
    std::string docString = "Used to return RASCAL MCES results.";
    python::class_<RDKit::RascalMCES::RascalResult>(
        "RascalResult", docString.c_str(), python::no_init)
        .def_readonly("smartsString",
                      &RDKit::RascalMCES::RascalResult::getSmarts,
                      "SMARTS string defining the MCES.")
        .def("bondMatches", bondMatches, python::args("self"),
             "A function returning a list of list "
             "of tuples, each inner list containing the matching bonds in the "
             "MCES as tuples of bond indices from mol1 and mol2")
        .def("atomMatches", atomMatches, python::args("self"),
             "Likewise for atoms.")
        .def(
            "largestFragmentOnly", largestFragmentOnly, python::args("self"),
            "Function that cuts the MCES down to the single largest frag.  This cannot be undone.")
        .def_readonly("similarity",
                      &RDKit::RascalMCES::RascalResult::getSimilarity,
                      "Johnson similarity between 2 molecules.")
        .def_readonly("numFragments",
                      &RDKit::RascalMCES::RascalResult::getNumFrags,
                      "Number of fragments in MCES.")
        .def_readonly("largestFragmentSize",
                      &RDKit::RascalMCES::RascalResult::getLargestFragSize,
                      "Number of atoms in largest fragment.")
        .def_readonly("tier1Sim", &RDKit::RascalMCES::RascalResult::getTier1Sim,
                      "The tier 1 similarity estimate.")
        .def_readonly("tier2Sim", &RDKit::RascalMCES::RascalResult::getTier2Sim,
                      "The tier 2 similarity estimate.")
        .def_readonly("timedOut", &RDKit::RascalMCES::RascalResult::getTimedOut,
                      "Whether it timed out.");
  }
};
}  // namespace

namespace RDKit {

python::list findMCESWrapper(const ROMol &mol1, const ROMol &mol2,
                             const python::object &py_opts) {
  RascalMCES::RascalOptions opts;
  if (!py_opts.is_none()) {
    opts = python::extract<RascalMCES::RascalOptions>(py_opts);
  }
  std::vector<RDKit::RascalMCES::RascalResult> results;
  {
    NOGIL gil;
    results = RascalMCES::rascalMCES(mol1, mol2, opts);
  }
  python::list pyres;
  for (auto &res : results) {
    pyres.append(res);
  }
  return pyres;
}

std::vector<std::shared_ptr<ROMol>> extractMols(python::object mols) {
  std::vector<std::shared_ptr<ROMol>> cmols;
  unsigned int nElems = python::extract<unsigned int>(mols.attr("__len__")());
  cmols.resize(nElems);
  for (unsigned int i = 0; i < nElems; ++i) {
    if (!mols[i]) {
      throw_value_error("molecule is None");
    }
    cmols[i] = python::extract<std::shared_ptr<ROMol>>(mols[i]);
  }
  return cmols;
}

python::list packOutputMols(
    const std::vector<std::vector<unsigned int>> &clusters) {
  python::list pyres;
  for (auto &clus : clusters) {
    python::list mols;
    for (auto &m : clus) {
      mols.append(m);
    }
    pyres.append(mols);
  }
  return pyres;
}

python::list rascalClusterWrapper(python::object mols,
                                  const python::object &py_opts) {
  RascalMCES::RascalClusterOptions opts;
  if (!py_opts.is_none()) {
    opts = python::extract<RascalMCES::RascalClusterOptions>(py_opts);
  }
  auto cmols = extractMols(mols);
  std::vector<RDKit::UINT_VECT> clusters;
  {
    NOGIL gil;
    clusters = RascalMCES::rascalCluster(cmols, opts);
  }
  return packOutputMols(clusters);
}

python::list rascalButinaClusterWrapper(python::object mols,
                                        const python::object &py_opts) {
  RascalMCES::RascalClusterOptions opts;
  if (!py_opts.is_none()) {
    opts = python::extract<RascalMCES::RascalClusterOptions>(py_opts);
  }
  auto cmols = extractMols(mols);
  std::vector<RDKit::UINT_VECT> clusters;
  {
    NOGIL gil;
    clusters = RascalMCES::rascalButinaCluster(cmols, opts);
  }
  return packOutputMols(clusters);
}

BOOST_PYTHON_MODULE(rdRascalMCES) {
  python::scope().attr("__doc__") =
      "Module containing implementation of RASCAL Maximum Common Edge Substructure algorithm.";
  RascalResult_wrapper::wrap();

  std::string docString = "RASCAL Options";
  python::class_<RDKit::RascalMCES::RascalOptions, boost::noncopyable>(
      "RascalOptions", docString.c_str())
      .def_readwrite(
          "similarityThreshold",
          &RDKit::RascalMCES::RascalOptions::similarityThreshold,
          "Threshold below which MCES won't be run.  Between 0.0 and 1.0, default=0.7.")
      .def_readwrite(
          "singleLargestFrag",
          &RDKit::RascalMCES::RascalOptions::singleLargestFrag,
          "Return the just single largest fragment of the MCES.  This is equivalent to running with allBestMCEs=True, finding the result with the largest largestFragmentSize, and calling its largestFragmentOnly method.")
      .def_readwrite(
          "completeAromaticRings",
          &RDKit::RascalMCES::RascalOptions::completeAromaticRings,
          "If True (default), partial aromatic rings won't be returned.")
      .def_readwrite("ringMatchesRingOnly",
                     &RDKit::RascalMCES::RascalOptions::ringMatchesRingOnly,
                     "If True (default is False), ring bonds won't match non-ring bonds.")
      .def_readwrite("completeSmallestRings",
                     &RDKit::RascalMCES::RascalOptions::completeSmallestRings,
                     "If True (default is False), only complete rings present in both input molecule's RingInfo will be returned. Implies completeAromaticRings and ringMatchesRingOnly.")
      .def_readwrite(
          "exactConnectionsMatch",
          &RDKit::RascalMCES::RascalOptions::exactConnectionsMatch,
          "If True (default is False), atoms will only match atoms if they have the same\n"
          " number of explicit connections.  E.g. the central atom of\n"
          " C(C)(C) won't match either atom in CC")
      .def_readwrite(
          "minFragSize", &RDKit::RascalMCES::RascalOptions::minFragSize,
          "Imposes a minimum on the number of atoms in a fragment that may be part of the MCES.  Default -1 means no minimum.")
      .def_readwrite(
          "maxFragSeparation",
          &RDKit::RascalMCES::RascalOptions::maxFragSeparation,
          "Maximum number of bonds between fragments in the MCES for both to be reported.  Default -1 means no maximum.  If exceeded, the smaller fragment will be removed.")
      .def_readwrite(
          "allBestMCESs", &RDKit::RascalMCES::RascalOptions::allBestMCESs,
          "If True, reports all MCESs found of the same maximum size.  Default False means just report the first found.")
      .def_readwrite(
          "returnEmptyMCES", &RDKit::RascalMCES::RascalOptions::returnEmptyMCES,
          "If the estimated similarity between the 2 molecules doesn't meet the similarityThreshold, no results are returned.  If you want to know what the"
          " estimates were, set this to True, and examine the tier1Sim and tier2Sim properties of the result then returned.")
      .def_readwrite(
          "timeout", &RDKit::RascalMCES::RascalOptions::timeout,
          "Maximum time (in seconds) to spend on an individual MCESs determination.  Default 60, -1 means no limit.")
      .def_readwrite(
          "maxBondMatchPairs",
          &RDKit::RascalMCES::RascalOptions::maxBondMatchPairs,
          "Too many matching bond (vertex) pairs can cause the process to run out of memory."
          "  The default of 1000 is fairly safe.  Increase with caution, as memory use increases"
          " with the square of this number.  ")
      .def_readwrite("equivalentAtoms",
                     &RDKit::RascalMCES::RascalOptions::equivalentAtoms,
                     "SMARTS strings defining atoms that should"
                     "be considered equivalent. e.g."
                     "[F,Cl,Br,I] so all halogens will match each other."
                     "Space-separated list allowing more than 1"
                     "class of equivalent atoms.")
      .def_readwrite("ignoreBondOrders",
                     &RDKit::RascalMCES::RascalOptions::ignoreBondOrders,
                     "If True, will treat all bonds as the same,"
                     " irrespective of order.  Default=False.")
      .def_readwrite("ignoreAtomAromaticity",
                     &RDKit::RascalMCES::RascalOptions::ignoreAtomAromaticity,
                     "If True, matches atoms solely on atomic number."
                     "  If False, will treat aromatic and aliphatic atoms"
                     " as different.  Default=True.")
      .def_readwrite("minCliqueSize",
                     &RDKit::RascalMCES::RascalOptions::minCliqueSize,
                     "Normally, the minimum clique size is specified"
                     " via the similarityThreshold.  Sometimes it's"
                     " more convenient to specify it directly.  If this"
                     " is > 0, it will over-ride the"
                     " similarityThreshold."
                     "  Note that this refers to the"
                     " minimum number of BONDS in the MCES. Default=0.");

  docString =
      "Find one or more MCESs between the 2 molecules given.  Returns a list of "
      "RascalResult objects."
      "- mol1"
      "- mol2 The two molecules for which to find the MCES"
      "- opts Optional RascalOptions object changing the default run mode."
      "";
  python::def("FindMCES", &RDKit::findMCESWrapper,
              (python::arg("mol1"), python::arg("mol2"),
               python::arg("opts") = python::object()),
              docString.c_str());

  docString =
      "RASCAL Cluster Options.  Most of these pertain to RascalCluster calculations.  Only similarityCutoff is used by RascalButinaCluster.";
  python::class_<RDKit::RascalMCES::RascalClusterOptions, boost::noncopyable>(
      "RascalClusterOptions", docString.c_str())
      .def_readwrite(
          "similarityCutoff",
          &RDKit::RascalMCES::RascalClusterOptions::similarityCutoff,
          "Similarity cutoff for molecules to be in the same cluster.  Between 0.0 and 1.0, default=0.7.")
      .def_readwrite(
          "minFragSize", &RDKit::RascalMCES::RascalClusterOptions::minFragSize,
          "The minimum number of atoms in a fragment for it to be included in the MCES.  Default=3.")
      .def_readwrite(
          "maxNumFrags", &RDKit::RascalMCES::RascalClusterOptions::maxNumFrags,
          "The maximum number of fragments allowed in the MCES for each pair of molecules. Default=2.  So that the MCES"
          " isn't a lot of small fragments scattered around the molecules giving an inflated estimate of similarity.")
      .def_readwrite(
          "numThreads", &RDKit::RascalMCES::RascalClusterOptions::numThreads,
          "Number of threads to use during clustering.  Default=-1 means all the hardware threads less one.")
      .def_readwrite(
          "a", &RDKit::RascalMCES::RascalClusterOptions::a,
          "The penalty score for each unconnected component in the MCES. Default=0.05.")
      .def_readwrite(
          "b", &RDKit::RascalMCES::RascalClusterOptions::a,
          "The weight of matched bonds over matched atoms. Default=2.")
      .def_readwrite(
          "minIntraClusterSim",
          &RDKit::RascalMCES::RascalClusterOptions::minIntraClusterSim,
          "Two pairs of molecules are included in the same cluster if the similarity between"
          " their MCESs is greater than this.  Default=0.9.")
      .def_readwrite(
          "clusterMergeSim",
          &RDKit::RascalMCES::RascalClusterOptions::clusterMergeSim,
          "Two clusters are merged if the fraction of molecules they have in common is greater than this.  Default=0.6.");

  docString =
      "Use the RASCAL MCES similarity metric to do fuzzy clustering.  Returns a list of lists "
      "of molecules, each inner list being a cluster.  The last cluster is all the "
      "molecules that didn't fit into another cluster (the singletons)."
      "- mols List of molecules to be clustered"
      "- opts Optional RascalOptions object changing the default run mode."
      "";
  python::def("RascalCluster", &RDKit::rascalClusterWrapper,
              (python::arg("mols"), python::arg("opts") = python::object()),
              docString.c_str());
  docString =
      "Use the RASCAL MCES similarity metric to do Butina clustering"
      " (Butina JCICS 39 747-750 (1999)).  Returns a list of lists of molecules,"
      " each inner list being a cluster.  The last cluster is all the"
      " molecules that didn't fit into another cluster (the singletons)."
      "- mols List of molecules to be clustered"
      "- opts Optional RascalOptions object changing the default run mode."
      "";
  python::def("RascalButinaCluster", &RDKit::rascalButinaClusterWrapper,
              (python::arg("mols"), python::arg("opts") = python::object()),
              docString.c_str());
}

}  // namespace RDKit