1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
|
//
// Copyright (C) David Cosgrove 2023
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDBoost/python.h>
#include <RDBoost/Wrap.h>
#include <GraphMol/ROMol.h>
#include <GraphMol/RascalMCES/RascalMCES.h>
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
#include <GraphMol/RascalMCES/RascalOptions.h>
#include <GraphMol/RascalMCES/RascalResult.h>
namespace python = boost::python;
namespace {
python::list convertVecPairInt(const std::vector<std::pair<int, int>> &vec) {
python::list pyres;
for (const auto &p : vec) {
python::tuple tup = python::make_tuple(p.first, p.second);
pyres.append(tup);
}
return pyres;
}
python::list bondMatches(const RDKit::RascalMCES::RascalResult &self) {
return convertVecPairInt(self.getBondMatches());
}
python::list atomMatches(const RDKit::RascalMCES::RascalResult &self) {
return convertVecPairInt(self.getAtomMatches());
}
void largestFragmentOnly(RDKit::RascalMCES::RascalResult &self) {
self.largestFragOnly();
}
struct RascalResult_wrapper {
static void wrap() {
std::string docString = "Used to return RASCAL MCES results.";
python::class_<RDKit::RascalMCES::RascalResult>(
"RascalResult", docString.c_str(), python::no_init)
.def_readonly("smartsString",
&RDKit::RascalMCES::RascalResult::getSmarts,
"SMARTS string defining the MCES.")
.def("bondMatches", bondMatches, python::args("self"),
"A function returning a list of list "
"of tuples, each inner list containing the matching bonds in the "
"MCES as tuples of bond indices from mol1 and mol2")
.def("atomMatches", atomMatches, python::args("self"),
"Likewise for atoms.")
.def(
"largestFragmentOnly", largestFragmentOnly, python::args("self"),
"Function that cuts the MCES down to the single largest frag. This cannot be undone.")
.def_readonly("similarity",
&RDKit::RascalMCES::RascalResult::getSimilarity,
"Johnson similarity between 2 molecules.")
.def_readonly("numFragments",
&RDKit::RascalMCES::RascalResult::getNumFrags,
"Number of fragments in MCES.")
.def_readonly("largestFragmentSize",
&RDKit::RascalMCES::RascalResult::getLargestFragSize,
"Number of atoms in largest fragment.")
.def_readonly("tier1Sim", &RDKit::RascalMCES::RascalResult::getTier1Sim,
"The tier 1 similarity estimate.")
.def_readonly("tier2Sim", &RDKit::RascalMCES::RascalResult::getTier2Sim,
"The tier 2 similarity estimate.")
.def_readonly("timedOut", &RDKit::RascalMCES::RascalResult::getTimedOut,
"Whether it timed out.");
}
};
} // namespace
namespace RDKit {
python::list findMCESWrapper(const ROMol &mol1, const ROMol &mol2,
const python::object &py_opts) {
RascalMCES::RascalOptions opts;
if (!py_opts.is_none()) {
opts = python::extract<RascalMCES::RascalOptions>(py_opts);
}
std::vector<RDKit::RascalMCES::RascalResult> results;
{
NOGIL gil;
results = RascalMCES::rascalMCES(mol1, mol2, opts);
}
python::list pyres;
for (auto &res : results) {
pyres.append(res);
}
return pyres;
}
std::vector<std::shared_ptr<ROMol>> extractMols(python::object mols) {
std::vector<std::shared_ptr<ROMol>> cmols;
unsigned int nElems = python::extract<unsigned int>(mols.attr("__len__")());
cmols.resize(nElems);
for (unsigned int i = 0; i < nElems; ++i) {
if (!mols[i]) {
throw_value_error("molecule is None");
}
cmols[i] = python::extract<std::shared_ptr<ROMol>>(mols[i]);
}
return cmols;
}
python::list packOutputMols(
const std::vector<std::vector<unsigned int>> &clusters) {
python::list pyres;
for (auto &clus : clusters) {
python::list mols;
for (auto &m : clus) {
mols.append(m);
}
pyres.append(mols);
}
return pyres;
}
python::list rascalClusterWrapper(python::object mols,
const python::object &py_opts) {
RascalMCES::RascalClusterOptions opts;
if (!py_opts.is_none()) {
opts = python::extract<RascalMCES::RascalClusterOptions>(py_opts);
}
auto cmols = extractMols(mols);
std::vector<RDKit::UINT_VECT> clusters;
{
NOGIL gil;
clusters = RascalMCES::rascalCluster(cmols, opts);
}
return packOutputMols(clusters);
}
python::list rascalButinaClusterWrapper(python::object mols,
const python::object &py_opts) {
RascalMCES::RascalClusterOptions opts;
if (!py_opts.is_none()) {
opts = python::extract<RascalMCES::RascalClusterOptions>(py_opts);
}
auto cmols = extractMols(mols);
std::vector<RDKit::UINT_VECT> clusters;
{
NOGIL gil;
clusters = RascalMCES::rascalButinaCluster(cmols, opts);
}
return packOutputMols(clusters);
}
BOOST_PYTHON_MODULE(rdRascalMCES) {
python::scope().attr("__doc__") =
"Module containing implementation of RASCAL Maximum Common Edge Substructure algorithm.";
RascalResult_wrapper::wrap();
std::string docString = "RASCAL Options";
python::class_<RDKit::RascalMCES::RascalOptions, boost::noncopyable>(
"RascalOptions", docString.c_str())
.def_readwrite(
"similarityThreshold",
&RDKit::RascalMCES::RascalOptions::similarityThreshold,
"Threshold below which MCES won't be run. Between 0.0 and 1.0, default=0.7.")
.def_readwrite(
"singleLargestFrag",
&RDKit::RascalMCES::RascalOptions::singleLargestFrag,
"Return the just single largest fragment of the MCES. This is equivalent to running with allBestMCEs=True, finding the result with the largest largestFragmentSize, and calling its largestFragmentOnly method.")
.def_readwrite(
"completeAromaticRings",
&RDKit::RascalMCES::RascalOptions::completeAromaticRings,
"If True (default), partial aromatic rings won't be returned.")
.def_readwrite("ringMatchesRingOnly",
&RDKit::RascalMCES::RascalOptions::ringMatchesRingOnly,
"If True (default is False), ring bonds won't match non-ring bonds.")
.def_readwrite("completeSmallestRings",
&RDKit::RascalMCES::RascalOptions::completeSmallestRings,
"If True (default is False), only complete rings present in both input molecule's RingInfo will be returned. Implies completeAromaticRings and ringMatchesRingOnly.")
.def_readwrite(
"exactConnectionsMatch",
&RDKit::RascalMCES::RascalOptions::exactConnectionsMatch,
"If True (default is False), atoms will only match atoms if they have the same\n"
" number of explicit connections. E.g. the central atom of\n"
" C(C)(C) won't match either atom in CC")
.def_readwrite(
"minFragSize", &RDKit::RascalMCES::RascalOptions::minFragSize,
"Imposes a minimum on the number of atoms in a fragment that may be part of the MCES. Default -1 means no minimum.")
.def_readwrite(
"maxFragSeparation",
&RDKit::RascalMCES::RascalOptions::maxFragSeparation,
"Maximum number of bonds between fragments in the MCES for both to be reported. Default -1 means no maximum. If exceeded, the smaller fragment will be removed.")
.def_readwrite(
"allBestMCESs", &RDKit::RascalMCES::RascalOptions::allBestMCESs,
"If True, reports all MCESs found of the same maximum size. Default False means just report the first found.")
.def_readwrite(
"returnEmptyMCES", &RDKit::RascalMCES::RascalOptions::returnEmptyMCES,
"If the estimated similarity between the 2 molecules doesn't meet the similarityThreshold, no results are returned. If you want to know what the"
" estimates were, set this to True, and examine the tier1Sim and tier2Sim properties of the result then returned.")
.def_readwrite(
"timeout", &RDKit::RascalMCES::RascalOptions::timeout,
"Maximum time (in seconds) to spend on an individual MCESs determination. Default 60, -1 means no limit.")
.def_readwrite(
"maxBondMatchPairs",
&RDKit::RascalMCES::RascalOptions::maxBondMatchPairs,
"Too many matching bond (vertex) pairs can cause the process to run out of memory."
" The default of 1000 is fairly safe. Increase with caution, as memory use increases"
" with the square of this number. ")
.def_readwrite("equivalentAtoms",
&RDKit::RascalMCES::RascalOptions::equivalentAtoms,
"SMARTS strings defining atoms that should"
"be considered equivalent. e.g."
"[F,Cl,Br,I] so all halogens will match each other."
"Space-separated list allowing more than 1"
"class of equivalent atoms.")
.def_readwrite("ignoreBondOrders",
&RDKit::RascalMCES::RascalOptions::ignoreBondOrders,
"If True, will treat all bonds as the same,"
" irrespective of order. Default=False.")
.def_readwrite("ignoreAtomAromaticity",
&RDKit::RascalMCES::RascalOptions::ignoreAtomAromaticity,
"If True, matches atoms solely on atomic number."
" If False, will treat aromatic and aliphatic atoms"
" as different. Default=True.")
.def_readwrite("minCliqueSize",
&RDKit::RascalMCES::RascalOptions::minCliqueSize,
"Normally, the minimum clique size is specified"
" via the similarityThreshold. Sometimes it's"
" more convenient to specify it directly. If this"
" is > 0, it will over-ride the"
" similarityThreshold."
" Note that this refers to the"
" minimum number of BONDS in the MCES. Default=0.");
docString =
"Find one or more MCESs between the 2 molecules given. Returns a list of "
"RascalResult objects."
"- mol1"
"- mol2 The two molecules for which to find the MCES"
"- opts Optional RascalOptions object changing the default run mode."
"";
python::def("FindMCES", &RDKit::findMCESWrapper,
(python::arg("mol1"), python::arg("mol2"),
python::arg("opts") = python::object()),
docString.c_str());
docString =
"RASCAL Cluster Options. Most of these pertain to RascalCluster calculations. Only similarityCutoff is used by RascalButinaCluster.";
python::class_<RDKit::RascalMCES::RascalClusterOptions, boost::noncopyable>(
"RascalClusterOptions", docString.c_str())
.def_readwrite(
"similarityCutoff",
&RDKit::RascalMCES::RascalClusterOptions::similarityCutoff,
"Similarity cutoff for molecules to be in the same cluster. Between 0.0 and 1.0, default=0.7.")
.def_readwrite(
"minFragSize", &RDKit::RascalMCES::RascalClusterOptions::minFragSize,
"The minimum number of atoms in a fragment for it to be included in the MCES. Default=3.")
.def_readwrite(
"maxNumFrags", &RDKit::RascalMCES::RascalClusterOptions::maxNumFrags,
"The maximum number of fragments allowed in the MCES for each pair of molecules. Default=2. So that the MCES"
" isn't a lot of small fragments scattered around the molecules giving an inflated estimate of similarity.")
.def_readwrite(
"numThreads", &RDKit::RascalMCES::RascalClusterOptions::numThreads,
"Number of threads to use during clustering. Default=-1 means all the hardware threads less one.")
.def_readwrite(
"a", &RDKit::RascalMCES::RascalClusterOptions::a,
"The penalty score for each unconnected component in the MCES. Default=0.05.")
.def_readwrite(
"b", &RDKit::RascalMCES::RascalClusterOptions::a,
"The weight of matched bonds over matched atoms. Default=2.")
.def_readwrite(
"minIntraClusterSim",
&RDKit::RascalMCES::RascalClusterOptions::minIntraClusterSim,
"Two pairs of molecules are included in the same cluster if the similarity between"
" their MCESs is greater than this. Default=0.9.")
.def_readwrite(
"clusterMergeSim",
&RDKit::RascalMCES::RascalClusterOptions::clusterMergeSim,
"Two clusters are merged if the fraction of molecules they have in common is greater than this. Default=0.6.");
docString =
"Use the RASCAL MCES similarity metric to do fuzzy clustering. Returns a list of lists "
"of molecules, each inner list being a cluster. The last cluster is all the "
"molecules that didn't fit into another cluster (the singletons)."
"- mols List of molecules to be clustered"
"- opts Optional RascalOptions object changing the default run mode."
"";
python::def("RascalCluster", &RDKit::rascalClusterWrapper,
(python::arg("mols"), python::arg("opts") = python::object()),
docString.c_str());
docString =
"Use the RASCAL MCES similarity metric to do Butina clustering"
" (Butina JCICS 39 747-750 (1999)). Returns a list of lists of molecules,"
" each inner list being a cluster. The last cluster is all the"
" molecules that didn't fit into another cluster (the singletons)."
"- mols List of molecules to be clustered"
"- opts Optional RascalOptions object changing the default run mode."
"";
python::def("RascalButinaCluster", &RDKit::rascalButinaClusterWrapper,
(python::arg("mols"), python::arg("opts") = python::object()),
docString.c_str());
}
} // namespace RDKit
|