File: TautomerQuery.cpp

package info (click to toggle)
rdkit 202503.1-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 220,160 kB
  • sloc: cpp: 399,240; python: 77,453; ansic: 25,517; java: 8,173; javascript: 4,005; sql: 2,389; yacc: 1,565; lex: 1,263; cs: 1,081; makefile: 580; xml: 229; fortran: 183; sh: 105
file content (327 lines) | stat: -rw-r--r-- 10,644 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
//
// Created by Gareth Jones on 5/7/2020.
//
// Copyright 2020 Schrodinger, Inc
//  @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.

#include "TautomerQuery.h"
#include <functional>
#include <set>
#include <utility>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/MolStandardize/Tautomer.h>
#include <GraphMol/Bond.h>
#include <GraphMol/QueryOps.h>
#include <GraphMol/QueryAtom.h>
#include <GraphMol/QueryBond.h>
#include <GraphMol/Substruct/SubstructUtils.h>
#include <GraphMol/Fingerprints/Fingerprints.h>
// #define VERBOSE

#ifdef RDK_USE_BOOST_SERIALIZATION
#include <RDGeneral/BoostStartInclude.h>
#include <boost/archive/text_oarchive.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <boost/archive/archive_exception.hpp>
#include <RDGeneral/BoostEndInclude.h>
#endif

#ifdef VERBOSE
#include <GraphMol/SmilesParse/SmilesWrite.h>
#endif

namespace {

using namespace RDKit;

// Adapted from Code/GraphMol/Substruct/SubstructUtils.cpp#removeDuplicates
void removeTautomerDuplicates(std::vector<MatchVectType> &matches,
                              std::vector<ROMOL_SPTR> *matchingTautomers,
                              int nAtoms) {
  //
  //  This works by tracking the indices of the atoms in each match vector.
  //  This can lead to unexpected behavior when looking at rings and queries
  //  that don't specify bond orders.  For example querying this molecule:
  //    C1CCC=1
  //  with the pattern constructed from SMARTS C~C~C~C will return a
  //  single match, despite the fact that there are 4 different paths
  //  when valence is considered.  The defense of this behavior is
  //  that the 4 paths are equivalent in the semantics of the query.
  //  Also, OELib returns the same results
  //

  std::set<boost::dynamic_bitset<>> seen;
  std::vector<MatchVectType> res;
  res.reserve(matches.size());
  for (auto &&match : matches) {
    boost::dynamic_bitset<> val(nAtoms);
    for (const auto &ci : match) {
      val.set(ci.second);
    }
    auto pos = seen.lower_bound(val);
    if (pos == seen.end() || *pos != val) {
      res.push_back(std::move(match));
      seen.insert(pos, std::move(val));
    } else if (matchingTautomers) {
      auto position = res.size();
      matchingTautomers->erase(matchingTautomers->begin() + position);
    }
  }
  res.shrink_to_fit();
  matches = std::move(res);
}

}  // namespace

namespace RDKit {

bool TautomerQueryCanSerialize() {
#ifdef RDK_USE_BOOST_SERIALIZATION
  return true;
#else
  return false;
#endif
}

class TautomerQueryMatcher {
 private:
  const TautomerQuery &d_tautomerQuery;
  const SubstructMatchParameters &d_params;
  std::vector<ROMOL_SPTR> *d_matchingTautomers;

 public:
  TautomerQueryMatcher(const TautomerQuery &tautomerQuery,
                       const SubstructMatchParameters &params,
                       std::vector<ROMOL_SPTR> *matchingTautomers)
      : d_tautomerQuery(tautomerQuery),
        d_params(params),
        d_matchingTautomers(matchingTautomers) {}

  bool match(const ROMol &mol, const std::vector<unsigned int> &match) {
#ifdef VERBOSE
    std::cout << "Checking template match" << std::endl;
#endif

    for (auto tautomer : d_tautomerQuery.getTautomers()) {
#ifdef VERBOSE
      std::cout << "Checking Tautomer " << MolToSmiles(*tautomer) << std::endl;
#endif
      if (d_tautomerQuery.matchTautomer(mol, *tautomer, match, d_params)) {
        auto matchingTautomer = d_params.extraFinalCheck
                                    ? d_params.extraFinalCheck(mol, match)
                                    : true;
        if (matchingTautomer) {
#ifdef VERBOSE
          std::cout << "Got Match " << std::endl;
#endif
          if (d_matchingTautomers) {
            d_matchingTautomers->push_back(tautomer);
          }
        }
        return matchingTautomer;
      }
    }
    return false;
  }
};

TautomerQuery::TautomerQuery(std::vector<ROMOL_SPTR> tautomers,
                             const ROMol *const templateMolecule,
                             std::vector<size_t> modifiedAtoms,
                             std::vector<size_t> modifiedBonds)
    : d_tautomers(std::move(tautomers)),
      d_templateMolecule(templateMolecule),
      d_modifiedAtoms(std::move(modifiedAtoms)),
      d_modifiedBonds(std::move(modifiedBonds)) {}

TautomerQuery *TautomerQuery::fromMol(
    const ROMol &query, const std::string &tautomerTransformFile) {
  auto tautomerParams = std::unique_ptr<MolStandardize::TautomerCatalogParams>(
      new MolStandardize::TautomerCatalogParams(tautomerTransformFile));
  MolStandardize::TautomerEnumerator tautomerEnumerator(
      new MolStandardize::TautomerCatalog(tautomerParams.get()));
  const auto res = tautomerEnumerator.enumerate(query);

  std::vector<size_t> modifiedAtoms;
  modifiedAtoms.reserve(res.modifiedAtoms().count());
  for (size_t i = 0; i < query.getNumAtoms(); i++) {
    if (res.modifiedAtoms().test(i)) {
      modifiedAtoms.push_back(i);
    }
  }
  std::vector<size_t> modifiedBonds;
  modifiedBonds.reserve(res.modifiedBonds().count());
  for (size_t i = 0; i < query.getNumBonds(); i++) {
    if (res.modifiedBonds().test(i)) {
      modifiedBonds.push_back(i);
    }
  }

  auto templateMolecule = new RWMol(query);
  for (auto idx : modifiedAtoms) {
    const auto atom = templateMolecule->getAtomWithIdx(idx);
    const auto queryAtom = new QueryAtom(atom->getAtomicNum());
    const bool updateLabel = false;
    const bool preserveProps = true;
    templateMolecule->replaceAtom(idx, queryAtom, updateLabel, preserveProps);
    delete queryAtom;
  }
  for (auto idx : modifiedBonds) {
    auto bondQuery = makeSingleOrDoubleOrAromaticBondQuery();
    auto queryBond = new QueryBond();
    queryBond->setQuery(bondQuery);
    templateMolecule->replaceBond(idx, queryBond, true);
    delete queryBond;
  }

  return new TautomerQuery(res.tautomers(),
                           static_cast<ROMol *>(templateMolecule),
                           modifiedAtoms, modifiedBonds);
}

bool TautomerQuery::matchTautomer(
    const ROMol &mol, const ROMol &tautomer,
    const std::vector<unsigned int> &match,
    const SubstructMatchParameters &params) const {
  for (auto idx : d_modifiedAtoms) {
    const auto queryAtom = tautomer.getAtomWithIdx(idx);
    const auto targetAtom = mol.getAtomWithIdx(match[idx]);
#ifdef VERBOSE
    std::cout << "Query atom " << queryAtom->getSymbol() << " target atom "
              << targetAtom->getSymbol() << std::endl;
#endif
    if (!atomCompat(queryAtom, targetAtom, params)) {
#ifdef VERBOSE
      std::cout << "Atom incompatibility" << std::endl;
#endif
      return false;
    }
  }

  for (auto idx : d_modifiedBonds) {
    const auto queryBond = tautomer.getBondWithIdx(idx);
    const auto beginIdx = queryBond->getBeginAtomIdx();
    const auto endIdx = queryBond->getEndAtomIdx();
    const auto targetBeginIdx = match[beginIdx];
    const auto targetEndIdx = match[endIdx];
    const auto targetBond =
        mol.getBondBetweenAtoms(targetBeginIdx, targetEndIdx);
#ifdef VERBOSE
    std::cout << "Query bond " << queryBond->getBondType() << " target bond "
              << targetBond->getBondType() << std::endl;
#endif
    if (!bondCompat(queryBond, targetBond, params)) {
#ifdef VERBOSE
      std::cout << "Bond incompatibility" << std::endl;
#endif
      return false;
    }
  }

#ifdef VERBOSE
  std::cout << "Tautomer match" << std::endl;
#endif
  return true;
}

std::vector<MatchVectType> TautomerQuery::substructOf(
    const ROMol &mol, const SubstructMatchParameters &params,
    std::vector<ROMOL_SPTR> *matchingTautomers) const {
  if (matchingTautomers) {
    matchingTautomers->clear();
  }

#ifdef VERBOSE
  std::cout << "Tautomer search with query " << MolToSmiles(*d_templateMolecule)
            << " on " << MolToSmiles(mol) << " max matches "
            << params.maxMatches << std::endl;
#endif
  SubstructMatchParameters templateParams(params);
  // need to check all mappings of template to target
  templateParams.uniquify = false;

  TautomerQueryMatcher tautomerQueryMatcher(*this, params, matchingTautomers);
  // use this functor as a final check to see if any tautomer matches the target
  auto checker = [&tautomerQueryMatcher](
                     const ROMol &mol,
                     const std::vector<unsigned int> &match) mutable {
    return tautomerQueryMatcher.match(mol, match);
  };
  templateParams.extraFinalCheck = checker;

  auto matches =
      RDKit::SubstructMatch(mol, *d_templateMolecule, templateParams);

#ifdef VERBOSE
  std::cout << "Found " << matches.size() << " matches " << std::endl;
#endif

  if (params.uniquify && matches.size() > 1) {
    removeTautomerDuplicates(matches, matchingTautomers, mol.getNumAtoms());
#ifdef VERBOSE
    std::cout << "After removing duplicates " << matches.size() << " matches "
              << std::endl;
#endif
  }
  return matches;
}

bool TautomerQuery::isSubstructOf(const ROMol &mol,
                                  const SubstructMatchParameters &params) {
  SubstructMatchParameters params2(params);
  params2.maxMatches = 1;
  auto matches = substructOf(mol, params2);
  return matches.size() > 0;
}

ExplicitBitVect *TautomerQuery::patternFingerprintTemplate(
    unsigned int fpSize) const {
  return PatternFingerprintMol(*d_templateMolecule, fpSize, nullptr, nullptr,
                               true);
}

ExplicitBitVect *TautomerQuery::patternFingerprintTarget(const ROMol &target,
                                                         unsigned int fpSize) {
  return PatternFingerprintMol(target, fpSize, nullptr, nullptr, true);
}

std::vector<MatchVectType> SubstructMatch(
    const ROMol &mol, const TautomerQuery &query,
    const SubstructMatchParameters &params) {
  return query.substructOf(mol, params);
}

void TautomerQuery::toStream(std::ostream &ss) const {
#ifndef RDK_USE_BOOST_SERIALIZATION
  PRECONDITION(0, "Boost SERIALIZATION is not enabled")
#else
  boost::archive::text_oarchive ar(ss);
  ar << *this;
#endif
}

std::string TautomerQuery::serialize() const {
  std::stringstream ss;
  toStream(ss);
  return ss.str();
}

void TautomerQuery::initFromStream(std::istream &ss) {
#ifndef RDK_USE_BOOST_SERIALIZATION
  PRECONDITION(0, "Boost SERIALIZATION is not enabled")
#else
  boost::archive::text_iarchive ar(ss);
  ar >> *this;
#endif
}

void TautomerQuery::initFromString(const std::string &text) {
  std::stringstream ss(text);
  initFromStream(ss);
}

}  // namespace RDKit