File: Tautomer.h

package info (click to toggle)
rdkit 202209.3-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 203,880 kB
  • sloc: cpp: 334,239; python: 80,247; ansic: 24,579; java: 7,667; sql: 2,123; yacc: 1,884; javascript: 1,358; lex: 1,260; makefile: 576; xml: 229; fortran: 183; cs: 181; sh: 101
file content (428 lines) | stat: -rw-r--r-- 15,993 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
//
//  Copyright (C) 2018-2021 Susan H. Leung and other RDKit contributors
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//
#include <RDGeneral/export.h>
#ifndef RD_TAUTOMER_H
#define RD_TAUTOMER_H

#include <boost/function.hpp>
#include <string>
#include <utility>
#include <iterator>
#include <Catalogs/Catalog.h>
#include <GraphMol/MolStandardize/MolStandardize.h>
#include <GraphMol/MolStandardize/TautomerCatalog/TautomerCatalogEntry.h>
#include <GraphMol/MolStandardize/TautomerCatalog/TautomerCatalogParams.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <boost/dynamic_bitset.hpp>

namespace RDKit {
class ROMol;
class RWMol;

namespace MolStandardize {

typedef RDCatalog::HierarchCatalog<TautomerCatalogEntry, TautomerCatalogParams,
                                   int>
    TautomerCatalog;

namespace TautomerScoringFunctions {
const std::string tautomerScoringVersion = "1.0.0";

RDKIT_MOLSTANDARDIZE_EXPORT int scoreRings(const ROMol &mol);
RDKIT_MOLSTANDARDIZE_EXPORT int scoreSubstructs(const ROMol &mol);
RDKIT_MOLSTANDARDIZE_EXPORT int scoreHeteroHs(const ROMol &mol);

inline int scoreTautomer(const ROMol &mol) {
  return scoreRings(mol) + scoreSubstructs(mol) + scoreHeteroHs(mol);
}
}  // namespace TautomerScoringFunctions

enum class TautomerEnumeratorStatus {
  Completed = 0,
  MaxTautomersReached,
  MaxTransformsReached,
  Canceled
};

class Tautomer {
  friend class TautomerEnumerator;

 public:
  Tautomer() : d_numModifiedAtoms(0), d_numModifiedBonds(0), d_done(false) {}
  Tautomer(ROMOL_SPTR t, ROMOL_SPTR k, size_t a = 0, size_t b = 0)
      : tautomer(std::move(t)),
        kekulized(std::move(k)),
        d_numModifiedAtoms(a),
        d_numModifiedBonds(b),
        d_done(false) {}
  ROMOL_SPTR tautomer;
  ROMOL_SPTR kekulized;

 private:
  size_t d_numModifiedAtoms;
  size_t d_numModifiedBonds;
  bool d_done;
};

typedef std::map<std::string, Tautomer> SmilesTautomerMap;
typedef std::pair<std::string, Tautomer> SmilesTautomerPair;

//! Contains results of tautomer enumeration
class RDKIT_MOLSTANDARDIZE_EXPORT TautomerEnumeratorResult {
  friend class TautomerEnumerator;

 public:
  class const_iterator {
   public:
    typedef ROMOL_SPTR value_type;
    typedef std::ptrdiff_t difference_type;
    typedef const ROMol *pointer;
    typedef const ROMOL_SPTR &reference;
    typedef std::bidirectional_iterator_tag iterator_category;

    explicit const_iterator(const SmilesTautomerMap::const_iterator &it)
        : d_it(it) {}
    reference operator*() const { return d_it->second.tautomer; }
    pointer operator->() const { return d_it->second.tautomer.get(); }
    bool operator==(const const_iterator &other) const {
      return (d_it == other.d_it);
    }
    bool operator!=(const const_iterator &other) const {
      return !(*this == other);
    }
    const_iterator operator++(int) {
      const_iterator copy(d_it);
      operator++();
      return copy;
    }
    const_iterator &operator++() {
      ++d_it;
      return *this;
    }
    const_iterator operator--(int) {
      const_iterator copy(d_it);
      operator--();
      return copy;
    }
    const_iterator &operator--() {
      --d_it;
      return *this;
    }

   private:
    SmilesTautomerMap::const_iterator d_it;
  };
  TautomerEnumeratorResult() : d_status(TautomerEnumeratorStatus::Completed) {}
  TautomerEnumeratorResult(const TautomerEnumeratorResult &other)
      : d_tautomers(other.d_tautomers),
        d_status(other.d_status),
        d_modifiedAtoms(other.d_modifiedAtoms),
        d_modifiedBonds(other.d_modifiedBonds) {
    fillTautomersItVec();
  }
  const const_iterator begin() const {
    return const_iterator(d_tautomers.begin());
  }
  const const_iterator end() const { return const_iterator(d_tautomers.end()); }
  size_t size() const { return d_tautomers.size(); }
  bool empty() const { return d_tautomers.empty(); }
  const ROMOL_SPTR &at(size_t pos) const {
    PRECONDITION(pos < d_tautomers.size(), "index out of bounds");
    return d_tautomersItVec.at(pos)->second.tautomer;
  }
  const ROMOL_SPTR &operator[](size_t pos) const { return at(pos); }
  const boost::dynamic_bitset<> &modifiedAtoms() const {
    return d_modifiedAtoms;
  }
  const boost::dynamic_bitset<> &modifiedBonds() const {
    return d_modifiedBonds;
  }
  TautomerEnumeratorStatus status() const { return d_status; }
  std::vector<ROMOL_SPTR> tautomers() const {
    std::vector<ROMOL_SPTR> tautomerVec;
    tautomerVec.reserve(d_tautomers.size());
    std::transform(
        d_tautomers.begin(), d_tautomers.end(), std::back_inserter(tautomerVec),
        [](const SmilesTautomerPair &t) { return t.second.tautomer; });
    return tautomerVec;
  }
  std::vector<ROMOL_SPTR> operator()() const { return tautomers(); }
  std::vector<std::string> smiles() const {
    std::vector<std::string> smilesVec;
    smilesVec.reserve(d_tautomers.size());
    std::transform(d_tautomers.begin(), d_tautomers.end(),
                   std::back_inserter(smilesVec),
                   [](const SmilesTautomerPair &t) { return t.first; });
    return smilesVec;
  }
  const SmilesTautomerMap &smilesTautomerMap() const { return d_tautomers; }

 private:
  void fillTautomersItVec() {
    for (auto it = d_tautomers.begin(); it != d_tautomers.end(); ++it) {
      d_tautomersItVec.push_back(it);
    }
  }
  // the enumerated tautomers
  SmilesTautomerMap d_tautomers;
  // internal; vector of iterators into map items to enable random
  // access to map items by index
  std::vector<SmilesTautomerMap::const_iterator> d_tautomersItVec;
  // status of the enumeration: did it complete? did it hit a limit?
  // was it canceled?
  TautomerEnumeratorStatus d_status;
  // bit vector: flags atoms modified by the transforms
  boost::dynamic_bitset<> d_modifiedAtoms;
  // bit vector: flags bonds modified by the transforms
  boost::dynamic_bitset<> d_modifiedBonds;
};

class RDKIT_MOLSTANDARDIZE_EXPORT TautomerEnumeratorCallback {
 public:
  TautomerEnumeratorCallback() {}
  virtual ~TautomerEnumeratorCallback() {}
  virtual bool operator()(const ROMol &, const TautomerEnumeratorResult &) = 0;
};

class RDKIT_MOLSTANDARDIZE_EXPORT TautomerEnumerator {
 public:
  TautomerEnumerator(TautomerCatalog *tautCat)
      : dp_catalog(tautCat),
        d_maxTautomers(1000),
        d_maxTransforms(1000),
        d_removeSp3Stereo(true),
        d_removeBondStereo(true),
        d_removeIsotopicHs(true),
        d_reassignStereo(true) {}
  TautomerEnumerator(const CleanupParameters &params = CleanupParameters());
  TautomerEnumerator(const TautomerEnumerator &other)
      : dp_catalog(other.dp_catalog),
        d_callback(other.d_callback),
        d_maxTautomers(other.d_maxTautomers),
        d_maxTransforms(other.d_maxTransforms),
        d_removeSp3Stereo(other.d_removeSp3Stereo),
        d_removeBondStereo(other.d_removeBondStereo),
        d_removeIsotopicHs(other.d_removeIsotopicHs),
        d_reassignStereo(other.d_reassignStereo) {}
  TautomerEnumerator &operator=(const TautomerEnumerator &other) {
    if (this == &other) {
      return *this;
    }
    dp_catalog = other.dp_catalog;
    d_callback = other.d_callback;
    d_maxTautomers = other.d_maxTautomers;
    d_maxTransforms = other.d_maxTransforms;
    d_removeSp3Stereo = other.d_removeSp3Stereo;
    d_removeBondStereo = other.d_removeBondStereo;
    d_removeIsotopicHs = other.d_removeIsotopicHs;
    d_reassignStereo = other.d_reassignStereo;
    return *this;
  }
  //! \param maxTautomers maximum number of tautomers to be generated
  void setMaxTautomers(unsigned int maxTautomers) {
    d_maxTautomers = maxTautomers;
  }
  //! \return maximum number of tautomers to be generated
  unsigned int getMaxTautomers() { return d_maxTautomers; }
  /*! \param maxTransforms maximum number of transformations to be applied
      this limit is usually hit earlier than the maxTautomers limit
      and leads to a more linear scaling of CPU time with increasing
      number of tautomeric centers (see Sitzmann et al.)
   */
  void setMaxTransforms(unsigned int maxTransforms) {
    d_maxTransforms = maxTransforms;
  }
  //! \return maximum number of transformations to be applied
  unsigned int getMaxTransforms() { return d_maxTransforms; }
  /*! \param removeSp3Stereo; if set to true, stereochemistry information
      will be removed from sp3 atoms involved in tautomerism.
      This means that S-aminoacids will lose their stereochemistry after going
      through tautomer enumeration because of the amido-imidol tautomerism.
      This defaults to true in RDKit, false in the workflow described
      by Sitzmann et al.
   */
  void setRemoveSp3Stereo(bool removeSp3Stereo) {
    d_removeSp3Stereo = removeSp3Stereo;
  }
  /*! \return whether stereochemistry information will be removed from
      sp3 atoms involved in tautomerism
   */
  bool getRemoveSp3Stereo() { return d_removeSp3Stereo; }
  /*! \param removeBondStereo; if set to true, stereochemistry information
      will be removed from double bonds involved in tautomerism.
      This means that enols will lose their E/Z stereochemistry after going
      through tautomer enumeration because of the keto-enolic tautomerism.
      This defaults to true in RDKit and also in the workflow described
      by Sitzmann et al.
   */
  void setRemoveBondStereo(bool removeBondStereo) {
    d_removeBondStereo = removeBondStereo;
  }
  /*! \return whether stereochemistry information will be removed from
      double bonds involved in tautomerism
   */
  bool getRemoveBondStereo() { return d_removeBondStereo; }
  /*! \param removeIsotopicHs; if set to true, isotopic Hs
      will be removed from centers involved in tautomerism.
   */
  void setRemoveIsotopicHs(bool removeIsotopicHs) {
    d_removeIsotopicHs = removeIsotopicHs;
  }
  /*! \return whether isotpoic Hs will be removed from
      centers involved in tautomerism
   */
  bool getRemoveIsotopicHs() { return d_removeIsotopicHs; }
  /*! \param reassignStereo; if set to true, assignStereochemistry
      will be called on each tautomer generated by the enumerate() method.
      This defaults to true.
   */
  void setReassignStereo(bool reassignStereo) {
    d_reassignStereo = reassignStereo;
  }
  /*! \return whether assignStereochemistry will be called on each
      tautomer generated by the enumerate() method
   */
  bool getReassignStereo() { return d_reassignStereo; }
  /*! set this to an instance of a class derived from
      TautomerEnumeratorCallback where operator() is overridden.
      DO NOT delete the instance as ownership of the pointer is transferred
      to the TautomerEnumerator
   */
  void setCallback(TautomerEnumeratorCallback *callback) {
    d_callback.reset(callback);
  }
  /*! \return pointer to an instance of a class derived from
      TautomerEnumeratorCallback.
      DO NOT delete the instance as ownership of the pointer is transferred
      to the TautomerEnumerator
   */
  TautomerEnumeratorCallback *getCallback() const { return d_callback.get(); }

  //! returns a \c TautomerEnumeratorResult structure for the input molecule
  /*!
    The enumeration rules are inspired by the publication:
    M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010)
    https://doi.org/10.1007/s10822-010-9346-4

    \param mol: the molecule to be enumerated

    Note: the definitions used here are that the atoms modified during
    tautomerization are the atoms at the beginning and end of each tautomer
    transform (the H "donor" and H "acceptor" in the transform) and the bonds
    modified during transformation are any bonds whose order is changed during
    the tautomer transform (these are the bonds between the "donor" and the
    "acceptor")

  */
  TautomerEnumeratorResult enumerate(const ROMol &mol) const;

  //! Deprecated, please use the form returning a \c TautomerEnumeratorResult
  //! instead
  [
      [deprecated("please use the form returning a TautomerEnumeratorResult "
                  "instead")]] std::vector<ROMOL_SPTR>
  enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms,
            boost::dynamic_bitset<> *modifiedBonds = nullptr) const;

  //! returns the canonical tautomer from a \c TautomerEnumeratorResult
  ROMol *pickCanonical(const TautomerEnumeratorResult &tautRes,
                       boost::function<int(const ROMol &mol)> scoreFunc =
                           TautomerScoringFunctions::scoreTautomer) const;

  //! returns the canonical tautomer from an iterable of possible tautomers
  /// When Iterable is TautomerEnumeratorResult we use the other non-templated
  /// overload for efficiency (TautomerEnumeratorResult already has SMILES so no
  /// need to recompute them)
  template <class Iterable,
            typename std::enable_if<
                !std::is_same<Iterable, TautomerEnumeratorResult>::value,
                int>::type = 0>
  ROMol *pickCanonical(const Iterable &tautomers,
                       boost::function<int(const ROMol &mol)> scoreFunc =
                           TautomerScoringFunctions::scoreTautomer) const {
    ROMOL_SPTR bestMol;
    if (tautomers.size() == 1) {
      bestMol = *tautomers.begin();
    } else {
      // Calculate score for each tautomer
      int bestScore = std::numeric_limits<int>::min();
      std::string bestSmiles = "";
      for (const auto &t : tautomers) {
        auto score = scoreFunc(*t);
#ifdef VERBOSE_ENUMERATION
        std::cerr << "  " << MolToSmiles(*t) << " " << score << std::endl;
#endif
        if (score > bestScore) {
          bestScore = score;
          bestSmiles = MolToSmiles(*t);
          bestMol = t;
        } else if (score == bestScore) {
          auto smiles = MolToSmiles(*t);
          if (smiles < bestSmiles) {
            bestSmiles = smiles;
            bestMol = t;
          }
        }
      }
    }
    ROMol *res = new ROMol(*bestMol);
    static const bool cleanIt = true;
    static const bool force = true;
    MolOps::assignStereochemistry(*res, cleanIt, force);

    return res;
  }

  //! returns the canonical tautomer for a molecule
  /*!
    Note that the canonical tautomer is very likely not the most stable tautomer
    for any given conditions. The default scoring rules are designed to produce
    "reasonable" tautomers, but the primary concern is that the results are
    canonical: you always get the same canonical tautomer for a molecule
    regardless of what the input tautomer or atom ordering were.

    The default scoring scheme is inspired by the publication:
    M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010)
    https://doi.org/10.1007/s10822-010-9346-4

  */
  ROMol *canonicalize(const ROMol &mol,
                      boost::function<int(const ROMol &mol)> scoreFunc =
                          TautomerScoringFunctions::scoreTautomer) const;

 private:
  bool setTautomerStereoAndIsoHs(const ROMol &mol, ROMol &taut,
                                 const TautomerEnumeratorResult &res) const;
  std::shared_ptr<TautomerCatalog> dp_catalog;
  std::shared_ptr<TautomerEnumeratorCallback> d_callback;
  unsigned int d_maxTautomers;
  unsigned int d_maxTransforms;
  bool d_removeSp3Stereo;
  bool d_removeBondStereo;
  bool d_removeIsotopicHs;
  bool d_reassignStereo;
};  // TautomerEnumerator class

// caller owns the pointer
inline TautomerEnumerator *tautomerEnumeratorFromParams(
    const CleanupParameters &params) {
  return new TautomerEnumerator(params);
}
// caller owns the pointer
inline TautomerEnumerator *getV1TautomerEnumerator() {
  TautomerCatalogParams tparms(
      MolStandardize::defaults::defaultTautomerTransformsv1);
  return new TautomerEnumerator(new TautomerCatalog(&tparms));
}

}  // namespace MolStandardize
}  // namespace RDKit

#endif