File: AbbreviationsUtils.cpp

package info (click to toggle)
rdkit 202503.1-5
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 220,160 kB
  • sloc: cpp: 399,240; python: 77,453; ansic: 25,517; java: 8,173; javascript: 4,005; sql: 2,389; yacc: 1,565; lex: 1,263; cs: 1,081; makefile: 580; xml: 229; fortran: 183; sh: 105
file content (228 lines) | stat: -rw-r--r-- 6,833 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
//
//  Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//
#include "Abbreviations.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/RDKitQueries.h>
#include <boost/tokenizer.hpp>

#include <algorithm>
#include <memory>

// Tokenizer that splits a string on the separator characters given to its
// boost::char_separator; used below to split the abbreviation text first into
// lines and then into whitespace-delimited fields.
using tokenizer = boost::tokenizer<boost::char_separator<char>>;

namespace RDKit {

namespace Abbreviations {

// Names of the properties used by the abbreviation code.
namespace common_properties {
// Set on every abbreviation query molecule by detail::createAbbreviationMol();
// holds the number of dummy atoms that remain in the query.
const std::string numDummies{"_numDummies"};
// NOTE(review): the two mapping keys are only defined here, not used in this
// file; presumably they record the original atom/bond indices when
// abbreviations are applied elsewhere -- confirm against the callers.
const std::string origAtomMapping{"_origAtomMapping"};
const std::string origBondMapping{"_origBondMapping"};
}  // namespace common_properties

namespace Utils {
namespace data {
/*
Translations of superatom labels to SMILES.

One definition per line, fields separated by spaces or tabs:

label smiles display_label display_label_w

The last two fields are optional (see parseAbbreviations()). The display
labels may contain <sub>...</sub> and <sup>...</sup> markup.

First atom of the SMILES string should be the one connected to the rest of
the molecule; createAbbreviationMol() prepends a dummy atom for the
attachment point when the string does not already start with one. Note that
the string is handed to the SMARTS parser, not the SMILES parser.

ADAPTED FROM: https://github.com/openbabel/superatoms/blob/master/superatom.txt

Originally from http://cactus.nci.nih.gov/osra/

The left-aligned form is the one recognized in MDL alias lines;
the right-aligned form may be used in 2D depiction.

NOTE(review): iBu = C(C)CC and iPent = C(C)CCC branch at the attachment
atom, which looks like sec-butyl and pentan-2-yl rather than isobutyl and
isopentyl (compare OiBu = OCC(C)C). Confirm against the upstream superatom
list before changing; the data is intentionally left untouched here.
*/
const std::string defaultAbbreviations =
    R"ABBREVS(CO2Et    C(=O)OCC  CO<sub>2</sub>Et    EtO<sub>2</sub>C
COOEt    C(=O)OCC  CO<sub>2</sub>Et    EtO<sub>2</sub>C
OiBu     OCC(C)C   OiBu     iBuO
nDec     CCCCCCCCCC  nDec
nNon     CCCCCCCCC   nNon
nOct     CCCCCCCC    nOct
nHept    CCCCCCC     nHept
nHex     CCCCCC      nHex
nPent    CCCCC       nPent
iPent    C(C)CCC     iPent
tBu      C(C)(C)C    tBu
iBu      C(C)CC      iBu
nBu      CCCC        nBu
iPr      C(C)C       iPr
nPr      CCC         nPr
Et       CC          Et
NCF3     NC(F)(F)F NCF<sub>3</sub>     F<sub>3</sub>CN
CF3      C(F)(F)F  CF<sub>3</sub>      F<sub>3</sub>C
CCl3     C(Cl)(Cl)Cl CCl<sub>3</sub>     Cl<sub>3</sub>C
CN       C#N       CN       NC
NC       [N+]#[C-] NC       CN 
N(OH)CH3 N([OH])C    N(OH)CH<sub>3</sub> CH<sub>3</sub>(OH)N
NO2      [N+](=O)[O-]  NO<sub>2</sub>      O<sub>2</sub>N
NO       N=O   NO       ON
SO3H     S(=O)(=O)[OH] SO<sub>3</sub>H     HO<sub>3</sub>S
CO2H     C(=O)[OH] CO<sub>2</sub>H     HO<sub>2</sub>C
COOH     C(=O)[OH] COOH     HOOC
OEt      OCC   OEt      EtO
OAc      OC(=O)C   OAc      AcO
NHAc     NC(=O)C   NHAc     AcNH
Ac       C(=O)C    Ac
CHO      C=O   CHO      OHC
NMe      NC    NMe      MeN
SMe      SC    SMe      MeS
OMe      OC    OMe      MeO
CO2-     C(=O)[O-]   COO<sup>-</sup>     <sup>-</sup>OOC
COO-     C(=O)[O-]   COO<sup>-</sup>     <sup>-</sup>OOC)ABBREVS";

/*
Translations of linker superatom labels to SMILES.

Same line format as defaultAbbreviations: label, SMILES and an optional
display label, separated by whitespace.

The first atom of the SMILES string should be a dummy standing for the
connection to the rest of the molecule. The remaining dummy atom(s) mark the
linker's other attachment point(s).

*/
const std::string defaultLinkers =
    R"ABBREVS(PEG6  *OCCOCCOCCOCCOCCOCC* PEG6
PEG5  *OCCOCCOCCOCCOCC* PEG5
PEG4  *OCCOCCOCCOCC* PEG4
PEG3  *OCCOCCOCC* PEG3
Dec   *CCCCCCCCCC*
Non   *CCCCCCCCC*
Oct   *CCCCCCCC*
Hept  *CCCCCCC*)ABBREVS";
// other possible linker abbreviations that might be useful
// (the entries from PEG6 through Hept are already part of defaultLinkers):
/*
PEG6  *OCCOCCOCCOCCOCCOCC* PEG6
PEG5  *OCCOCCOCCOCCOCC* PEG5
PEG4  *OCCOCCOCCOCC* PEG4
PEG3  *OCCOCCOCC* PEG3
Dec   *CCCCCCCCCC*
Non   *CCCCCCCCC*
Oct   *CCCCCCCC*
Hept  *CCCCCCC*
Hex   *CCCCCC*
Pent  *CCCCC*
Cy   *C1CCC(*)CC1  Cy
ala *N[C@@H](C)C(=O)* ala
arg *N[C@@H](CCCNC(N)=[NH])C(=O)* arg
asn *N[C@@H](CC(N)=O)C(=O)* asn
asp *N[C@@H](CC(O)=O)C(=O)* asp
cys *N[C@@H](CS)C(=O)* cys
gln *N[C@@H](CCC(N)=O)C(=O)* gln
glu *N[C@@H](CCC(O)=O)C(=O)* glu
gly *NCC(=O)* gly
his *N[C@@H](Cc1c[nH]cn1)C(=O)* his
ile *N[C@@H](C(C)CC)C(=O)* ile
leu *N[C@@H](CC(C)C)C(=O)* leu
lys *N[C@@H](CCCCN)C(=O)* lys
met *N[C@@H](CCSC)C(=O)* met
phe *N[C@@H](Cc1ccccc1)C(=O)* phe
pro *N1[C@@H](CCC1)C(=O)* pro
ser *N[C@@H](CO)C(=O)* ser
thr *N[C@@H](C(O)C)C(=O)* thr
trp *N[C@@H](Cc1c[nH]c2ccccc21)C(=O)* trp
tyr *N[C@@H](Cc1ccc(O)cc1)C(=O)* tyr
val *N[C@@H](C(C)C)C(=O)* val
*/
}  // namespace data

namespace detail {
/// Builds the query molecule used to recognize one abbreviation.
///
/// \param txt  SMARTS describing the abbreviated group. If it does not start
///             with '*', a leading "*" is prepended so that the first atom of
///             the query is always the attachment-point dummy.
/// \param removeExtraDummies  when true, every atom other than the first
///             whose query is described as "AtomNull" (presumably the atoms
///             produced by '*' in the SMARTS) is removed from the query.
/// \param allowConnectionToDummies  when false, the first atom's query is
///             extended so that it does not match atoms with atomic number 0.
/// \return a newly allocated query molecule owned by the caller, carrying the
///         common_properties::numDummies property; nullptr if the SMARTS
///         could not be parsed or yields fewer than two atoms.
ROMol *createAbbreviationMol(const std::string &txt, bool removeExtraDummies,
                             bool allowConnectionToDummies) {
  // make sure the pattern starts with the attachment-point dummy
  const std::string smarts =
      (!txt.empty() && txt[0] == '*') ? txt : "*" + txt;

  // Hold the parsed molecule in a smart pointer so that it is freed if any of
  // the steps below throws; ownership is handed to the caller on success.
  std::unique_ptr<RWMol> q(SmartsToMol(smarts));
  if (!q) {
    return nullptr;
  }
  if (q->getNumAtoms() < 2) {
    BOOST_LOG(rdErrorLog) << "abbreviation with <2 atoms ignored" << std::endl;
    return nullptr;
  }

  // add degree and ring-count constraints to the non-dummy atoms
  MolOps::AdjustQueryParameters ps;
  ps.adjustDegree = true;
  ps.adjustDegreeFlags = MolOps::AdjustQueryWhichFlags::ADJUST_IGNOREDUMMIES;
  ps.adjustRingCount = true;
  ps.adjustRingCountFlags = MolOps::AdjustQueryWhichFlags::ADJUST_IGNOREDUMMIES;
  MolOps::adjustQueryProperties(*q, &ps);

  if (!allowConnectionToDummies) {
    // the attachment point must not match an atom with atomic number 0
    auto qry = makeAtomNumQuery(0);
    qry->setNegation(true);
    q->getAtomWithIdx(0)->expandQuery(qry);
  }

  // every '*' character in the SMARTS counts as one dummy
  auto nDummies = static_cast<unsigned int>(
      std::count(smarts.begin(), smarts.end(), '*'));
  if (removeExtraDummies) {
    // Removals are requested inside a batch edit and finalized by
    // commitBatchEdit(), so (presumably) the atoms being iterated over stay
    // valid until the loop has finished.
    q->beginBatchEdit();
    for (const auto at : q->atoms()) {
      if (!at->getIdx()) {
        // never remove the attachment point itself
        continue;
      }
      if (at->hasQuery() && at->getQuery()->getDescription() == "AtomNull") {
        q->removeAtom(at->getIdx());
        --nDummies;
      }
    }
    q->commitBatchEdit();
  }
  q->setProp(common_properties::numDummies, nDummies);
  return q.release();
}
}  // namespace detail

std::vector<AbbreviationDefinition> parseAbbreviations(
    const std::string &text, bool removeExtraDummies,
    bool allowConnectionToDummies) {
  std::vector<AbbreviationDefinition> res;
  boost::char_separator<char> lineSep("\n");
  tokenizer lines(text, lineSep);
  boost::char_separator<char> fieldSep(" \t");
  for (const auto &line : lines) {
    AbbreviationDefinition defn;
    tokenizer fields(line, fieldSep);
    tokenizer::iterator field = fields.begin();
    defn.label = *field;
    ++field;
    defn.smarts = *field;
    ++field;
    if (field != fields.end()) {
      defn.displayLabel = *field;
      ++field;
      if (field != fields.end()) {
        defn.displayLabelW = *field;
      }
    }
    defn.mol.reset(detail::createAbbreviationMol(
        defn.smarts, removeExtraDummies, allowConnectionToDummies));
    if (defn.mol) {
      res.push_back(defn);
    }
  }

  return res;
}
/// Returns a copy of the built-in abbreviation definitions.
///
/// data::defaultAbbreviations is parsed on the first call, using the default
/// values of parseAbbreviations()'s remaining parameters, and the result is
/// kept in a function-local static for later calls.
std::vector<AbbreviationDefinition> getDefaultAbbreviations() {
  static const std::vector<AbbreviationDefinition> cachedDefinitions =
      parseAbbreviations(data::defaultAbbreviations);
  return cachedDefinitions;
}
/// Returns a copy of the built-in linker definitions.
///
/// data::defaultLinkers is parsed on the first call with removeExtraDummies
/// and allowConnectionToDummies both set to true; the result is kept in a
/// function-local static for later calls.
std::vector<AbbreviationDefinition> getDefaultLinkers() {
  static const std::vector<AbbreviationDefinition> cachedLinkers =
      parseAbbreviations(data::defaultLinkers, /*removeExtraDummies=*/true,
                         /*allowConnectionToDummies=*/true);
  return cachedLinkers;
}
}  // namespace Utils

}  // namespace Abbreviations
}  // namespace RDKit