1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228
|
//
// Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include "Abbreviations.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/RDKitQueries.h>
#include <boost/tokenizer.hpp>
using tokenizer = boost::tokenizer<boost::char_separator<char>>;
namespace RDKit {
namespace Abbreviations {
namespace common_properties {
const std::string numDummies = "_numDummies";
const std::string origAtomMapping = "_origAtomMapping";
const std::string origBondMapping = "_origBondMapping";
} // namespace common_properties
namespace Utils {
namespace data {
/*
Translations of superatom labels to SMILES.
First atom of SMILES string should be the one connected to the rest of
the molecule.
ADAPTED FROM: https://github.com/openbabel/superatoms/blob/master/superatom.txt
Originally from http://cactus.nci.nih.gov/osra/
The left-aligned form is the one recognized in MDL alias lines;
the right-aligned form may be used in 2D depiction.
label smiles display_label display_label_w
*/
const std::string defaultAbbreviations =
R"ABBREVS(CO2Et C(=O)OCC CO<sub>2</sub>Et EtO<sub>2</sub>C
COOEt C(=O)OCC CO<sub>2</sub>Et EtO<sub>2</sub>C
OiBu OCC(C)C OiBu iBuO
nDec CCCCCCCCCC nDec
nNon CCCCCCCCC nNon
nOct CCCCCCCC nOct
nHept CCCCCCC nHept
nHex CCCCCC nHex
nPent CCCCC nPent
iPent C(C)CCC iPent
tBu C(C)(C)C tBu
iBu C(C)CC iBu
nBu CCCC nBu
iPr C(C)C iPr
nPr CCC nPr
Et CC Et
NCF3 NC(F)(F)F NCF<sub>3</sub> F<sub>3</sub>CN
CF3 C(F)(F)F CF<sub>3</sub> F<sub>3</sub>C
CCl3 C(Cl)(Cl)Cl CCl<sub>3</sub> Cl<sub>3</sub>C
CN C#N CN NC
NC [N+]#[C-] NC CN
N(OH)CH3 N([OH])C N(OH)CH<sub>3</sub> CH<sub>3</sub>(OH)N
NO2 [N+](=O)[O-] NO<sub>2</sub> O<sub>2</sub>N
NO N=O NO ON
SO3H S(=O)(=O)[OH] SO<sub>3</sub>H HO<sub>3</sub>S
CO2H C(=O)[OH] CO<sub>2</sub>H HO<sub>2</sub>C
COOH C(=O)[OH] COOH HOOC
OEt OCC OEt EtO
OAc OC(=O)C OAc AcO
NHAc NC(=O)C NHAc AcNH
Ac C(=O)C Ac
CHO C=O CHO OHC
NMe NC NMe MeN
SMe SC SMe MeS
OMe OC OMe MeO
CO2- C(=O)[O-] COO<sup>-</sup> <sup>-</sup>OOC
COO- C(=O)[O-] COO<sup>-</sup> <sup>-</sup>OOC)ABBREVS";
/*
Translations of linker superatom labels to SMILES.
First atom of SMILES string should be a dummy connected to the rest of
the molecule. The other linker dummy/dummies show the other attachments
*/
const std::string defaultLinkers =
R"ABBREVS(PEG6 *OCCOCCOCCOCCOCCOCC* PEG6
PEG5 *OCCOCCOCCOCCOCC* PEG5
PEG4 *OCCOCCOCCOCC* PEG4
PEG3 *OCCOCCOCC* PEG3
Dec *CCCCCCCCCC*
Non *CCCCCCCCC*
Oct *CCCCCCCC*
Hept *CCCCCCC*)ABBREVS";
// other possible abbreviations that might be useful:
/*
PEG6 *OCCOCCOCCOCCOCC* PEG6
PEG5 *OCCOCCOCCOCCOCC* PEG5
PEG4 *OCCOCCOCCOCC* PEG4
PEG3 *OCCOCCOCC* PEG3
Dec *CCCCCCCCCC*
Non *CCCCCCCCC*
Oct *CCCCCCCC*
Hept *CCCCCCC*
Hex *CCCCCC*
Pent *CCCCC*
Cy *C1CCC(*)CC1 Cy
ala *N[C@@H](C)C(=O)* ala
arg *N[C@@H](CCCNC(N)=[NH])C(=O)* arg
asn *N[C@@H](CC(N)=O)C(=O)* asn
asp *N[C@@H](CC(O)=O)C(=O)* asp
cys *N[C@@H](CS)C(=O)* cys
gln *N[C@@H](CCC(N)=O)C(=O)* gln
glu *N[C@@H](CCC(O)=O)C(=O)* glu
gly *NCC(=O)* gly
his *N[C@@H](Cc1c[nH]cn1)C(=O)* his
ile *N[C@@H](C(C)CC)C(=O)* ile
leu *N[C@@H](CC(C)C)C(=O)* leu
lys *N[C@@H](CCCCN)C(=O)* lys
met *N[C@@H](CCSC)C(=O)* met
phe *N[C@@H](Cc1ccccc1)C(=O)* phe
pro *N1[C@@H](CCC1)C(=O)* pro
ser *N[C@@H](CO)C(=O)* ser
thr *N[C@@H](C(O)C)C(=O)* thr
trp *N[C@@H](Cc1c[nH]c2ccccc21)C(=O)* trp
tyr *N[C@@H](Cc1ccc(O)cc1)C(=O)* tyr
val *N[C@@H](C(C)C)C(=O)* val
*/
} // namespace data
namespace detail {
ROMol *createAbbreviationMol(const std::string &txt, bool removeExtraDummies,
bool allowConnectionToDummies) {
std::string smarts;
if (txt[0] != '*') {
smarts = "*" + txt;
} else {
smarts = txt;
}
RWMol *q = SmartsToMol(smarts);
if (!q) {
return q;
}
if (q->getNumAtoms() < 2) {
BOOST_LOG(rdErrorLog) << "abbreviation with <2 atoms ignored" << std::endl;
delete q;
return nullptr;
}
MolOps::AdjustQueryParameters ps;
ps.adjustDegree = true;
ps.adjustDegreeFlags = MolOps::AdjustQueryWhichFlags::ADJUST_IGNOREDUMMIES;
ps.adjustRingCount = true;
ps.adjustRingCountFlags = MolOps::AdjustQueryWhichFlags::ADJUST_IGNOREDUMMIES;
MolOps::adjustQueryProperties(*q, &ps);
if (!allowConnectionToDummies) {
auto qry = makeAtomNumQuery(0);
qry->setNegation(true);
q->getAtomWithIdx(0)->expandQuery(qry);
}
unsigned int nDummies = std::count_if(smarts.begin(), smarts.end(),
[](char c) { return c == '*'; });
if (removeExtraDummies) {
q->beginBatchEdit();
for (const auto at : q->atoms()) {
if (!at->getIdx()) {
// skip the first atom
continue;
}
if (at->hasQuery() && at->getQuery()->getDescription() == "AtomNull") {
q->removeAtom(at->getIdx());
--nDummies;
}
}
q->commitBatchEdit();
}
q->setProp(common_properties::numDummies, nDummies);
return q;
}
} // namespace detail
std::vector<AbbreviationDefinition> parseAbbreviations(
const std::string &text, bool removeExtraDummies,
bool allowConnectionToDummies) {
std::vector<AbbreviationDefinition> res;
boost::char_separator<char> lineSep("\n");
tokenizer lines(text, lineSep);
boost::char_separator<char> fieldSep(" \t");
for (const auto &line : lines) {
AbbreviationDefinition defn;
tokenizer fields(line, fieldSep);
tokenizer::iterator field = fields.begin();
defn.label = *field;
++field;
defn.smarts = *field;
++field;
if (field != fields.end()) {
defn.displayLabel = *field;
++field;
if (field != fields.end()) {
defn.displayLabelW = *field;
}
}
defn.mol.reset(detail::createAbbreviationMol(
defn.smarts, removeExtraDummies, allowConnectionToDummies));
if (defn.mol) {
res.push_back(defn);
}
}
return res;
}
std::vector<AbbreviationDefinition> getDefaultAbbreviations() {
static auto defs = parseAbbreviations(data::defaultAbbreviations);
return defs;
}
std::vector<AbbreviationDefinition> getDefaultLinkers() {
static auto defs = parseAbbreviations(data::defaultLinkers, true, true);
return defs;
}
} // namespace Utils
} // namespace Abbreviations
} // namespace RDKit
|