1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254
|
// $Id$
//
// Copyright (C) 2003-2006 Rational Discovery LLC
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <Catalogs/Catalog.h>
#include "FragCatGenerator.h"
#include "FragCatalogEntry.h"
#include "FragCatParams.h"
#include "FragCatalogUtils.h"
#include <RDGeneral/types.h>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/Subgraphs/SubgraphUtils.h>
#include <GraphMol/Subgraphs/Subgraphs.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/SmilesParse/SmilesParse.h>

#include <memory>
namespace RDKit {
// Registers the order-1 paths of a molecule with the fragment catalog.
//
// For every path in `paths` a candidate catalog entry is built and compared
// (within the catalog's tolerance) against the order-1 entries already in the
// catalog. If an equivalent entry exists its id is reused; otherwise the
// candidate is added to the catalog as a new entry.
//
// Parameters:
//   paths    - the order-1 paths to process
//   mol      - molecule the paths were enumerated on
//   fcat     - catalog to search and extend; must not be null
//   mapkm1   - updated in place: maps the prime-product invariant of each
//              processed path to the id of its catalog entry
//   aidToFid - forwarded unchanged to the FragCatalogEntry constructor
//              (presumably atom-id -> functional-group-id pairs; confirm
//              against prepareMol)
//
// Returns the number of entries newly added to the catalog.
unsigned int addOrder1Paths(PATH_LIST &paths, const ROMol &mol,
                            FragCatalog *fcat, DOUBLE_INT_MAP &mapkm1,
                            const MatchVectType &aidToFid) {
  PRECONDITION(fcat, "");

  const FragCatParams *fparams = fcat->getCatalogParams();
  const unsigned int lLen = fparams->getLowerFragLength();
  const unsigned int uLen = fparams->getUpperFragLength();
  CHECK_INVARIANT(lLen <= uLen, "");
  const double tol = fparams->getTolerance();

  // If order-1 subgraphs are of interest in fingerprinting, a bit is assigned
  // to each new fragment (and the fingerprint length updated) when it is
  // added to the catalog. This does not depend on the path, so decide once.
  const bool updateSigL = (lLen <= 1) && (uLen >= 1);

  unsigned int n01 = 0;
  for (const auto &path : paths) {
    // Held by unique_ptr so the candidate is freed if anything below throws;
    // ownership is handed over only at the addEntry() call.
    std::unique_ptr<FragCatalogEntry> nent(
        new FragCatalogEntry(&mol, path, aidToFid));

    // Search the order-1 entries already present in the catalog. The list is
    // re-fetched for every path because adding an entry may extend it.
    bool found = false;
    int entryId = 0;
    const INT_VECT &o1entries = fcat->getEntriesOfOrder(1);
    for (const auto candidateId : o1entries) {
      const FragCatalogEntry *entry = fcat->getEntryWithIdx(candidateId);
      if (nent->match(entry, tol)) {
        found = true;
        entryId = candidateId;
        break;
      }
    }

    if (!found) {
      // Never seen before: describe it and add it to the catalog.
      nent->setDescription(fparams);
      entryId = fcat->addEntry(nent.release(), updateSigL);
      ++n01;
    }

    // Remember which catalog entry this path corresponds to so that the
    // order-2 pass can locate it through the path invariant.
    mapkm1[computeIntVectPrimesProduct(path)] = entryId;
  }
  return n01;
}
unsigned int addHigherOrderPaths(const INT_PATH_LIST_MAP &allPaths,
const ROMol &mol, FragCatalog *fcat,
DOUBLE_INT_MAP &mapkm1,
const MatchVectType &aidToFid) {
PRECONDITION(fcat, "");
// This works something like this
// - for each path of order k in the mol
// - we find all connected subpaths
// of order (k-1)
// - find the entries in the catalog that correspond to each of these
// order (k-1) paths (using mapkm1 - remember that this maps the
// invariant
// of a path to the entry ID in the catalog graph)
// - Find the intersection of the down entries of these order (k-1)
// - check if order k path we are testing matches any of the order k
// entries in this intersection
// - if we find a match move onto the next order k path
// - if we do not find a match
// - create an entry for the order k path and add it to the catalog
// - also add out edges from each of the entries corresponding to the
// order k-1
// subgraphs to this path.
PATH_LIST paths;
PATH_LIST_CI pi;
bool found;
double invar, sinvar;
int entId;
DOUBLE_INT_MAP mapk;
int mEntId, vid;
const FragCatParams *fparams = fcat->getCatalogParams();
unsigned int lLen = fparams->getLowerFragLength();
unsigned int uLen = fparams->getUpperFragLength();
double tol = fparams->getTolerance();
unsigned int nrem =
0; // counter for number of fragments added to the catalog
const FragCatalogEntry *entry;
INT_PATH_LIST_MAP_CI ordi;
for (ordi = allPaths.begin(); ordi != allPaths.end(); ordi++) {
if (ordi->first < 2) {
continue;
}
mapk.clear();
for (pi = (*ordi).second.begin(); pi != (*ordi).second.end(); pi++) {
found = false;
auto *nent = new FragCatalogEntry(&mol, (*pi), aidToFid);
nent->setDescription(fparams);
unsigned int scnt = 0;
INT_VECT intersect, tmpVect;
INT_VECT_CI iti;
invar = computeIntVectPrimesProduct(*pi);
DOUBLE_VECT sinvarV;
DOUBLE_VECT_CI sci;
// loop over the subpaths (order (k-1) ) (by ignoring one bond
// at a time from consideration) and find out which entries int eh catalog
// they correspond to
// and make an interestion of the down entries (i.e. order k entries that
// contain these order k-1
// entries. - we can baiscally limit our search for an isomorphic entry in
// the
// catalog of the order k path from the molecule to this intersection list
PATH_TYPE::const_iterator pii;
for (pii = pi->begin(); pii != pi->end(); pii++) {
sinvar = invar / firstThousandPrimes[*pii];
// here is a check for "did we see this path before ?"
// this should also take care of disconnected subpaths (since the
// catalog should have only connected subgraphs)
if (mapkm1.find(sinvar) == mapkm1.end()) {
continue;
}
// push this sinvar onto a vector
// we need them to add edges int he catalog graph
sinvarV.push_back(sinvar);
entId = mapkm1[sinvar];
if (scnt == 0) {
intersect = fcat->getDownEntryList(entId);
scnt++;
} else {
tmpVect = intersect;
Intersect(fcat->getDownEntryList(entId), tmpVect, intersect);
scnt++;
}
}
// now search through the intersection list to check if we already have a
// isomorphic
// entry in the catalog
for (iti = intersect.begin(); iti != intersect.end(); iti++) {
entry = fcat->getEntryWithIdx(*iti);
if (nent->match(entry, tol)) {
found = true;
mEntId = (*iti);
delete nent;
break;
}
}
if (found) {
// update the mapk so that the next time we see this path (when
// dealing with order k+1 path we know which entry in the catalog
// to look at
mapk[invar] = mEntId;
} else {
// we have never seen this subgraph before add it to the catalog
unsigned int ordr = nent->getOrder();
bool updateSigL = false;
if ((ordr >= lLen) && (ordr <= uLen)) {
// if this order subgraphs are of interest in fingerprinting
// asign a bit to this fragment when we add to the catalog and update
// fingerprint len
updateSigL = true;
}
vid = fcat->addEntry(nent, updateSigL);
mapk[invar] = vid;
nrem++; // increment the fragment counter
// loop over the entries corresponding to the subpaths and
// add connections to them
for (sci = sinvarV.begin(); sci != sinvarV.end(); sci++) {
entId = mapkm1[*sci];
fcat->addEdge(entId, vid);
}
} // end of never seen this order k subgraph
} // end of loop over order k paths in mol
// overwrite mapkm1 with mapk before we move on to order k+1
mapkm1 = mapk;
} // end of loop over path order
return nrem;
}
unsigned int FragCatGenerator::addFragsFromMol(const ROMol &mol,
FragCatalog *fcat) {
PRECONDITION(fcat, "");
INT_PATH_LIST_MAP allPaths;
allPaths.clear();
DOUBLE_INT_MAP mapkm1;
mapkm1.clear();
const FragCatParams *fparams = fcat->getCatalogParams();
unsigned int lLen = fparams->getLowerFragLength();
unsigned int uLen = fparams->getUpperFragLength();
CHECK_INVARIANT(lLen <= uLen, "");
// prepare the molecule to add to the catalog
// i.e. find functional groups, remove them from the mol etc.
MatchVectType newAidToFid;
INT_VECT fgBonds;
ROMol *coreMol = prepareMol(mol, fparams, newAidToFid);
// mol->debugMol(std::cout);
allPaths = findAllSubgraphsOfLengthsMtoN(*coreMol, 1, uLen);
// deal with order 1 paths
unsigned int nO1Pths =
addOrder1Paths(allPaths[1], *coreMol, fcat, mapkm1, newAidToFid);
// now deal with the higher order paths
unsigned int nremPths =
addHigherOrderPaths(allPaths, *coreMol, fcat, mapkm1, newAidToFid);
delete coreMol;
return (nO1Pths + nremPths);
}
}
|