File: FragCatalogUtils.cpp

package info (click to toggle)
rdkit 201809.1%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 123,688 kB
  • sloc: cpp: 230,509; python: 70,501; java: 6,329; ansic: 5,427; sql: 1,899; yacc: 1,739; lex: 1,243; makefile: 445; xml: 229; fortran: 183; sh: 123; cs: 93
file content (207 lines) | stat: -rw-r--r-- 6,253 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
// $Id$
//
//  Copyright (C) 2003-2006 Rational Discovery LLC
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//

// REVIEW: move this to a GraphMol/FuncGroups directory

#include <RDGeneral/BadFileException.h>
#include "FragCatalogUtils.h"
#include <GraphMol/Subgraphs/Subgraphs.h>
#include <GraphMol/Subgraphs/SubgraphUtils.h>
#include <fstream>
#include <string>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <boost/tokenizer.hpp>
typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
#include <boost/algorithm/string.hpp>

namespace RDKit {
// local utility namespace
namespace {
ROMol *getSmarts(const std::string &tmpStr) {
  ROMol *mol = nullptr;
  if (tmpStr.length() == 0) {
    // empty line
    return mol;
  }
  if (tmpStr.substr(0, 2) == "//") {
    // comment line
    return mol;
  }

  boost::char_separator<char> tabSep("\t");
  tokenizer tokens(tmpStr, tabSep);
  tokenizer::iterator token = tokens.begin();

  // name of the functional groups
  std::string name = *token;
  boost::erase_all(name, " ");
  ++token;

  // grab the smarts:
  std::string smarts = *token;
  boost::erase_all(smarts, " ");
  ++token;

  mol = SmartsToMol(smarts);
  CHECK_INVARIANT(mol, smarts);
  mol->setProp(common_properties::_Name, name);
  mol->setProp(common_properties::_fragSMARTS, smarts);
  return mol;
}
}  // end of local utility namespace

MOL_SPTR_VECT readFuncGroups(std::istream &inStream, int nToRead) {
  MOL_SPTR_VECT funcGroups;
  funcGroups.clear();
  if (inStream.bad()) {
    throw BadFileException("Bad stream contents.");
  }
  const int MAX_LINE_LEN = 512;
  char inLine[MAX_LINE_LEN];
  std::string tmpstr;
  int nRead = 0;
  while (!inStream.eof() && (nToRead < 0 || nRead < nToRead)) {
    inStream.getline(inLine, MAX_LINE_LEN, '\n');
    tmpstr = inLine;
    // parse the molecule on this line (if there is one)
    ROMol *mol = getSmarts(tmpstr);
    if (mol) {
      funcGroups.push_back(ROMOL_SPTR(mol));
      nRead++;
    }
  }
  return funcGroups;
}

MOL_SPTR_VECT readFuncGroups(std::string fileName) {
  std::ifstream inStream(fileName.c_str());
  if ((!inStream) || (inStream.bad())) {
    std::ostringstream errout;
    errout << "Bad input file " << fileName;
    throw BadFileException(errout.str());
  }
  MOL_SPTR_VECT funcGroups;
  funcGroups = readFuncGroups(inStream);
  return funcGroups;
}

MatchVectType findFuncGroupsOnMol(const ROMol &mol, const FragCatParams *params,
                                  INT_VECT &fgBonds) {
  PRECONDITION(params, "bad params");

  fgBonds.clear();

  std::pair<int, int> amat;
  MatchVectType aidFgrps;
  std::vector<MatchVectType> fgpMatches;
  std::vector<MatchVectType>::const_iterator mati;
  MatchVectType::const_iterator mi;
  int aid;
  // const ROMol *fgrp;

  INT_VECT_CI bi;
  aidFgrps.clear();

  int fid = 0;
  const MOL_SPTR_VECT &fgrps = params->getFuncGroups();
  MOL_SPTR_VECT::const_iterator fgci;

  for (fgci = fgrps.begin(); fgci != fgrps.end(); fgci++) {
    const ROMol *fgrp = fgci->get();
    std::string fname;
    (*fgci)->getProp(common_properties::_Name, fname);
    // std::cout << "Groups number: " << fname << "\n";
    //(*fgci)->debugMol(std::cout);
    // mol->debugMol(std::cout);
    // match this functional group onto the molecule
    SubstructMatch(mol, *fgrp, fgpMatches);

    // loop over all the matches we get for this fgroup
    for (mati = fgpMatches.begin(); mati != fgpMatches.end(); mati++) {
      // FIX: we will assume that the first atom in fgrp is always the
      // connection
      // atom
      amat = mati->front();
      aid = amat.second;  // FIX: is this correct - the second entry in the pair
                          // is the atom ID from mol

      // grab the list of atom Ids from mol that match the functional group
      INT_VECT bondIds, maids;
      for (mi = mati->begin(); mi != mati->end(); mi++) {
        maids.push_back(mi->second);
      }

      // create a list of bond IDs from these atom ID
      // these are the bond in mol that are part of portion that matches the
      // functional group
      bondIds = Subgraphs::bondListFromAtomList(mol, maids);

      // now check if all these bonds have been covered as part of larger
      // functional group that was dealt with earlier
      // FIX: obviously we assume here that the function groups in params
      // come in decreasing order of size.
      bool allDone = true;
      for (bi = bondIds.begin(); bi != bondIds.end(); bi++) {
        if (std::find(fgBonds.begin(), fgBonds.end(), (*bi)) == fgBonds.end()) {
          allDone = false;
          fgBonds.push_back(*bi);
        }
      }

      if (!allDone) {
        // this functional group mapping onto mol is not part of a larger func
        // group mapping so record it
        aidFgrps.push_back(std::pair<int, int>(aid, fid));
      }
    }
    fid++;
  }

  return aidFgrps;
}

ROMol *prepareMol(const ROMol &mol, const FragCatParams *fparams,
                  MatchVectType &aToFmap) {
  PRECONDITION(fparams, "");

  // get a mapping of the functional groups onto the molecule
  INT_VECT fgBonds;
  MatchVectType aidToFid = findFuncGroupsOnMol(mol, fparams, fgBonds);

  // get the core piece of molecule (i.e. the part of the molecule
  // without the functional groups). This basically the part of the molecule
  // that does not contain the function group bonds given by "fgBonds"
  INT_VECT cBonds;
  int bid, nbds = mol.getNumBonds();

  for (bid = 0; bid < nbds; bid++) {
    if (std::find(fgBonds.begin(), fgBonds.end(), bid) == fgBonds.end()) {
      cBonds.push_back(bid);
    }
  }

  INT_MAP_INT
  aIdxMap;  // a map from atom id in mol to the new atoms id in coreMol

  ROMol *coreMol = Subgraphs::pathToSubmol(mol, cBonds, false, aIdxMap);

  // now map the functional groups on mol to coreMol using aIdxMap
  MatchVectType::iterator mati;

  int newID;
  for (mati = aidToFid.begin(); mati != aidToFid.end(); mati++) {
    newID = aIdxMap[mati->first];
    aToFmap.push_back(std::pair<int, int>(newID, mati->second));
  }

  return coreMol;
}
}