File: SubstructLibrary.cpp

package info (click to toggle)
rdkit 201809.1%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 123,688 kB
  • sloc: cpp: 230,509; python: 70,501; java: 6,329; ansic: 5,427; sql: 1,899; yacc: 1,739; lex: 1,243; makefile: 445; xml: 229; fortran: 183; sh: 123; cs: 93
file content (268 lines) | stat: -rw-r--r-- 10,763 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
//  Copyright (c) 2017, Novartis Institutes for BioMedical Research Inc.
//  All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Novartis Institutes for BioMedical Research Inc.
//       nor the names of its contributors may be used to endorse or promote
//       products derived from this software without specific prior written
//       permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "SubstructLibrary.h"
#include <RDGeneral/RDThreads.h>
#ifdef RDK_THREADSAFE_SSS
#include <thread>
#include <future>
#endif

#include <algorithm>
#include <atomic>
#include <memory>
#include <vector>
#include <GraphMol/Substruct/SubstructMatch.h>

namespace RDKit {

struct Bits {
  const ExplicitBitVect *queryBits;
  const FPHolderBase *fps;
  bool recursionPossible;
  bool useChirality;
  bool useQueryQueryMatches;

  Bits(const FPHolderBase *fps, const ROMol &m, bool recursionPossible,
       bool useChirality, bool useQueryQueryMatches)
      : fps(fps),
        recursionPossible(recursionPossible),
        useChirality(useChirality),
        useQueryQueryMatches(useQueryQueryMatches) {
    if (fps) {
      queryBits = fps->makeFingerprint(m);
    } else
      queryBits = nullptr;
  }

  bool check(unsigned int idx) const {
    if (fps) {
      return fps->passesFilter(idx, *queryBits);
    }
    return true;
  }
};

// Adds m to the molecule holder and, when the library has a fingerprint
// holder, to that holder as well. Both holders must report the same value
// from their addMol call, otherwise the invariant check fires.
// Returns the value reported by the molecule holder.
unsigned int SubstructLibrary::addMol(const ROMol &m) {
  const unsigned int size = mols->addMol(m);
  if (!fps) {
    // no fingerprints are kept, nothing to keep in sync
    return size;
  }
  const unsigned int fpsize = fps->addMol(m);
  CHECK_INVARIANT(size == fpsize,
                  "#mols different than #fingerprints in SubstructLibrary");
  return size;
}

namespace {

// Search worker: visits the indices start, start + numThreads,
// start + 2 * numThreads, ... below end (end is exclusive here) and appends
// the index of every molecule that contains the query to idxs.
//
//  in_query   - query molecule, a private copy is made for this call
//  bits       - fingerprint pre-screen and the SubstructMatch flags
//  mols       - holder the candidate molecules are fetched from
//  idxs       - receives the indices of the matching molecules
//  start/end  - first index to visit / exclusive upper bound
//  numThreads - stride between two visited indices
//  counter    - hit counter shared between workers, only used when a result
//               limit is active
//  maxResults - stop once counter reaches this value, -1 disables the limit
void SubSearcher(const ROMol &in_query, const Bits &bits,
                 const MolHolderBase &mols, std::vector<unsigned int> &idxs,
                 unsigned int start, unsigned int end, unsigned int numThreads,
                 std::atomic<int> &counter, const int maxResults) {
  ROMol query(in_query);
  MatchVectType matchVect;
  for (unsigned int idx = start;
       idx < end && (maxResults == -1 || counter < maxResults);
       idx += numThreads) {
    if (!bits.check(idx)) continue;
    // need shared_ptr as it (may) control the lifespan of the
    //  returned molecule!
    const boost::shared_ptr<ROMol> &m = mols.getMol(idx);
    const ROMol *mol = m.get();
    // getMol may hand back an empty pointer; such an entry cannot match, so
    // skip it instead of dereferencing a null pointer below
    if (!mol) continue;
    if (SubstructMatch(*mol, query, matchVect, bits.recursionPossible,
                       bits.useChirality, bits.useQueryQueryMatches)) {
      // Updating the counter is squishy: each increment is atomic, but the
      // check and the increment are separate steps, so several workers can
      // push the counter beyond maxResults.
      // This is okay: if we get one or two extra hits, the caller can trim
      // them on the way out.
      if (maxResults != -1 && counter >= maxResults) break;
      idxs.push_back(idx);
      if (maxResults != -1) counter++;
    }
  }
}

// Counting worker: visits the indices start, start + numThreads,
// start + 2 * numThreads, ... below end (end is exclusive here, the loop
// runs while idx < end) and increments counter once for every molecule that
// contains the query.
//
//  in_query   - query molecule, a private copy is made for this call
//  bits       - fingerprint pre-screen and the SubstructMatch flags
//  mols       - holder the candidate molecules are fetched from
//  start/end  - first index to visit / exclusive upper bound
//  numThreads - stride between two visited indices
//  counter    - match counter shared between workers
void SubSearchMatchCounter(const ROMol &in_query, const Bits &bits,
                           const MolHolderBase &mols, unsigned int start,
                           unsigned int end, int numThreads,
                           std::atomic<int> &counter) {
  ROMol query(in_query);
  MatchVectType matchVect;
  for (unsigned int idx = start; idx < end; idx += numThreads) {
    if (!bits.check(idx)) continue;
    // need shared_ptr as it (may) control the lifespan of the
    //  returned molecule!
    const boost::shared_ptr<ROMol> &m = mols.getMol(idx);
    const ROMol *mol = m.get();
    // getMol may hand back an empty pointer; such an entry cannot match, so
    // skip it instead of dereferencing a null pointer below
    if (!mol) continue;
    if (SubstructMatch(*mol, query, matchVect, bits.recursionPossible,
                       bits.useChirality, bits.useQueryQueryMatches)) {
      counter++;
    }
  }
}

std::vector<unsigned int> internalGetMatches(
    const ROMol &query, MolHolderBase &mols, const FPHolderBase *fps,
    unsigned int startIdx, unsigned int endIdx, bool recursionPossible,
    bool useChirality, bool useQueryQueryMatches, int numThreads = -1,
    int maxResults = 1000) {
  PRECONDITION(startIdx < mols.size(), "startIdx out of bounds");
  PRECONDITION(endIdx > startIdx, "endIdx > startIdx");
  if (numThreads == -1)
    numThreads = (int)getNumThreadsToUse(numThreads);
  else
    numThreads = std::min(numThreads, (int)getNumThreadsToUse(numThreads));

  endIdx = std::min(mols.size(), endIdx);
  if (endIdx < static_cast<unsigned int>(numThreads)) numThreads = endIdx;

  std::vector<std::future<void>> thread_group;
  std::atomic<int> counter(0);
  std::vector<std::vector<unsigned int>> internal_results(numThreads);

  Bits bits(fps, query, recursionPossible, useChirality, useQueryQueryMatches);

  for (int thread_group_idx = 0; thread_group_idx < numThreads;
       ++thread_group_idx) {
    // need to use boost::ref otherwise things are passed by value
    thread_group.emplace_back(
        std::async(std::launch::async, SubSearcher, std::ref(query), bits,
                   std::ref(mols), std::ref(internal_results[thread_group_idx]),
                   startIdx + thread_group_idx, endIdx, numThreads,
                   std::ref(counter), maxResults));
  }
  for (auto &fut : thread_group) {
    fut.get();
  }
  delete bits.queryBits;

  std::vector<unsigned int> results;
  for (int thread_group_idx = 0; thread_group_idx < numThreads;
       ++thread_group_idx) {
    results.insert(results.end(), internal_results[thread_group_idx].begin(),
                   internal_results[thread_group_idx].end());
  }

  // this is so we don't really have to do locking on the atomic counter...
  if (maxResults != -1 && rdcast<int>(results.size()) > maxResults)
    results.resize(maxResults);

  return results;
}

int internalMatchCounter(const ROMol &query, MolHolderBase &mols,
                         const FPHolderBase *fps, unsigned int startIdx,
                         unsigned int endIdx, bool recursionPossible,
                         bool useChirality, bool useQueryQueryMatches,
                         int numThreads = -1) {
  PRECONDITION(startIdx < mols.size(), "startIdx out of bounds");
  PRECONDITION(endIdx > startIdx, "endIdx > startIdx");

  endIdx = std::min(mols.size(), endIdx);

  if (numThreads == -1)
    numThreads = (int)getNumThreadsToUse(numThreads);
  else
    numThreads = std::min(numThreads, (int)getNumThreadsToUse(numThreads));

  if (endIdx < static_cast<unsigned int>(numThreads)) numThreads = endIdx;

  std::vector<std::future<void>> thread_group;
  std::atomic<int> counter(0);

  Bits bits(fps, query, recursionPossible, useChirality, useQueryQueryMatches);
  for (int thread_group_idx = 0; thread_group_idx < numThreads;
       ++thread_group_idx) {
    // need to use boost::ref otherwise things are passed by value
    thread_group.emplace_back(
        std::async(std::launch::async, SubSearchMatchCounter, std::ref(query),
                   bits, std::ref(mols), startIdx + thread_group_idx, endIdx,
                   numThreads, std::ref(counter)));
  }
  for (auto &thread : thread_group) {
    thread.get();
  }
  delete bits.queryBits;
  return (int)counter;
}
}

std::vector<unsigned int> SubstructLibrary::getMatches(
    const ROMol &query, bool recursionPossible, bool useChirality,
    bool useQueryQueryMatches, int numThreads, int maxResults) {
  return getMatches(query, 0, mols->size(), recursionPossible, useChirality,
                    useQueryQueryMatches, numThreads, maxResults);
}

// Ranged search: hands the library's molecule holder and fingerprint holder,
// together with all search options, to internalGetMatches and returns the
// matching indices it produces.
std::vector<unsigned int> SubstructLibrary::getMatches(
    const ROMol &query, unsigned int startIdx, unsigned int endIdx,
    bool recursionPossible, bool useChirality, bool useQueryQueryMatches,
    int numThreads, int maxResults) {
  std::vector<unsigned int> matchingIdxs = internalGetMatches(
      query, *mols, fps, startIdx, endIdx, recursionPossible, useChirality,
      useQueryQueryMatches, numThreads, maxResults);
  return matchingIdxs;
}

unsigned int SubstructLibrary::countMatches(const ROMol &query,
                                            bool recursionPossible,
                                            bool useChirality,
                                            bool useQueryQueryMatches,
                                            int numThreads) {
  return countMatches(query, 0, mols->size(), recursionPossible, useChirality,
                      useQueryQueryMatches, numThreads);
}

// Ranged count: hands the library's molecule holder and fingerprint holder,
// together with all search options, to internalMatchCounter and returns the
// number of matches it reports.
unsigned int SubstructLibrary::countMatches(
    const ROMol &query, unsigned int startIdx, unsigned int endIdx,
    bool recursionPossible, bool useChirality, bool useQueryQueryMatches,
    int numThreads) {
  const int matchCount = internalMatchCounter(
      query, *mols, fps, startIdx, endIdx, recursionPossible, useChirality,
      useQueryQueryMatches, numThreads);
  // the helper reports an int, the public interface an unsigned int
  return static_cast<unsigned int>(matchCount);
}

// Returns true when at least one molecule in the library contains query.
// Implemented as a getMatches call limited to a single result.
bool SubstructLibrary::hasMatch(const ROMol &query, bool recursionPossible,
                                bool useChirality, bool useQueryQueryMatches,
                                int numThreads) {
  const int maxResults = 1;
  const std::vector<unsigned int> firstHit =
      getMatches(query, recursionPossible, useChirality, useQueryQueryMatches,
                 numThreads, maxResults);
  return !firstHit.empty();
}

// Returns true when at least one molecule in the index range passed on to
// getMatches (startIdx, endIdx) contains query.
// Implemented as a ranged getMatches call limited to a single result.
bool SubstructLibrary::hasMatch(const ROMol &query, unsigned int startIdx,
                                unsigned int endIdx, bool recursionPossible,
                                bool useChirality, bool useQueryQueryMatches,
                                int numThreads) {
  const int maxResults = 1;
  const std::vector<unsigned int> firstHit =
      getMatches(query, startIdx, endIdx, recursionPossible, useChirality,
                 useQueryQueryMatches, numThreads, maxResults);
  return !firstHit.empty();
}
}