File: MolProcessing.cpp

package info (click to toggle)
rdkit 202503.1-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 220,160 kB
  • sloc: cpp: 399,240; python: 77,453; ansic: 25,517; java: 8,173; javascript: 4,005; sql: 2,389; yacc: 1,565; lex: 1,263; cs: 1,081; makefile: 580; xml: 229; fortran: 183; sh: 105
file content (146 lines) | stat: -rw-r--r-- 4,723 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
//
//  Copyright (C) 2024 Greg Landrum and other RDKit contributors
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//

#include "MolProcessing.h"

namespace RDKit {
namespace MolProcessing {

namespace details {
// Supplier options object defined at namespace scope; getSupplier() below
// adjusts it once in thread-safe builds.
GeneralMolSupplier::SupplierOptions defaultSupplierOptions;

// Creates the supplier used to read `fileName`, forwarding `options` to
// GeneralMolSupplier::getSupplier().
inline std::unique_ptr<FileParsers::MolSupplier> getSupplier(
    const std::string &fileName,
    const GeneralMolSupplier::SupplierOptions &options) {
#ifdef RDK_BUILD_THREADSAFE_SSS
  // The first call (and only the first call) switches the default options to
  // four writer threads. This happens before the supplier is created.
  static std::once_flag defaultsInitialized;
  const auto useFourWriterThreads = []() {
    defaultSupplierOptions.numWriterThreads = 4;
  };
  std::call_once(defaultsInitialized, useFourWriterThreads);
#endif
  return GeneralMolSupplier::getSupplier(fileName, options);
}
}  // namespace details

namespace {
#ifdef RDK_BUILD_THREADSAFE_SSS
// Returns the single mutex shared by every caller of this function.
inline std::mutex &get_fp_mutex() {
  // function-local static: constructed the first time we get here
  static std::mutex fpMutex;
  return fpMutex;
}

//! \brief Applies \c func to the molecules of a multithreaded supplier
/*!
   The work is done in the supplier's write callback; the results are
   collected by record id and returned as a vector.

   \param suppl the multithreaded supplier to read from, must not be null
   \param func  called with each molecule handed to the write callback. The
                object it returns is owned by the returned vector.

   \return a vector where the result for record id N is stored at index N-1.
           Entries for record ids that never reached the callback are null.
*/
template <typename T>
std::vector<std::unique_ptr<T>> mtWorker(
    v2::FileParsers::MultithreadedMolSupplier *suppl,
    std::function<T *(RWMol &)> func) {
  PRECONDITION(suppl, "no supplier");
  std::map<unsigned int, std::unique_ptr<T>> accum;

  auto workerfunc = [&](RWMol &mol, const std::string &,
                        unsigned int recordId) {
    // take ownership immediately so that the result cannot leak if locking
    // the mutex or inserting into the map throws
    std::unique_ptr<T> item(func(mol));
    {
      // accum is shared between callback invocations, so guard the update
      std::lock_guard<std::mutex> lock(get_fp_mutex());
      accum[recordId] = std::move(item);
    }
  };
  suppl->setWriteCallback(workerfunc);
  // loop over the supplier to make sure we read everything; the molecules
  // themselves are not needed here since the callback does the work
  while (!suppl->atEnd()) {
    auto mol = suppl->next();
  }
  // convert the map to a vector and get the results in the input order.
  // std::map is ordered by key, so the last entry has the largest record id
  const auto maxv = accum.empty() ? 0u : accum.rbegin()->first;
  std::vector<std::unique_ptr<T>> results(maxv);
  for (auto &pr : accum) {
    // NOTE(review): the -1 assumes record ids start at 1 - confirm against
    // the supplier
    results[pr.first - 1] = std::move(pr.second);
  }
  return results;
}
#endif

//! \brief Applies \c func to every molecule read from a supplier
/*!
   \param suppl the supplier to read from, must not be null
   \param func  called with each molecule that was read successfully. The
                object it returns is owned by the returned vector.

   \return the results in the order in which the molecules were read. In the
           sequential path a molecule that could not be read contributes a
           null entry, unless the failure coincides with the end of the
           input.
*/
template <typename T>
std::vector<std::unique_ptr<T>> worker(v2::FileParsers::MolSupplier *suppl,
                                       std::function<T *(RWMol &)> func) {
  PRECONDITION(suppl, "no supplier");
#ifdef RDK_BUILD_THREADSAFE_SSS
  // if we are using a multi-threaded supplier then we can register a write
  // callback to do our processing multi-threaded too
  if (auto tsuppl =
          dynamic_cast<v2::FileParsers::MultithreadedMolSupplier *>(suppl)) {
    return mtWorker(tsuppl, func);
  }
#endif
  // otherwise we just loop through the molecules
  std::vector<std::unique_ptr<T>> results;
  while (!suppl->atEnd()) {
    auto mol = suppl->next();
    if (mol) {
      // wrap the raw pointer before it goes into the vector: if the vector
      // has to grow and that allocation throws, the result is still freed
      results.push_back(std::unique_ptr<T>(func(*mol)));
    } else if (!suppl->atEnd()) {
      // a failed read which is not just the end of the input
      results.emplace_back(nullptr);
    }
  }
  return results;
}
}  // namespace

//! \brief Get fingerprints for all of the molecules in a file
/*!
   \param fileName the name of the file to read
   \param options options controlling how the file is read, if not provided
           four threads will be used when reading the file
   \param generator the fingerprint generator to use, if not provided,
           Morgan fingerprints with radius of 3 will be used.

   \return a vector of fingerprints in the order in which the molecules
           appear in the file; the entry for a molecule which could not be
           processed is a null pointer
*/
template <typename OutputType>
std::vector<std::unique_ptr<ExplicitBitVect>> getFingerprintsForMolsInFile(
    const std::string &fileName,
    const GeneralMolSupplier::SupplierOptions &options,
    FingerprintGenerator<OutputType> *generator) {
  auto suppl = details::getSupplier(fileName, options);

  // keeps the fallback generator alive until we return; stays empty when the
  // caller supplied a generator
  std::unique_ptr<FingerprintGenerator<OutputType>> fallbackGenerator;
  if (!generator) {
    fallbackGenerator.reset(
        MorganFingerprint::getMorganGenerator<OutputType>(3));
    generator = fallbackGenerator.get();
  }

  // adapt the generator to the callable signature that worker() expects
  std::function<ExplicitBitVect *(RWMol &)> fingerprinter =
      [&generator](RWMol &mol) { return generator->getFingerprint(mol); };
  return worker(suppl.get(), fingerprinter);
}

// Explicit instantiations for generators with 32-bit and 64-bit output
// types. The template is defined in this source file, so these are the
// specializations it provides.
template RDKIT_MOLPROCESSING_EXPORT
    std::vector<std::unique_ptr<ExplicitBitVect>>
    getFingerprintsForMolsInFile<std::uint32_t>(
        const std::string &, const GeneralMolSupplier::SupplierOptions &,
        FingerprintGenerator<std::uint32_t> *);
template RDKIT_MOLPROCESSING_EXPORT
    std::vector<std::unique_ptr<ExplicitBitVect>>
    getFingerprintsForMolsInFile<std::uint64_t>(
        const std::string &, const GeneralMolSupplier::SupplierOptions &,
        FingerprintGenerator<std::uint64_t> *);

}  // namespace MolProcessing
}  // namespace RDKit