File: MolSupplier.v1API.h

package info (click to toggle)
rdkit 202503.1-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 220,160 kB
  • sloc: cpp: 399,240; python: 77,453; ansic: 25,517; java: 8,173; javascript: 4,005; sql: 2,389; yacc: 1,565; lex: 1,263; cs: 1,081; makefile: 580; xml: 229; fortran: 183; sh: 105
file content (476 lines) | stat: -rw-r--r-- 18,434 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
//
//  Copyright (C) 2024 greg landrum and other RDKit contributors
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//
#ifndef RD_MOLSUPPLIER_v1_H
#define RD_MOLSUPPLIER_v1_H

namespace RDKit {
inline namespace v1 {
/*!
//
//  Here are a couple of ways one can interact with MolSuppliers:
//
//  1) Lazy (ForwardIterator):
//     while(!supplier.atEnd()){
//       ROMol *mol = supplier.next();
//       if(mol){
//           do something;
//           delete mol;
//       }
//     }
//  2) Random Access:
//     for(unsigned int i=0;i<supplier.length();i++){
//       ROMol *mol = supplier[i];
//       if(mol){
//           do something;
//           delete mol;
//       }
//     }
//
//  In both cases the supplier releases the returned pointer, so the caller
//  is responsible for deleting the molecule.
//
*/
class RDKIT_FILEPARSERS_EXPORT MolSupplier {
  // this is an abstract base class to supply molecules one at a time
 public:
  MolSupplier() {}
  virtual ~MolSupplier() {}
  void init() {
    if (dp_supplier) {
      dp_supplier->init();
    }
  }
  void reset() {
    if (dp_supplier) {
      dp_supplier->reset();
    }
  }

  bool atEnd() {
    if (dp_supplier) {
      return dp_supplier->atEnd();
    }
    return true;
  }
  ROMol *next() {
    PRECONDITION(dp_supplier, "no supplier");
    return dp_supplier->next().release();
  }

  virtual void close() {
    if (dp_supplier) {
      dp_supplier->close();
    }
  }

 private:
  // disable automatic copy constructors and assignment operators
  // for this class and its subclasses.  They will likely be
  // carrying around stream pointers and copying those is a recipe
  // for disaster.
  MolSupplier(const MolSupplier &);
  MolSupplier &operator=(const MolSupplier &);

 protected:
  std::unique_ptr<v2::FileParsers::MolSupplier> dp_supplier;
};

//! \brief a supplier from an SD file that only reads forward
class RDKIT_FILEPARSERS_EXPORT ForwardSDMolSupplier : public MolSupplier {
  // v1 wrapper around v2::FileParsers::ForwardSDMolSupplier.
  // The boolean constructor arguments are packed into a MolFileParserParams
  // object and handed to the v2 supplier, which is stored in dp_supplier.
 public:
  using ContainedType = v2::FileParsers::ForwardSDMolSupplier;

  //! default construction does not create a wrapped v2 supplier
  ForwardSDMolSupplier() {}

  /*!
   *   \param inStream - the stream to read from
   *   \param takeOwnership - forwarded unchanged to the v2 supplier
   *   \param sanitize - copied to MolFileParserParams::sanitize
   *   \param removeHs - copied to MolFileParserParams::removeHs
   *   \param strictParsing - copied to MolFileParserParams::strictParsing
   */
  explicit ForwardSDMolSupplier(std::istream *inStream,
                                bool takeOwnership = true, bool sanitize = true,
                                bool removeHs = true,
                                bool strictParsing = false) {
    v2::FileParsers::MolFileParserParams parserParams;
    parserParams.strictParsing = strictParsing;
    parserParams.removeHs = removeHs;
    parserParams.sanitize = sanitize;
    dp_supplier.reset(new ContainedType(inStream, takeOwnership, parserParams));
  }

  ~ForwardSDMolSupplier() override {}

  //! forwards \c val to the wrapped supplier; requires a wrapped supplier
  void setProcessPropertyLists(bool val) {
    PRECONDITION(dp_supplier, "no supplier");
    auto *impl = static_cast<ContainedType *>(dp_supplier.get());
    impl->setProcessPropertyLists(val);
  }
  //! \return the wrapped supplier's setting, or false without a supplier
  bool getProcessPropertyLists() const {
    return dp_supplier ? static_cast<ContainedType *>(dp_supplier.get())
                             ->getProcessPropertyLists()
                       : false;
  }

  //! \return the wrapped supplier's EOF flag, or false without a supplier
  bool getEOFHitOnRead() const {
    return dp_supplier
               ? static_cast<ContainedType *>(dp_supplier.get())
                     ->getEOFHitOnRead()
               : false;
  }
};

//! \brief a lazy supplier from an SD file
class RDKIT_FILEPARSERS_EXPORT SDMolSupplier : public ForwardSDMolSupplier {
  /*************************************************************************
   * A lazy mol supplier from a SD file.
   *  - When new molecules are read using "next" their positions in the file
   *    are noted.
   *  - A call to the "length" will automatically parse the entire file and
   *    cache all the mol block positions
   *  - [] operator is used to access a molecule at "idx", calling next
   *    following this will result in the next molecule after "idx"
   *
   * This is the v1 wrapper: every call is forwarded to the
   * v2::FileParsers::SDMolSupplier stored in dp_supplier.
   *************************************************************************/

 public:
  using ContainedType = v2::FileParsers::SDMolSupplier;

  //! wraps a default-constructed v2 supplier
  SDMolSupplier() { dp_supplier.reset(new ContainedType()); }

  /*!
   *   \param fileName - the name of the SD file
   *   \param sanitize - if true sanitize the molecule before returning it
   *   \param removeHs - if true remove Hs from the molecule before returning it
   *                     (triggers sanitization)
   *   \param strictParsing - if set to false, the parser is more lax about
   *                          correctness of the contents.
   */
  explicit SDMolSupplier(const std::string &fileName, bool sanitize = true,
                         bool removeHs = true, bool strictParsing = true) {
    v2::FileParsers::MolFileParserParams params;
    params.sanitize = sanitize;
    params.removeHs = removeHs;
    params.strictParsing = strictParsing;
    dp_supplier.reset(new v2::FileParsers::SDMolSupplier(fileName, params));
  }

  /*!
   *   \param inStream - the stream to read from
   *   \param takeOwnership - forwarded unchanged to the v2 supplier
   *   \param sanitize - if true sanitize the molecule before returning it
   *   \param removeHs - if true remove Hs from the molecule before returning it
   *   \param strictParsing - if set to false, the parser is more lax about
   *                          correctness of the contents.
   */
  explicit SDMolSupplier(std::istream *inStream, bool takeOwnership = true,
                         bool sanitize = true, bool removeHs = true,
                         bool strictParsing = true) {
    v2::FileParsers::MolFileParserParams params;
    params.sanitize = sanitize;
    params.removeHs = removeHs;
    params.strictParsing = strictParsing;
    dp_supplier.reset(
        new v2::FileParsers::SDMolSupplier(inStream, takeOwnership, params));
  }

  //! forwards to moveTo() of the wrapped supplier
  void moveTo(unsigned int idx) {
    PRECONDITION(dp_supplier, "no supplier");
    static_cast<ContainedType *>(dp_supplier.get())->moveTo(idx);
  }
  //! \return the molecule at \c idx, released into a raw pointer: the caller
  //!         is responsible for deleting it
  ROMol *operator[](unsigned int idx) {
    PRECONDITION(dp_supplier, "no supplier");
    return static_cast<ContainedType *>(dp_supplier.get())
        ->operator[](idx)
        .release();
  }
  /*! \brief returns the text block for a particular item
   *
   *  \param idx - which item to return
   */
  std::string getItemText(unsigned int idx) {
    PRECONDITION(dp_supplier, "no supplier");
    return static_cast<ContainedType *>(dp_supplier.get())->getItemText(idx);
  }
  //! \return length() of the wrapped supplier
  unsigned int length() {
    PRECONDITION(dp_supplier, "no supplier");
    return static_cast<ContainedType *>(dp_supplier.get())->length();
  }
  /*! \brief hands \c text to the wrapped supplier as its data
   *
   *  strictParsing is left at the default of MolFileParserParams.
   */
  void setData(const std::string &text, bool sanitize = true,
               bool removeHs = true) {
    PRECONDITION(dp_supplier, "no supplier");
    v2::FileParsers::MolFileParserParams params;
    params.sanitize = sanitize;
    params.removeHs = removeHs;
    static_cast<ContainedType *>(dp_supplier.get())->setData(text, params);
  }
  /*! \brief hands \c text to the wrapped supplier as its data, with
   *         strictParsing set explicitly
   */
  void setData(const std::string &text, bool sanitize, bool removeHs,
               bool strictParsing) {
    // same guard as every other forwarding method: without it a missing
    // supplier would be dereferenced below
    PRECONDITION(dp_supplier, "no supplier");
    v2::FileParsers::MolFileParserParams params;
    params.sanitize = sanitize;
    params.removeHs = removeHs;
    params.strictParsing = strictParsing;
    static_cast<ContainedType *>(dp_supplier.get())->setData(text, params);
  }
  /*! Resets our internal state and sets the indices of molecules in the stream.
   *  The client should be *very* careful about calling this method, as it's
   *  trivial to end up with a completely useless supplier.
   *
   *   \param locs - the vector of stream positions.
   *
   *  Note that this can be used not only to make reading selected molecules
   *  from a large SD file much faster, but it can also allow subsetting an SD
   *  file or rearranging the order of the molecules.
   */
  void setStreamIndices(const std::vector<std::streampos> &locs) {
    PRECONDITION(dp_supplier, "no supplier");
    static_cast<ContainedType *>(dp_supplier.get())->setStreamIndices(locs);
  }
};

//! lazy file parser for Smiles tables
class RDKIT_FILEPARSERS_EXPORT SmilesMolSupplier : public MolSupplier {
  /**************************************************************************
   * Lazy file parser for Smiles table file, similar to the lazy SD
   * file parser above
   *  - As and when new molecules are read using "next" their
   *    positions in the file are noted.
   *  - A call to the "length" will automatically parse the entire
   *    file and cache all the mol block positions
   *  - [] operator is used to access a molecule at "idx", calling
   *    next following this will result in the next molecule after
   *    "idx"
   *
   * This is the v1 wrapper: every call is forwarded to the
   * v2::FileParsers::SmilesMolSupplier stored in dp_supplier.
   ***************************************************************************/
 public:
  using ContainedType = v2::FileParsers::SmilesMolSupplier;
  /*!
   *   \param fileName - the name of smiles table file
   *   \param delimiter - delimiting characters between records on a each
   *     line NOTE that this is not a string, the tokenizer looks for
   *     the individual characters in delimiter, not the full string
   *     itself.  So the default delimiter: " \t", means " " or "\t".
   *   \param smilesColumn - column number for the SMILES string (defaults
   *     to the first column)
   *   \param nameColumn - column number for the molecule name (defaults to
   *     the second column) If set to -1 we assume that no name is
   *     available for the molecule and the name is defaulted to the
   *     smiles string
   *   \param titleLine - if true, the first line is assumed to list the
   *     names of properties in order separated by 'delimiter'. It is
   *     also assume that the 'SMILES' column and the 'name' column
   *     are not specified here if false - no title line is assumed
   *     and the properties are recorded as the "columnX" where "X" is
   *     the column number
   *   \param sanitize - if true sanitize the molecule before returning it
   */
  explicit SmilesMolSupplier(const std::string &fileName,
                             const std::string &delimiter = " \t",
                             int smilesColumn = 0, int nameColumn = 1,
                             bool titleLine = true, bool sanitize = true) {
    v2::FileParsers::SmilesMolSupplierParams params;
    params.delimiter = delimiter;
    params.smilesColumn = smilesColumn;
    params.nameColumn = nameColumn;
    params.titleLine = titleLine;
    params.parseParameters.sanitize = sanitize;
    dp_supplier.reset(new v2::FileParsers::SmilesMolSupplier(fileName, params));
  }
  /*!
   *   \param inStream - the stream to read from
   *   \param takeOwnership - forwarded unchanged to the v2 supplier
   *
   *   The remaining parameters have the same meaning as in the file-name
   *   constructor.
   */
  explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership = true,
                             const std::string &delimiter = " \t",
                             int smilesColumn = 0, int nameColumn = 1,
                             bool titleLine = true, bool sanitize = true) {
    v2::FileParsers::SmilesMolSupplierParams params;
    params.delimiter = delimiter;
    params.smilesColumn = smilesColumn;
    params.nameColumn = nameColumn;
    params.titleLine = titleLine;
    params.parseParameters.sanitize = sanitize;
    dp_supplier.reset(new v2::FileParsers::SmilesMolSupplier(
        inStream, takeOwnership, params));
  }
  //! wraps a default-constructed v2 supplier
  SmilesMolSupplier() { dp_supplier.reset(new ContainedType()); }

  /*! \brief hands \c text to the wrapped supplier as its data
   *
   *  NOTE: the default delimiter here is a single space (" "), unlike the
   *  constructors, whose default is " \t".
   */
  void setData(const std::string &text, const std::string &delimiter = " ",
               int smilesColumn = 0, int nameColumn = 1, bool titleLine = true,
               bool sanitize = true) {
    PRECONDITION(dp_supplier, "no supplier");
    v2::FileParsers::SmilesMolSupplierParams params;
    params.delimiter = delimiter;
    params.smilesColumn = smilesColumn;
    params.nameColumn = nameColumn;
    params.titleLine = titleLine;
    params.parseParameters.sanitize = sanitize;
    static_cast<ContainedType *>(dp_supplier.get())->setData(text, params);
  }
  //! forwards to moveTo() of the wrapped supplier
  void moveTo(unsigned int idx) {
    PRECONDITION(dp_supplier, "no supplier");
    static_cast<ContainedType *>(dp_supplier.get())->moveTo(idx);
  }
  //! \return the molecule at \c idx, released into a raw pointer: the caller
  //!         is responsible for deleting it
  ROMol *operator[](unsigned int idx) {
    PRECONDITION(dp_supplier, "no supplier");
    return static_cast<ContainedType *>(dp_supplier.get())
        ->operator[](idx)
        .release();
  }
  /*! \brief returns the text block for a particular item
   *
   *  \param idx - which item to return
   */
  std::string getItemText(unsigned int idx) {
    PRECONDITION(dp_supplier, "no supplier");
    return static_cast<ContainedType *>(dp_supplier.get())->getItemText(idx);
  }
  //! \return length() of the wrapped supplier
  unsigned int length() {
    // terminated with ';' like every other use, so the statement does not
    // depend on how the PRECONDITION macro happens to expand
    PRECONDITION(dp_supplier, "no supplier");
    return static_cast<ContainedType *>(dp_supplier.get())->length();
  }
};

//! lazy file parser for TDT files
class RDKIT_FILEPARSERS_EXPORT TDTMolSupplier : public MolSupplier {
  // v1 wrapper around v2::FileParsers::TDTMolSupplier.
  // Constructor and setData() arguments are packed into a
  // TDTMolSupplierParams object; all other calls are forwarded to the v2
  // supplier stored in dp_supplier after checking that it exists.
 public:
  using ContainedType = v2::FileParsers::TDTMolSupplier;

  /*!
   *   \param fileName - the name of the TDT file
   *   \param nameRecord - property name for the molecule name.
   *     If empty (the default), the name defaults to be empty
   *   \param confId2D - if >=0 and 2D coordinates are provided, the 2D
   *     structure (depiction) in the input will be read into the
   *     corresponding conformer id.
   *   \param confId3D - if >=0 and 3D coordinates are provided, the 3D
   *     structure (depiction) in the input will be read into the
   *     corresponding conformer id.
   *   \param sanitize - if true sanitize the molecule before returning it
   */
  explicit TDTMolSupplier(const std::string &fileName,
                          const std::string &nameRecord = "", int confId2D = -1,
                          int confId3D = 0, bool sanitize = true) {
    v2::FileParsers::TDTMolSupplierParams tdtParams;
    tdtParams.parseParameters.sanitize = sanitize;
    tdtParams.confId3D = confId3D;
    tdtParams.confId2D = confId2D;
    tdtParams.nameRecord = nameRecord;
    dp_supplier.reset(new ContainedType(fileName, tdtParams));
  }

  /*!
   *   \param inStream - the stream to read from
   *   \param takeOwnership - forwarded unchanged to the v2 supplier
   *
   *   The remaining parameters have the same meaning as in the file-name
   *   constructor.
   */
  explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership = true,
                          const std::string &nameRecord = "", int confId2D = -1,
                          int confId3D = 0, bool sanitize = true) {
    v2::FileParsers::TDTMolSupplierParams tdtParams;
    tdtParams.parseParameters.sanitize = sanitize;
    tdtParams.confId3D = confId3D;
    tdtParams.confId2D = confId2D;
    tdtParams.nameRecord = nameRecord;
    dp_supplier.reset(new ContainedType(inStream, takeOwnership, tdtParams));
  }

  //! wraps a default-constructed v2 supplier
  TDTMolSupplier() { dp_supplier.reset(new ContainedType()); }

  //! hands \c text to the wrapped supplier as its data
  void setData(const std::string &text, const std::string &nameRecord = "",
               int confId2D = -1, int confId3D = 0, bool sanitize = true) {
    PRECONDITION(dp_supplier, "no supplier");
    v2::FileParsers::TDTMolSupplierParams tdtParams;
    tdtParams.parseParameters.sanitize = sanitize;
    tdtParams.confId3D = confId3D;
    tdtParams.confId2D = confId2D;
    tdtParams.nameRecord = nameRecord;
    auto *impl = static_cast<ContainedType *>(dp_supplier.get());
    impl->setData(text, tdtParams);
  }

  //! forwards to moveTo() of the wrapped supplier
  void moveTo(unsigned int idx) {
    PRECONDITION(dp_supplier, "no supplier");
    auto *impl = static_cast<ContainedType *>(dp_supplier.get());
    impl->moveTo(idx);
  }

  //! \return the molecule at \c idx, released into a raw pointer: the caller
  //!         is responsible for deleting it
  ROMol *operator[](unsigned int idx) {
    PRECONDITION(dp_supplier, "no supplier");
    auto *impl = static_cast<ContainedType *>(dp_supplier.get());
    return impl->operator[](idx).release();
  }

  /*! \brief returns the text block for a particular item
   *
   *  \param idx - which item to return
   */
  std::string getItemText(unsigned int idx) {
    PRECONDITION(dp_supplier, "no supplier");
    auto *impl = static_cast<ContainedType *>(dp_supplier.get());
    return impl->getItemText(idx);
  }

  //! \return length() of the wrapped supplier
  unsigned int length() {
    PRECONDITION(dp_supplier, "no supplier");
    auto *impl = static_cast<ContainedType *>(dp_supplier.get());
    return impl->length();
  }
};

#ifdef RDK_BUILD_MAEPARSER_SUPPORT
//! lazy file parser for MAE files
class RDKIT_FILEPARSERS_EXPORT MaeMolSupplier : public MolSupplier {
  /**
   * v1 wrapper around v2::FileParsers::MaeMolSupplier.
   *
   * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier
   * always requires taking ownership of the istream ptr, as the shared ptr will
   * always clear it upon destruction.
   */

 public:
  using ContainedType = v2::FileParsers::MaeMolSupplier;

  //! wraps a default-constructed v2 supplier
  MaeMolSupplier() { dp_supplier.reset(new ContainedType()); }

  //! reads from a shared stream; \c sanitize and \c removeHs are copied into
  //! the MaeMolSupplierParams given to the v2 supplier
  explicit MaeMolSupplier(std::shared_ptr<std::istream> inStream,
                          bool sanitize = true, bool removeHs = true) {
    v2::FileParsers::MaeMolSupplierParams maeParams;
    maeParams.removeHs = removeHs;
    maeParams.sanitize = sanitize;
    dp_supplier.reset(new ContainedType(inStream, maeParams));
  }

  //! reads from a raw stream pointer; \c takeOwnership is forwarded unchanged
  //! to the v2 supplier
  explicit MaeMolSupplier(std::istream *inStream, bool takeOwnership = true,
                          bool sanitize = true, bool removeHs = true) {
    v2::FileParsers::MaeMolSupplierParams maeParams;
    maeParams.removeHs = removeHs;
    maeParams.sanitize = sanitize;
    dp_supplier.reset(new ContainedType(inStream, takeOwnership, maeParams));
  }

  //! reads from the file named \c fname
  explicit MaeMolSupplier(const std::string &fname, bool sanitize = true,
                          bool removeHs = true) {
    v2::FileParsers::MaeMolSupplierParams maeParams;
    maeParams.removeHs = removeHs;
    maeParams.sanitize = sanitize;
    dp_supplier.reset(new ContainedType(fname, maeParams));
  }

  //! forwards to moveTo() of the wrapped supplier
  void moveTo(unsigned int idx) {
    PRECONDITION(dp_supplier, "no supplier");
    auto *impl = static_cast<ContainedType *>(dp_supplier.get());
    impl->moveTo(idx);
  }

  //! \return the molecule at \c idx, released into a raw pointer: the caller
  //!         is responsible for deleting it
  RWMol *operator[](unsigned int idx) {
    PRECONDITION(dp_supplier, "no supplier");
    auto *impl = static_cast<ContainedType *>(dp_supplier.get());
    return impl->operator[](idx).release();
  }

  //! \return length() of the wrapped supplier
  unsigned int length() {
    PRECONDITION(dp_supplier, "no supplier");
    auto *impl = static_cast<ContainedType *>(dp_supplier.get());
    return impl->length();
  }

  //! hands \c text to the wrapped supplier as its data
  void setData(const std::string &text, bool sanitize = true,
               bool removeHs = true) {
    PRECONDITION(dp_supplier, "no supplier");
    v2::FileParsers::MaeMolSupplierParams maeParams;
    maeParams.removeHs = removeHs;
    maeParams.sanitize = sanitize;
    auto *impl = static_cast<ContainedType *>(dp_supplier.get());
    impl->setData(text, maeParams);
  }
};
#endif  // RDK_BUILD_MAEPARSER_SUPPORT

}  // namespace v1
}  // namespace RDKit

#endif