File: Pipeline.h

package info (click to toggle)
rdkit 202503.1-5
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 220,160 kB
  • sloc: cpp: 399,240; python: 77,453; ansic: 25,517; java: 8,173; javascript: 4,005; sql: 2,389; yacc: 1,565; lex: 1,263; cs: 1,081; makefile: 580; xml: 229; fortran: 183; sh: 105
file content (234 lines) | stat: -rw-r--r-- 10,588 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
//
//  Copyright (C) 2023 Novartis Biomedical Research
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//
#ifndef RD_MOLSTANDARDIZE_PIPELINE_H
#define RD_MOLSTANDARDIZE_PIPELINE_H
#include <RDGeneral/export.h>
#include <GraphMol/RWMol.h>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

namespace RDKit {

namespace MolStandardize {

// Settings consumed by the pipeline operations, grouped by the phase they
// apply to. Every field has an in-class default, so a default-constructed
// PipelineOptions is fully initialized.
struct RDKIT_MOLSTANDARDIZE_EXPORT PipelineOptions {
  // ---- parsing ----
  bool strictParsing = false;

  // ---- validation ----
  bool reportAllFailures = true;
  bool allowEmptyMolecules = false;
  bool allowEnhancedStereo = false;
  bool allowAromaticBondType = false;
  bool allowDativeBondType = false;
  double is2DZeroThreshold = 1e-3;
  double atomClashLimit = 0.03;
  double minMedianBondLength = 1e-3;
  double bondLengthLimit = 100.;
  bool allowLongBondsInRings = true;
  bool allowAtomBondClashExemption = true;

  // ---- cleanup/standardization ----
  // Metal disconnector options (metalNon defaults to an empty string).
  std::string metalNof = "[Li,Na,K,Rb,Cs,Fr]~[#7,#8,F]";
  std::string metalNon;
  // Normalizer options. The table below is a single string built from
  // adjacent literals: a header line followed by one "Name<TAB>SMIRKS" record
  // per line.
  std::string normalizerData =
      "// Name\tSMIRKS\n"
      "Nitro to N+(O-)=O\t[N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1]([*-1:2])=[*:3]\n"
      "Sulfone to S(=O)(=O)\t[S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])\n"
      "Pyridine oxide to n+O-\t[nH0+0:1]=[OH0+0:2]>>[n+:1][O-:2]\n"
      "Azide to N=N+=N-\t[*:1][N:2]=[N:3]#[N:4]>>[*:1][N:2]=[N+:3]=[N-:4]\n"
      "Diazo/azo to =N+=N-\t[*:1]=[N:2]#[N:3]>>[*:1]=[N+:2]=[N-:3]\n"
      // Note: the sulfoxide transformation that the default Normalizer
      // configuration includes was removed.
      // Note: the transformation below was ported from STRUCHK and is not
      // part of the default Normalizer configuration.
      "[SH](=O)(=O) to S(=O)O\t[c,C,N,O,F,Cl,Br,I:1][SH+0:2](=[O:3])=[O:4]>>[*:1][*:2]([*:3])=[*:4]\n"
      // Note: the two transformations below replace the default Phosphate
      // normalization, so that the double bond is placed between P and O
      // whenever an O is available.
      "Phosphate to P(O-)=O\t[O-:1][P+;D4:2][O,S,Se,Te;-1:3]>>[O+0:1]=[P+0;D5:2][*-1:3]\n"
      "Generalized phosphate to P(X-)=Y\t[S,Se,Te;-1:1][P+;D4:2][S,Se,Te;-1:3]>>[*+0:1]=[P+0;D5:2][*-1:3]\n"
      "C/S+N to C/S=N+\t[C,S&!$([S+]-[O-]);X3+1:1]([NX3:2])[NX3!H0:3]>>[*+0:1]([N:2])=[N+:3]\n"
      "P+N to P=N+\t[P;X4+1:1]([NX3:2])[NX3!H0:3]>>[*+0:1]([N:2])=[N+:3]\n"
      "Recombine 1,3-separated charges\t[N,P,As,Sb,O,S,Se,Te;-1:1]-[A+0:2]=[N,P,As,Sb,O,S,Se,Te;+1:3]>>[*-0:1]=[*:2]-[*+0:3]\n"
      "Recombine 1,3-separated charges\t[n,o,p,s;-1:1]:[a:2]=[N,O,P,S;+1:3]>>[*-0:1]:[*:2]-[*+0:3]\n"
      "Recombine 1,3-separated charges\t[N,O,P,S;-1:1]-[a+0:2]:[n,o,p,s;+1:3]>>[*-0:1]=[*:2]:[*+0:3]\n"
      "Recombine 1,5-separated charges\t[N,P,As,Sb,O,S,Se,Te;-1:1]-[A+0:2]=[A:3]-[A:4]=[N,P,As,Sb,O,S,Se,Te;+1:5]>>[*-0:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
      "Recombine 1,5-separated charges\t[n,o,p,s;-1:1]:[a:2]:[a:3]:[c:4]=[N,O,P,S;+1:5]>>[*-0:1]:[*:2]:[*:3]:[c:4]-[*+0:5]\n"
      "Recombine 1,5-separated charges\t[N,O,P,S;-1:1]-[c:2]:[a:3]:[a:4]:[n,o,p,s;+1:5]>>[*-0:1]=[c:2]:[*:3]:[*:4]:[*+0:5]\n"
      // Note: four transformations were added to the normalization of
      // aliphatic conjugated cations in order to favor the positioning of new
      // double bonds within rings.
      "Normalize 1,3 conjugated cation\t[N;+0!H0:1]@-[A:2]=[N!$(*~[N,O,P,S;-1]),O;+1H0:3]>>[*+1:1]=[*:2]-[*+0:3]\n"
      "Normalize 1,5 conjugated cation\t[N;+0!H0:1]@-[A:2]=[A:3]@-[A:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
      "Normalize 1,3 conjugated cation\t[N,O!$(*N);+0!H0:1]-[A:2]=[N!$(*~[N,O,P,S;-1]),O;+1H0:3]>>[*+1:1]=[*:2]-[*+0:3]\n"
      "Normalize 1,3 conjugated cation\t[n;+0!H0:1]:[c:2]=[N!$(*~[N,O,P,S;-1]),O;+1H0:3]>>[*+1:1]:[*:2]-[*+0:3]\n"
      "Normalize 1,5 conjugated cation\t[N;+0!H0:1]@-[A:2]=[A:3]-[A:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
      "Normalize 1,5 conjugated cation\t[N,O!$(*N);+0!H0:1]-[A:2]=[A:3]@-[A:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
      "Normalize 1,5 conjugated cation\t[N,O!$(*N);+0!H0:1]-[A:2]=[A:3]-[A:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
      "Normalize 1,5 conjugated cation\t[n;+0!H0:1]:[a:2]:[a:3]:[c:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[n+1:1]:[*:2]:[*:3]:[*:4]-[*+0:5]\n"
      "Charge normalization\t[F,Cl,Br,I,At;-1:1]=[O:2]>>[*-0:1][O-:2]\n"
      "Charge recombination\t[N,P,As,Sb;-1:1]=[C+;v3:2]>>[*+0:1]#[C+0:2]\n";
  unsigned int normalizerMaxRestarts = 200;
  double scaledMedianBondLength = 1.;

  // ---- serialization ----
  bool outputV2000 = false;
};

// Bit-flag status values. Each elementary flag occupies one bit; the
// *_ERROR aggregates and STRUCTURE_MODIFICATION are the bitwise OR of the
// elementary flags they are built from. Error flags use bits 0-12, the
// structure-modification flags use bits 23-26.
enum RDKIT_MOLSTANDARDIZE_EXPORT PipelineStatus {
  NO_EVENT = 0x0,

  // -- error flags --
  INPUT_ERROR = 0x0001,                   // bit 0
  PREPARE_FOR_VALIDATION_ERROR = 0x0002,  // bit 1
  FEATURES_VALIDATION_ERROR = 0x0004,     // bit 2
  BASIC_VALIDATION_ERROR = 0x0008,        // bit 3
  IS2D_VALIDATION_ERROR = 0x0010,         // bit 4
  LAYOUT2D_VALIDATION_ERROR = 0x0020,     // bit 5
  STEREO_VALIDATION_ERROR = 0x0040,       // bit 6
  // mask covering every validation error flag
  VALIDATION_ERROR = FEATURES_VALIDATION_ERROR | BASIC_VALIDATION_ERROR |
                     IS2D_VALIDATION_ERROR | LAYOUT2D_VALIDATION_ERROR |
                     STEREO_VALIDATION_ERROR,
  PREPARE_FOR_STANDARDIZATION_ERROR = 0x0080,  // bit 7
  METAL_STANDARDIZATION_ERROR = 0x0100,        // bit 8
  NORMALIZER_STANDARDIZATION_ERROR = 0x0200,   // bit 9
  FRAGMENT_STANDARDIZATION_ERROR = 0x0400,     // bit 10
  CHARGE_STANDARDIZATION_ERROR = 0x0800,       // bit 11
  // mask covering the metal/normalizer/fragment/charge error flags
  STANDARDIZATION_ERROR =
      METAL_STANDARDIZATION_ERROR | NORMALIZER_STANDARDIZATION_ERROR |
      FRAGMENT_STANDARDIZATION_ERROR | CHARGE_STANDARDIZATION_ERROR,
  OUTPUT_ERROR = 0x1000,  // bit 12
  // mask covering every error flag defined above
  PIPELINE_ERROR = INPUT_ERROR | PREPARE_FOR_VALIDATION_ERROR |
                   VALIDATION_ERROR | PREPARE_FOR_STANDARDIZATION_ERROR |
                   STANDARDIZATION_ERROR | OUTPUT_ERROR,

  // -- structure-modification flags --
  METALS_DISCONNECTED = 0x00800000,    // bit 23
  NORMALIZATION_APPLIED = 0x01000000,  // bit 24
  FRAGMENTS_REMOVED = 0x02000000,      // bit 25
  PROTONATION_CHANGED = 0x04000000,    // bit 26
  // mask covering every structure-modification flag
  STRUCTURE_MODIFICATION = METALS_DISCONNECTED | NORMALIZATION_APPLIED |
                           FRAGMENTS_REMOVED | PROTONATION_CHANGED
};

// Identifiers for the stages of a pipeline run, stored as std::uint32_t.
// The numeric values are written out explicitly; they are the same
// consecutive values (starting at 0) that implicit numbering would assign.
enum class RDKIT_MOLSTANDARDIZE_EXPORT PipelineStage : std::uint32_t {
  NOT_STARTED = 0,
  PARSING_INPUT = 1,
  PREPARE_FOR_VALIDATION = 2,
  VALIDATION = 3,
  PREPARE_FOR_STANDARDIZATION = 4,
  STANDARDIZATION = 5,
  REAPPLY_WEDGING = 6,
  CLEANUP_2D = 7,
  MAKE_PARENT = 8,
  SERIALIZING_OUTPUT = 9,
  COMPLETED = 10
};

// One record of the pipeline log: a status value paired with a free-text
// detail string.
struct RDKIT_MOLSTANDARDIZE_EXPORT PipelineLogEntry {
  // Defaults to NO_EVENT so that a default-constructed entry never carries an
  // indeterminate enum value. The struct remains an aggregate (C++14 and
  // later), so `PipelineLogEntry{status, detail}` keeps working.
  PipelineStatus status{NO_EVENT};
  // Text accompanying the status; empty by default.
  std::string detail;
};

// Sequence of log entries; this is the type of PipelineResult::log.
using PipelineLog = std::vector<PipelineLogEntry>;

struct RDKIT_MOLSTANDARDIZE_EXPORT PipelineResult {
  PipelineStatus status;
  std::uint32_t stage;
  PipelineLog log;
  std::string inputMolData;
  std::string outputMolData;
  std::string parentMolData;

  void append(PipelineStatus newStatus, const std::string &info);
};

// Pair of molecule handles: the return type of Operations::makeParent and the
// first argument of Operations::serialize.
using RWMOL_SPTR_PAIR = std::pair<RWMOL_SPTR, RWMOL_SPTR>;

// Free functions the pipeline is assembled from, the function-pointer types
// used to refer to them, and the default step tables. Only declarations live
// here; what each operation does internally is defined in the implementation
// file.
namespace Operations {
// Molecule-to-molecule steps. They all share one signature: they receive a
// molecule handle, the result object being filled in, and the options, and
// they return a molecule handle.
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR prepareForValidation(
    RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR validate(RWMOL_SPTR mol,
                                                PipelineResult &result,
                                                const PipelineOptions &options);
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR prepareForStandardization(
    RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR standardize(
    RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR reapplyWedging(
    RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR cleanup2D(
    RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);

// Same parameters as the steps above, but returns a pair of molecule handles.
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR_PAIR makeParent(
    RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);

// Takes the input text and returns a molecule handle.
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR parse(const std::string &molblock,
                                             PipelineResult &result,
                                             const PipelineOptions &options);
// Takes a pair of molecule handles and returns nothing.
// NOTE(review): presumably the output is written into `result` - confirm
// against the implementation file.
RDKIT_MOLSTANDARDIZE_EXPORT void serialize(RWMOL_SPTR_PAIR output,
                                           PipelineResult &result,
                                           const PipelineOptions &options);

// Function-pointer types, derived from the declarations above so that they
// always match those signatures.
using ParseOperation = decltype(&parse);
using SerializeOperation = decltype(&serialize);
using Operation = decltype(&prepareForValidation);
using ParentOperation = decltype(&makeParent);
// List of (stage identifier, operation) pairs. The identifier is a plain
// std::uint32_t, so values outside PipelineStage can be represented as well.
using PipelineVector = std::vector<std::pair<std::uint32_t, Operation>>;

// Default validation steps. The casts use std::uint32_t, the spelling that
// <cstdint> guarantees and that the rest of this header uses.
const PipelineVector validationSteps{
    // input sanitization and cleanup
    {static_cast<std::uint32_t>(PipelineStage::PREPARE_FOR_VALIDATION),
     &prepareForValidation},
    // validate the structure
    {static_cast<std::uint32_t>(PipelineStage::VALIDATION), &validate}};

// Default standardization steps, in the order they are listed.
const PipelineVector standardizationSteps{
    {static_cast<std::uint32_t>(PipelineStage::PREPARE_FOR_STANDARDIZATION),
     &prepareForStandardization},
    {static_cast<std::uint32_t>(PipelineStage::STANDARDIZATION), &standardize},
    {static_cast<std::uint32_t>(PipelineStage::REAPPLY_WEDGING),
     &reapplyWedging},
    {static_cast<std::uint32_t>(PipelineStage::CLEANUP_2D), &cleanup2D}};
}  // namespace Operations

class RDKIT_MOLSTANDARDIZE_EXPORT Pipeline {
 private:
  PipelineOptions options;
  Operations::ParseOperation parse = Operations::parse;
  Operations::SerializeOperation serialize = Operations::serialize;
  Operations::PipelineVector validationSteps = Operations::validationSteps;
  Operations::PipelineVector standardizationSteps =
      Operations::standardizationSteps;
  Operations::ParentOperation makeParent = Operations::makeParent;

 public:
  Pipeline() = default;
  explicit Pipeline(const PipelineOptions &o) : options(o) {};
  ~Pipeline() = default;

  PipelineResult run(const std::string &molblock) const;

  void setValidationSteps(const Operations::PipelineVector &steps) {
    validationSteps = steps;
  }
  void setStandardizationSteps(const Operations::PipelineVector &steps) {
    standardizationSteps = steps;
  }
  void setMakeParent(Operations::ParentOperation op) { makeParent = op; }
  void setParse(Operations::ParseOperation op) { parse = op; }
  void setSerialize(Operations::SerializeOperation op) { serialize = op; }

 private:
};

}  // namespace MolStandardize
}  // namespace RDKit

#endif