File: rascal_search_catch_tests.cpp

package info (click to toggle)
rdkit 202503.1-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 220,160 kB
  • sloc: cpp: 399,240; python: 77,453; ansic: 25,517; java: 8,173; javascript: 4,005; sql: 2,389; yacc: 1,565; lex: 1,263; cs: 1,081; makefile: 580; xml: 229; fortran: 183; sh: 105
file content (144 lines) | stat: -rw-r--r-- 4,937 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
//
// Copyright (C) David Cosgrove 2024.
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.

#include <algorithm>
#include <fstream>

#include <GraphMol/SubstructLibrary/SubstructLibrary.h>
#include <GraphMol/FileParsers/MolSupplier.h>
#include <GraphMol/Fingerprints/MorganGenerator.h>
#include <GraphMol/RascalMCES/RascalMCES.h>
#include <GraphMol/SynthonSpaceSearch/SynthonSpace.h>
#include <GraphMol/SynthonSpaceSearch/SearchResults.h>
#include <GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>

#include <catch2/catch_all.hpp>

using namespace RDKit;
using namespace RDKit::SynthonSpaceSearch;
using namespace RDKit::RascalMCES;

const char *rdbase = getenv("RDBASE");

void getMols(const std::string &molFilename,
             std::map<std::string, std::unique_ptr<RWMol>> &mols) {
  v2::FileParsers::SmilesMolSupplierParams fileparams;
  fileparams.titleLine = false;
  v2::FileParsers::SmilesMolSupplier suppl(molFilename, fileparams);
  std::map<std::string, std::unique_ptr<ExplicitBitVect>> fps;
  while (!suppl.atEnd()) {
    auto mol = suppl.next();
    auto molName = mol->getProp<std::string>(common_properties::_Name);
    mols.insert(std::make_pair(molName, mol.release()));
  }
}

std::set<std::string> bruteForceSearch(
    const ROMol &queryMol,
    const std::map<std::string, std::unique_ptr<RWMol>> &mols,
    const RascalOptions &rascalOptions) {
  std::set<std::string> fullSmi;
  std::set<std::string> names;
  for (auto &[name, mol] : mols) {
    auto res = rascalMCES(queryMol, *mol, rascalOptions);
    if (!res.empty() &&
        res.front().getSimilarity() > rascalOptions.similarityThreshold) {
      mol->setProp<double>("Similarity", res.front().getSimilarity());
      names.insert(name);
    }
  }
  return names;
}

TEST_CASE("RASCAL Small tests") {
  REQUIRE(rdbase);
  std::string fName(rdbase);
  std::string fullRoot(fName + "/Code/GraphMol/SynthonSpaceSearch/data/");
  std::vector<std::string> libNames{
      fullRoot + "amide_space.txt",
      fullRoot + "triazole_space.txt",
      fullRoot + "urea_space.txt",
  };
  std::vector<std::string> enumLibNames{
      fullRoot + "amide_space_enum.smi",
      fullRoot + "triazole_space_enum.smi",
      fullRoot + "urea_space_enum.smi",
  };

  std::vector<std::string> querySmis{
      "c1ccccc1C(=O)N1CCCC1",
      "CC1CCN(c2nnc(CO)n2C2CCCC2)C1",
      "C[C@@H]1CC(NC(=O)NC2COC2)CN(C(=O)c2nccnc2F)C1",
  };

  std::vector<size_t> expNumHits{6, 4, 1};

  RascalOptions rascalOptions;

  for (size_t i = 0; i < libNames.size(); i++) {
    // if (i != 0) {
    // continue;
    // }
    SynthonSpace synthonspace;
    bool cancelled = false;
    synthonspace.readTextFile(libNames[i], cancelled);
    SynthonSpaceSearchParams params;
    auto queryMol = v2::SmilesParse::MolFromSmiles(querySmis[i]);

    auto results = synthonspace.rascalSearch(*queryMol, rascalOptions, params);
    CHECK(results.getHitMolecules().size() == expNumHits[i]);
    std::set<std::string> resSmis;
    for (const auto &r : results.getHitMolecules()) {
      resSmis.insert(MolToSmiles(*r));
    }
    // Do the enumerated library, just to check
    std::map<std::string, std::unique_ptr<RWMol>> mols;
    getMols(enumLibNames[i], mols);
    auto names = bruteForceSearch(*queryMol, mols, rascalOptions);
    std::set<std::string> fullSmis;
    for (const auto &r : names) {
      fullSmis.insert(MolToSmiles(*mols[r]));
    }
    // As with fingerprints, we don't get all the hits with synthon search
    // that we would with a full search.
    for (const auto &rs : resSmis) {
      CHECK(fullSmis.find(rs) != fullSmis.end());
    }
  }
}

TEST_CASE("Rascal Biggy") {
  REQUIRE(rdbase);
  std::string fName(rdbase);
  std::string libName =
      fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.csv";
  SynthonSpace synthonspace;
  bool cancelled = false;
  synthonspace.readTextFile(libName, cancelled);

  const std::vector<std::string> smis{
      "c1ccccc1C(=O)N1CCCC1", "c1ccccc1NC(=O)C1CCN1",
      "c12ccccc1c(N)nc(N)n2", "c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1",
      "c1n[nH]cn1",           "C(=O)NC(CC)C(=O)N(CC)C"};
  const std::vector<size_t> numRes{254, 89, 2, 34, 0, 14};
  const std::vector<size_t> maxRes{376110, 278747, 79833, 34817, 190, 45932};
  SynthonSpaceSearchParams params;
  params.maxHits = -1;
  params.numThreads = 1;
  RascalOptions rascalOptions;

  for (size_t i = 0; i < smis.size(); ++i) {
    auto queryMol = v2::SmilesParse::MolFromSmiles(smis[i]);
    auto results = synthonspace.rascalSearch(*queryMol, rascalOptions, params);
    CHECK(results.getHitMolecules().size() == numRes[i]);
    CHECK(results.getMaxNumResults() == maxRes[i]);
  }
}