File: RascalButinaCluster.cpp

package info (click to toggle)
rdkit 202503.1-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 220,160 kB
  • sloc: cpp: 399,240; python: 77,453; ansic: 25,517; java: 8,173; javascript: 4,005; sql: 2,389; yacc: 1,565; lex: 1,263; cs: 1,081; makefile: 580; xml: 229; fortran: 183; sh: 105
file content (118 lines) | stat: -rw-r--r-- 4,134 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
//
// Copyright (C) David Cosgrove 2023
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//
// This file contains an implementation of Butina clustering
// (Butina JCICS 39 747-750 (1999)) using the RascalMCES
// Johnson similarity metric.  It is largely a transliteration
// of $RDBASE/rdkit/ML/Cluster/Butina.py.

#include <algorithm>
#include <iterator>
#include <vector>
#include <set>

#include <GraphMol/ROMol.h>
#include <GraphMol/RascalMCES/RascalMCES.h>
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
#include <GraphMol/RascalMCES/RascalDetails.h>

namespace RDKit {

namespace RascalMCES {
namespace details {
std::vector<std::vector<unsigned int>> buildNborLists(
    const std::vector<std::vector<ClusNode>> &proxGraph) {
  std::vector<std::vector<unsigned int>> nborLists;
  for (size_t i = 0; i < proxGraph.size(); ++i) {
    std::vector<std::pair<unsigned int, double>> tmpList;
    for (const auto &cn : proxGraph[i]) {
      if (cn.d_res) {
        if (i == cn.d_mol1Num) {
          tmpList.push_back({cn.d_mol2Num, cn.d_sim});
        } else {
          tmpList.push_back({cn.d_mol1Num, cn.d_sim});
        }
      }
    }
    std::sort(tmpList.begin(), tmpList.end(),
              [](const std::pair<unsigned int, double> &p1,
                 const std::pair<unsigned int, double> &p2) -> bool {
                return p1.second > p2.second;
              });
    std::vector<unsigned int> nborList(tmpList.size() + 1, 0);
    nborList[0] = i;
    std::transform(
        tmpList.begin(), tmpList.end(), nborList.begin() + 1,
        [](const std::pair<unsigned int, double> &p) -> unsigned int {
          return p.first;
        });
    nborLists.push_back(nborList);
  }
  std::sort(nborLists.begin(), nborLists.end(),
            [](const std::vector<unsigned int> &nl1,
               const std::vector<unsigned int> &nl2) -> bool {
              if (nl1.size() == nl2.size()) {
                return nl1 > nl2;
              } else {
                return nl1.size() > nl2.size();
              }
            });
  return nborLists;
}

// This function destroys nborLists.
std::vector<std::vector<unsigned int>> formClusters(
    std::vector<std::vector<unsigned int>> &nborLists) {
  std::vector<std::vector<unsigned int>> clusters;

  while (!nborLists.empty()) {
    clusters.push_back(nborLists.front());
    std::set<unsigned int> inNborList(nborLists.front().begin(),
                                      nborLists.front().end());
    nborLists.front().clear();
    for (auto &nborList : nborLists) {
      for (auto &n : nborList) {
        if (inNborList.find(n) != inNborList.end()) {
          n = std::numeric_limits<unsigned int>::max();
        }
      }
      nborList.erase(std::remove(nborList.begin(), nborList.end(),
                                 std::numeric_limits<unsigned int>::max()),
                     nborList.end());
    }
    nborLists.erase(
        std::remove_if(nborLists.begin(), nborLists.end(),
                       [](const std::vector<unsigned int> &nl) -> bool {
                         return nl.empty();
                       }),
        nborLists.end());
    std::sort(nborLists.begin(), nborLists.end(),
              [](const std::vector<unsigned int> &nl1,
                 const std::vector<unsigned int> &nl2) -> bool {
                if (nl1.size() == nl2.size()) {
                  return nl1 > nl2;
                } else {
                  return nl1.size() > nl2.size();
                }
              });
  }
  return clusters;
}

}  // namespace details
std::vector<std::vector<unsigned int>> rascalButinaCluster(
    const std::vector<std::shared_ptr<ROMol>> &mols,
    const RascalClusterOptions &clusOpts) {
  auto proxGraph = details::buildProximityGraph(mols, clusOpts);
  auto nborLists = details::buildNborLists(proxGraph);
  auto clusters = details::formClusters(nborLists);
  return clusters;
}
}  // namespace RascalMCES
}  // namespace RDKit