File: Clustering.cpp

package info (click to toggle)
rdkit 202503.1-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 220,160 kB
  • sloc: cpp: 399,240; python: 77,453; ansic: 25,517; java: 8,173; javascript: 4,005; sql: 2,389; yacc: 1,565; lex: 1,263; cs: 1,081; makefile: 580; xml: 229; fortran: 183; sh: 105
file content (180 lines) | stat: -rw-r--r-- 5,977 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
// $Id$
//
//  Copyright (C) 2002-2010 Greg Landrum and Rational Discovery LLC
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//
#define PY_ARRAY_UNIQUE_SYMBOL Py_Array_API_Clustering

#include <RDBoost/Wrap.h>
#include <cstdint>

namespace python = boost::python;

#include <RDBoost/import_array.h>

// Floating-point type used for the distance and criterion buffers in this
// file.
typedef double real;

// Clustering driver, declared here with C linkage and defined outside this
// file. Every argument, including the scalars, is passed by address.
// NOTE(review): presumably generated by f2c from Fortran -- confirm against
// its definition.
//   n            - callers in this file pass the number of points
//   len          - callers in this file pass n*(n-1)/2, the length of dists
//   dists        - packed pairwise distances
//   toggle       - callers in this file forward their clustering option
//   ia, ib, crit - caller-allocated buffers; the wrappers in this file expose
//                  them to Python as the clustering result
extern "C" void distdriver_(boost::int64_t *n, boost::int64_t *len, real *dists,
                            boost::int64_t *toggle, boost::int64_t *ia,
                            boost::int64_t *ib, real *crit);

//
// Rather than trying to get the distance matrix built properly on the f2c
// side of things, we generate the distance matrix ourselves here and then
// call distdriver_.
//
//  dataP        - n points of m values each, stored one point after another
//  n, m         - number of points and number of values per point
//  iopt         - forwarded to distdriver_ as its toggle argument
//  ia, ib, crit - buffers handed straight through to distdriver_
//
// The matrix passed to distdriver_ contains the *squared* Euclidean distance
// for every pair (i, j) with j < i, packed in the order
// (1,0), (2,0), (2,1), (3,0), ...
//
static void clusterit(real *dataP, boost::int64_t n, boost::int64_t m,
                      boost::int64_t iopt, boost::int64_t *ia,
                      boost::int64_t *ib, real *crit) {
  boost::int64_t len = (n * (n - 1)) / 2;

  // calloc(0, ...) is allowed to return NULL, which would trip the invariant
  // below for n < 2 even though nothing went wrong, so always request at
  // least one entry.
  real *dists = static_cast<real *>(calloc(len > 0 ? len : 1, sizeof(real)));
  CHECK_INVARIANT(dists, "failed to allocate memory");

  boost::int64_t pos = 0;
  for (boost::int64_t i = 1; i < n; ++i) {
    const boost::int64_t iTab = i * m;
    for (boost::int64_t j = 0; j < i; ++j) {
      const boost::int64_t jTab = j * m;
      // accumulate locally (same summation order as before) and store once
      real sqDist = 0.0;
      for (boost::int64_t k = 0; k < m; ++k) {
        const real diff = dataP[iTab + k] - dataP[jTab + k];
        sqDist += diff * diff;
      }
      dists[pos++] = sqDist;
    }
  }

  distdriver_(&n, &len, dists, &iopt, ia, ib, crit);
  free(dists);
}

// PyCapsule destructor: frees the buffer stored in an unnamed capsule.
static void capsule_cleanup(PyObject *capsule) {
  free(PyCapsule_GetPointer(capsule, nullptr));
}

// Python-facing wrapper that clusters nPts points of sz values each.
//
//  data   - numpy array with the point values; it is converted to a
//           contiguous 2D array of doubles before use
//  nPts   - number of points
//  sz     - number of values per point
//  option - forwarded unchanged to clusterit (and from there to distdriver_)
//
// Returns a new 3-tuple of 1D numpy arrays of length nPts, (ia, ib, crit),
// exposing the buffers filled in by distdriver_. ia and ib are 64-bit
// integer arrays, crit is a double array. Each array keeps its buffer alive
// through a capsule set as its base object, so Python frees the memory when
// the array goes away.
//
// Errors are reported through throw_value_error if data is not a numpy array,
// if nPts is negative, or if data holds fewer values than will be read.
// Python errors raised by numpy or by allocation failures are propagated.
//
static PyObject *Clustering_MurtaghCluster(python::object data, int nPts,
                                           int sz, int option) {
  if (!PyArray_Check(data.ptr())) {
    throw_value_error("PyArray_Type expected as input");
    return nullptr;
  }
  if (nPts < 0) {
    throw_value_error("nPts must not be negative");
    return nullptr;
  }

  // PyArray_ContiguousFromObject returns a new reference, or NULL with a
  // Python error set. The handle throws on NULL and releases the reference on
  // every exit path (it used to be leaked on each call).
  python::handle<> dataOwner(
      PyArray_ContiguousFromObject(data.ptr(), NPY_DOUBLE, 2, 2));
  auto *dataContig = reinterpret_cast<PyArrayObject *>(dataOwner.get());

  // clusterit reads nPts*sz doubles as soon as there is at least one pair of
  // points; refuse to read past the end of the array.
  const boost::int64_t needed =
      nPts > 1 ? static_cast<boost::int64_t>(nPts) * sz : 0;
  if (needed > static_cast<boost::int64_t>(PyArray_SIZE(dataContig))) {
    throw_value_error("data has fewer than nPts*sz elements");
    return nullptr;
  }

  // calloc(0, ...) may return NULL and PyCapsule_New rejects NULL pointers,
  // so always allocate at least one element.
  const size_t nAlloc = nPts > 0 ? static_cast<size_t>(nPts) : 1;

  // Allocates a zeroed buffer of nAlloc elements and hands ownership to a
  // capsule whose destructor (capsule_cleanup) frees it.
  auto newOwnedBuffer = [nAlloc](size_t elemSize) -> python::handle<> {
    void *buf = calloc(nAlloc, elemSize);
    if (!buf) {
      PyErr_NoMemory();
      python::throw_error_already_set();
    }
    PyObject *capsule = PyCapsule_New(buf, nullptr, capsule_cleanup);
    if (!capsule) {
      free(buf);
      python::throw_error_already_set();
    }
    return python::handle<>(capsule);
  };

  python::handle<> iaOwner = newOwnedBuffer(sizeof(boost::int64_t));
  python::handle<> ibOwner = newOwnedBuffer(sizeof(boost::int64_t));
  python::handle<> critOwner = newOwnedBuffer(sizeof(real));

  auto *ia = static_cast<boost::int64_t *>(
      PyCapsule_GetPointer(iaOwner.get(), nullptr));
  auto *ib = static_cast<boost::int64_t *>(
      PyCapsule_GetPointer(ibOwner.get(), nullptr));
  auto *crit = static_cast<real *>(PyCapsule_GetPointer(critOwner.get(), nullptr));

  // If clusterit throws, the handles above release the buffers and the array.
  clusterit(reinterpret_cast<real *>(PyArray_DATA(dataContig)), nPts, sz,
            option, ia, ib, crit);

  npy_intp dims[1];
  dims[0] = nPts;
  python::handle<> res(PyTuple_New(3));

  // Wraps buf in a 1D array of length nPts, stores it in slot idx of the
  // result tuple and makes the owning capsule the array's base object.
  auto storeArray = [&](Py_ssize_t idx, int typenum, void *buf,
                        python::handle<> &owner) {
    PyObject *arr = PyArray_SimpleNewFromData(1, dims, typenum, buf);
    if (!arr) {
      python::throw_error_already_set();
    }
    // The tuple steals the reference to arr, so arr is cleaned up together
    // with res if anything below fails.
    PyTuple_SetItem(res.get(), idx, arr);
    // PyArray_SetBaseObject steals the capsule reference.
    if (PyArray_SetBaseObject(reinterpret_cast<PyArrayObject *>(arr),
                              owner.release()) < 0) {
      python::throw_error_already_set();
    }
  };

  // The integer buffers hold boost::int64_t. NPY_LONG describes a C long,
  // which is only 32 bits wide on some platforms, so use NPY_INT64.
  storeArray(0, NPY_INT64, ia, iaOwner);
  storeArray(1, NPY_INT64, ib, ibOwner);
  storeArray(2, NPY_DOUBLE, crit, critOwner);

  return res.release();
}

// Runs distdriver_ on a distance matrix supplied by the caller.
//
//  dists        - packed pairwise distances; (n*(n-1))/2 is passed to
//                 distdriver_ as its length
//  n            - number of points
//  iopt         - forwarded to distdriver_ as its toggle argument
//  ia, ib, crit - buffers handed straight through to distdriver_
//
void distclusterit(real *dists, boost::int64_t n, boost::int64_t iopt,
                   boost::int64_t *ia, boost::int64_t *ib, real *crit) {
  // distdriver_ takes its scalar arguments by address, so the pair count
  // needs a named, addressable home
  boost::int64_t nPairs = (n * (n - 1)) / 2;
  distdriver_(&n, &nPairs, dists, &iopt, ia, ib, crit);
}

// Python-facing wrapper that clusters nPts points from a precomputed
// distance matrix.
//
//  data   - numpy array with the packed pairwise distances; it is converted
//           to a contiguous 1D array of doubles before use. It must hold at
//           least nPts*(nPts-1)/2 values, the length passed on to distdriver_
//  nPts   - number of points
//  option - forwarded unchanged to distclusterit (and from there to
//           distdriver_)
//
// Returns a new 3-tuple of 1D numpy arrays of length nPts, (ia, ib, crit),
// exposing the buffers filled in by distdriver_. ia and ib are 64-bit
// integer arrays, crit is a double array. Each array keeps its buffer alive
// through a capsule set as its base object, so Python frees the memory when
// the array goes away.
//
// Errors are reported through throw_value_error if data is not a numpy array,
// if nPts is negative, or if data is too short. Python errors raised by numpy
// or by allocation failures are propagated.
//
static PyObject *Clustering_MurtaghDistCluster(python::object data, int nPts,
                                               int option) {
  if (!PyArray_Check(data.ptr())) {
    throw_value_error("PyArray_Type expected as input");
    return nullptr;
  }
  if (nPts < 0) {
    throw_value_error("nPts must not be negative");
    return nullptr;
  }

  // PyArray_ContiguousFromObject returns a new reference, or NULL with a
  // Python error set. The handle throws on NULL and releases the reference on
  // every exit path (it used to be leaked on each call).
  python::handle<> dataOwner(
      PyArray_ContiguousFromObject(data.ptr(), NPY_DOUBLE, 1, 1));
  auto *dataContig = reinterpret_cast<PyArrayObject *>(dataOwner.get());

  // distclusterit tells distdriver_ that the matrix has nPts*(nPts-1)/2
  // entries; refuse to let it read past the end of the array.
  const boost::int64_t needed =
      (static_cast<boost::int64_t>(nPts) * (nPts - 1)) / 2;
  if (needed > static_cast<boost::int64_t>(PyArray_SIZE(dataContig))) {
    throw_value_error("data has fewer than nPts*(nPts-1)/2 elements");
    return nullptr;
  }

  // calloc(0, ...) may return NULL and PyCapsule_New rejects NULL pointers,
  // so always allocate at least one element.
  const size_t nAlloc = nPts > 0 ? static_cast<size_t>(nPts) : 1;

  // Allocates a zeroed buffer of nAlloc elements and hands ownership to a
  // capsule whose destructor (capsule_cleanup) frees it.
  auto newOwnedBuffer = [nAlloc](size_t elemSize) -> python::handle<> {
    void *buf = calloc(nAlloc, elemSize);
    if (!buf) {
      PyErr_NoMemory();
      python::throw_error_already_set();
    }
    PyObject *capsule = PyCapsule_New(buf, nullptr, capsule_cleanup);
    if (!capsule) {
      free(buf);
      python::throw_error_already_set();
    }
    return python::handle<>(capsule);
  };

  python::handle<> iaOwner = newOwnedBuffer(sizeof(boost::int64_t));
  python::handle<> ibOwner = newOwnedBuffer(sizeof(boost::int64_t));
  python::handle<> critOwner = newOwnedBuffer(sizeof(real));

  auto *ia = static_cast<boost::int64_t *>(
      PyCapsule_GetPointer(iaOwner.get(), nullptr));
  auto *ib = static_cast<boost::int64_t *>(
      PyCapsule_GetPointer(ibOwner.get(), nullptr));
  auto *crit = static_cast<real *>(PyCapsule_GetPointer(critOwner.get(), nullptr));

  distclusterit(reinterpret_cast<real *>(PyArray_DATA(dataContig)), nPts,
                option, ia, ib, crit);

  npy_intp dims[1];
  dims[0] = nPts;
  // Created only after validation so that it cannot leak on the error paths.
  python::handle<> res(PyTuple_New(3));

  // Wraps buf in a 1D array of length nPts, stores it in slot idx of the
  // result tuple and makes the owning capsule the array's base object.
  auto storeArray = [&](Py_ssize_t idx, int typenum, void *buf,
                        python::handle<> &owner) {
    PyObject *arr = PyArray_SimpleNewFromData(1, dims, typenum, buf);
    if (!arr) {
      python::throw_error_already_set();
    }
    // The tuple steals the reference to arr, so arr is cleaned up together
    // with res if anything below fails.
    PyTuple_SetItem(res.get(), idx, arr);
    // PyArray_SetBaseObject steals the capsule reference.
    if (PyArray_SetBaseObject(reinterpret_cast<PyArrayObject *>(arr),
                              owner.release()) < 0) {
      python::throw_error_already_set();
    }
  };

  // The integer buffers hold boost::int64_t. NPY_LONG describes a C long,
  // which is only 32 bits wide on some platforms, so use NPY_INT64.
  storeArray(0, NPY_INT64, ia, iaOwner);
  storeArray(1, NPY_INT64, ib, ibOwner);
  storeArray(2, NPY_DOUBLE, crit, critOwner);

  return res.release();
}

// Defines the Python extension module "Clustering" and registers its two
// functions, MurtaghCluster and MurtaghDistCluster.
BOOST_PYTHON_MODULE(Clustering) {
  // Declared in RDBoost/import_array.h; presumably wraps numpy's
  // import_array() so the PyArray_* calls in this file are usable -- TODO
  // confirm.
  rdkit_import_array();

  // MurtaghCluster(data, nPts, sz, option): clusters raw point data.
  python::def("MurtaghCluster", Clustering_MurtaghCluster,
              (python::arg("data"), python::arg("nPts"), python::arg("sz"),
               python::arg("option")),
              "TODO: provide docstring");
  // MurtaghDistCluster(data, nPts, option): clusters from a precomputed
  // distance matrix.
  python::def("MurtaghDistCluster", Clustering_MurtaghDistCluster,
              (python::arg("data"), python::arg("nPts"), python::arg("option")),
              "TODO: provide docstring");
}