File: HierarchicalClusterPicker.h

package info (click to toggle)
rdkit 201809.1%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 123,688 kB
  • sloc: cpp: 230,509; python: 70,501; java: 6,329; ansic: 5,427; sql: 1,899; yacc: 1,739; lex: 1,243; makefile: 445; xml: 229; fortran: 183; sh: 123; cs: 93
file content (109 lines) | stat: -rw-r--r-- 3,670 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
//
//  Copyright (C) 2003-2006 Rational Discovery LLC
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//
#include <RDGeneral/export.h>
#ifndef _HIERARCHCLUSTERPICKER_H
#define _HIERARCHCLUSTERPICKER_H

#include <RDGeneral/types.h>
#include "DistPicker.h"

namespace RDPickers {

/*! \brief Diversity picker based on hierarchical clustering
 *
 *  This class inherits from DistPicker since it uses the distance matrix
 *  for diversity picking. The clustering itself is done using the Murtagh
 *  code in $RDBASE/Code/ML/Cluster/Murtagh/
 */
class RDKIT_SIMDIVPICKERS_EXPORT HierarchicalClusterPicker : public DistPicker {
 public:
  /*! \brief The type of hierarchical clustering algorithm to use
   *
   *  The enumerators carry explicit numeric values (1..7).
   *  NOTE(review): presumably these match the method codes expected by the
   *  underlying clustering routine - confirm before renumbering.
   */
  typedef enum {
    WARD = 1,
    SLINK = 2,
    CLINK = 3,
    UPGMA = 4,
    MCQUITTY = 5,
    GOWER = 6,
    CENTROID = 7
  } ClusterMethod;

  /*! \brief Constructor - takes a ClusterMethod as an argument
   *
   *  Stores the hierarchical clustering method that later calls will use.
   *
   *    \param clusterMethod - the hierarchical clustering method to use
   */
  explicit HierarchicalClusterPicker(ClusterMethod clusterMethod)
      : d_method(clusterMethod) {}

  /*! \brief This is the function that does the picking
   *
   *  Here is how the algorithm works \n
   *  FIX: Supply reference
   *
   *  - The entire pool is clustered using the distance matrix and one of the
   *    hierarchical clustering methods (specified via the constructor). \n
   *  - Starting with the individual items in the pool, clusters are merged
   *    based on the output from the clustering method. \n
   *  - The merging is stopped when the number of clusters is the same as
   *    the number of picks. \n
   *  - For each item in a cluster the sum of the squares of the distances to
   *    the rest of the items (in the cluster) is computed. The item with the
   *    smallest such value is picked as the representative of the cluster;
   *    i.e. we try to pick the item closest to the centroid of the cluster.
   *
   *    \param distMat - distance matrix - an array of doubles. It is assumed
   *              that only the lower triangle elements of the matrix are
   *              supplied in a 1D array\n
   *              NOTE: this matrix WILL BE ALTERED during the picking\n
   *              NOTE(review): the pointer is const-qualified here, which is
   *              at odds with the warning above; the implementation is not
   *              visible from this header, so treat the buffer as clobbered
   *              until confirmed.
   *    \param poolSize - the size of the pool to pick the items from. It is
   *              assumed that the distance matrix above contains the right
   *              number of elements; i.e. poolSize*(poolSize-1)/2 \n
   *    \param pickSize - the number of items to pick from the pool
   *              (<= poolSize)
   *
   *    \return the picked items (presumably as indices into the pool - TODO
   *              confirm against the implementation)
   */
  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
                       unsigned int pickSize) const;

  /*! \brief This is the function that does the clustering of the items -
   *  used by the picker
   *
   *    \param distMat - distance matrix - an array of doubles. It is assumed
   *              that only the lower triangle elements of the matrix are
   *              supplied in a 1D array\n
   *              NOTE: this matrix WILL BE ALTERED during the picking\n
   *              NOTE(review): the pointer is const-qualified here, which is
   *              at odds with the warning above; the implementation is not
   *              visible from this header, so treat the buffer as clobbered
   *              until confirmed.
   *    \param poolSize - the size of the pool to pick the items from. It is
   *              assumed that the distance matrix above contains the right
   *              number of elements; i.e. poolSize*(poolSize-1)/2 \n
   *    \param pickSize - the number of clusters to divide the pool into
   *              (<= poolSize)
   *
   *    \return the clusters (presumably one vector of pool indices per
   *              cluster - TODO confirm against the implementation)
   */
  RDKit::VECT_INT_VECT cluster(const double *distMat, unsigned int poolSize,
                               unsigned int pickSize) const;

 private:
  //! the hierarchical clustering method selected at construction time
  ClusterMethod d_method;
};
}  // namespace RDPickers

#endif