File: kmedoids.hpp

package info (click to toggle)
python-pyclustering 0.10.1.2-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 11,128 kB
  • sloc: cpp: 38,888; python: 24,311; sh: 384; makefile: 105
file content (216 lines) | stat: -rwxr-xr-x 6,814 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/*!

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause

*/

#pragma once


#include <memory>

#include <pyclustering/cluster/kmedoids_data.hpp>

#include <pyclustering/utils/metric.hpp>


using namespace pyclustering::utils::metric;


namespace pyclustering {

namespace clst {


/*!

@brief    Defines data representation (point, distance matrix) that is used for processing by K-Medoids algorithm.

*/
enum class kmedoids_data_t {
    POINTS,
    DISTANCE_MATRIX
};


/*!

@brief    Represents K-Medoids clustering algorithm (PAM algorithm) for cluster analysis.
@details  PAM is a partitioning clustering algorithm that uses the medoids instead of centers like in case of K-Means
           algorithm. Medoid is an object with the smallest dissimilarity to all others in the cluster. PAM algorithm
           complexity is \f$O\left ( k\left ( n-k \right )^{2} \right )\f$.

          Implementation based on paper @cite inproceedings::cluster::kmedoids::1.

*/
class kmedoids {
public:
    static const double      DEFAULT_TOLERANCE;     /**< Default value of the tolerance stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing. */

    static const std::size_t DEFAULT_ITERMAX;       /**< Default value of the step stop condition - maximum number of iterations that is used for clustering process. */

private:
    static const std::size_t OBJECT_ALREADY_CONTAINED;

    static const std::size_t INVALID_INDEX;

    static const double      NOTHING_TO_SWAP;

private:
    using distance_calculator = std::function<double(const std::size_t, const std::size_t)>;

    struct appropriate_cluster {
    public:
        appropriate_cluster() = default;
        appropriate_cluster(const std::size_t p_index, const double p_distance_first_medoid, const double p_distance_second_medoid);

    public:
        std::size_t m_index                       = INVALID_INDEX;
        double      m_distance_to_first_medoid    = -1.0;
        double      m_distance_to_second_medoid   = -1.0;
    };

private:
    const dataset                   * m_data_ptr      = nullptr;    /* temporary pointer to input data that is used only during processing */

    kmedoids_data                   * m_result_ptr    = nullptr;    /* temporary pointer to clustering result that is used only during processing */

    medoid_sequence                 m_initial_medoids = { };

    double                          m_tolerance       = DEFAULT_TOLERANCE;

    std::size_t                     m_itermax         = DEFAULT_ITERMAX;

    index_sequence                  m_labels;

    std::vector<double>             m_distance_first_medoid;

    std::vector<double>             m_distance_second_medoid;

    distance_metric<point>          m_metric;

    distance_calculator             m_calculator;

public:
    /*!
    
    @brief    Default constructor of clustering algorithm.
    
    */
    kmedoids() = default;

    /*!
    
    @brief    Constructor of clustering algorithm where algorithm parameters for processing are
               specified.
    
    @param[in] p_initial_medoids: initial medoids that are used for processing.
    @param[in] p_tolerance: stop condition in following way: when maximum value of distance change of
                medoids of clusters is less than tolerance than algorithm will stop processing.
    @param[in] p_itermax: maximum amount of iterations (by default kmedoids::DEFAULT_ITERMAX).
    @param[in] p_metric: distance metric calculator for two points.
    
    */
    kmedoids(const medoid_sequence & p_initial_medoids,
             const double p_tolerance = DEFAULT_TOLERANCE,
             const std::size_t p_itermax = DEFAULT_ITERMAX,
             const distance_metric<point> & p_metric = distance_metric_factory<point>::euclidean_square());

    /*!
    
    @brief    Default destructor of the algorithm.
    
    */
    ~kmedoids();

public:
    /*!
    
    @brief    Performs cluster analysis of an input data.
    
    @param[in]  p_data: input data for cluster analysis.
    @param[out] p_result: clustering result of an input data.
    
    */
    void process(const dataset & p_data, kmedoids_data & p_result);

    /*!
    
    @brief    Performs cluster analysis of an input data.
    
    @param[in]  p_data: input data for cluster analysis.
    @param[in]  p_type: data type (points or distance matrix).
    @param[out] p_result: clustering result of an input data.
    
    */
    void process(const dataset & p_data, const kmedoids_data_t p_type, kmedoids_data & p_result);

private:
    /*!
    
    @brief    Updates clusters in line with current medoids.
    
    */
    double update_clusters();

    /*!
    
    @brief    Creates distance calcultor in line with data type and distance metric metric.
    
    @param[in] p_type: data type (points or distance matrix).
    
    @return   Distance calculator.

    */
    distance_calculator create_distance_calculator(const kmedoids_data_t p_type);

    /*!
    
    @brief    Find appropriate cluster for the particular point.
    
    @param[in] p_index: Index of point that should be placed to cluster.
    @param[in] p_medoids: Medoids that corresponds to clusters.
    
    @return   Index of cluster that is appropriate for the particular point and distance from this point to correspoding medoid. 
               If point is a medoid then OBJECT_ALREADY_CONTAINED value is returned.
    
    */
    appropriate_cluster find_appropriate_cluster(const std::size_t p_index, medoid_sequence & p_medoids);

    /*!
    
    @brief  Swap existed medoid with non-medoid points in order to find the most optimal medoid.

    @return Cost that is needed to swap medoid and non-medoid point.
    
    */
    double swap_medoids();

    /*!
    
    @brief  Calculates cost to swap `p_index_candidate` with the current medoid `p_index_cluster`.

    @param[in] p_index_candidate: index point that is considered as a medoid candidate.
    @param[in] p_index_cluster: index of a cluster where the current medoid is used for calculation.

    @return Cost that is needed to swap medoids.
    
    */
    double calculate_swap_cost(const std::size_t p_index_candidate, const std::size_t p_index_cluster) const;

    /*!

    @brief      Erase empty clusters and their medoids.
    @details    Data might have identical points and a lot of identical points and as a result medoids might correspond
                  to points that are totally identical.

    */
    void erase_empty_clusters();
};


}

}