File: kmeans_plus_plus.hpp

package info (click to toggle)
python-pyclustering 0.10.1.2-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 11,128 kB
  • sloc: cpp: 38,888; python: 24,311; sh: 384; makefile: 105
file content (288 lines) | stat: -rwxr-xr-x 9,152 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
/*!

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause

*/


#pragma once


#include <random>
#include <unordered_set>

#include <pyclustering/definitions.hpp>

#include <pyclustering/cluster/center_initializer.hpp>
#include <pyclustering/cluster/cluster_data.hpp>
#include <pyclustering/utils/metric.hpp>


using namespace pyclustering::utils::metric;


namespace pyclustering {

namespace clst {


/**
 *
 * @brief K-Means++ center initializer algorithm.
 *
 */
class kmeans_plus_plus : public center_initializer {
public:
    /**
     *
     * @brief Denotes that the farthest center candidate (with highest probability) should be used as a center.
     *
     */
    static const std::size_t FARTHEST_CENTER_CANDIDATE;

    /**
     *
     * @brief Non-existed index that represents non-initialized value.
     *
     */
    static const std::size_t INVALID_INDEX;

public:
    /**
     *
     * @brief Metric that is used for distance calculation between two points.
     *
     */
    using metric = distance_functor< std::vector<double> >;

private:
    using index_set = std::unordered_set<std::size_t>;

    using center_description = std::tuple<point, std::size_t>;
    enum { POINT, INDEX };

    using store_result = std::function<void(center_description &)>;

private:
    std::size_t             m_amount        = 0;
    std::size_t             m_candidates    = 0;
    metric                  m_dist_func;
    long long               m_random_state  = RANDOM_STATE_CURRENT_TIME;
    mutable std::mt19937    m_generator;

    /* temporal members that are used only during initialization */
    mutable dataset const *           m_data_ptr      = nullptr;
    mutable index_sequence const *    m_indexes_ptr   = nullptr;

    mutable index_set       m_free_indexes;
    mutable index_sequence  m_allocated_indexes;

public:
    /**
     *
     * @brief Default constructor to create initializer algorithm K-Means++.
     *
     */
    kmeans_plus_plus() = default;

    /**
    *
    * @brief    Constructor of center initializer algorithm K-Means++.
    *
    * @param[in] p_amount: amount of centers that should initialized.
    * @param[in] p_candidates: amount of candidates that are considered to find the best center, if
    *             the farthest candidate is required (with highest probability) than static constant
    *             FARTHEST_CENTER_CANDIDATE can be specified.
    * @param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
    *
    * @see FARTHEST_CENTER_CANDIDATE
    *
    */
    kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates = 1, const long long p_random_state = RANDOM_STATE_CURRENT_TIME) noexcept;

    /**
    *
    * @brief    Constructor of center initializer algorithm K-Means++.
    * @details  By default algorithm uses square Euclidean distance as a metric.
    *
    * @param[in] p_amount: amount of centers that should initialized.
    * @param[in] p_candidates: amount of candidates that are considered to find the best center, if
    *             the farthest candidate is required (with highest probability) than static constant
    *             FARTHEST_CENTER_CANDIDATE can be specified.
    * @param[in] p_metric: metric for distance calculation between points.
    * @param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
    *
    * @see FARTHEST_CENTER_CANDIDATE
    *
    */
    kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates, const metric & p_metric, const long long p_random_state = RANDOM_STATE_CURRENT_TIME) noexcept;

    /**
     *
     * @brief Default copy constructor to create initializer algorithm K-Means++.
     *
     */
    kmeans_plus_plus(const kmeans_plus_plus & p_other) = default;

    /**
     *
     * @brief Default move constructor to create initializer algorithm K-Means++.
     *
     */
    kmeans_plus_plus(kmeans_plus_plus && p_other) = default;

    /**
     *
     * @brief Default destructor to destroy initializer algorithm K-Means++.
     *
     */
    ~kmeans_plus_plus() = default;

public:
    /**
    *
    * @brief    Performs center initialization process in line algorithm configuration.
    *
    * @param[in]  p_data: data for that centers are calculated.
    * @param[out] p_centers: initialized centers for the specified data.
    *
    */
    void initialize(const dataset & p_data, dataset & p_centers) const override;

    /**
    *
    * @brief    Performs center initialization process in line algorithm configuration for
    *           specific range of points.
    *
    * @param[in]  p_data: data for that centers are calculated.
    * @param[in]  p_indexes: point indexes from data that are defines which points should be considered
    *              during calculation process. If empty then all data points are considered.
    * @param[out] p_centers: initialized centers for the specified data.
    *
    */
    void initialize(const dataset & p_data, const index_sequence & p_indexes, dataset & p_centers) const override;

    /**
    *
    * @brief    Performs center initialization process in line algorithm configuration using real points from dataset as centers.
    *
    * @param[in]  p_data: data for that centers are calculated.
    * @param[out] p_center_indexes: initialized center indexes for the specified data where indexes correspond to points from the data.
    *
    */
    void initialize(const dataset & p_data, index_sequence & p_center_indexes) const;

private:
    /**
    *
    * @brief    Assigns seed to the random generator that is used by the algorithm.
    *
    */
    void initialize_random_generator();

    /**
    *
    * @brief    Performs center initialization process in line algorithm configuration.
    *
    * @param[in]  p_data: data for that centers are calculated.
    * @param[in]  p_indexes: point indexes from data that are defines which points should be considered
    *              during calculation process. If empty then all data points are considered.
    * @param[out] p_proc: function that defines how to store output result of the algorithm.
    *
    */
    void initialize(const dataset & p_data, const index_sequence & p_indexes, const store_result & p_proc) const;

    /**
    *
    * @brief    Store obtained center.
    *
    * @param[in]  p_proc: function that defines how to store output result of the algorithm.
    * @param[in]  p_result: initialized center that should be stored.
    *
    */
    void store_center(const store_result & p_proc, center_description & p_result) const;

    /**
    *
    * @brief    Store pointers to data, indexes and centers to avoiding passing them between class methods.
    * @details  Pointers are reseted when center initialization is over.
    *
    * @param[in]  p_data: data for that centers are calculated.
    * @param[in]  p_indexes: point indexes from data that are defines which points should be
    *              considered during calculation process.
    *
    * @return   The first initialized center.
    *
    */
    void store_temporal_params(const dataset & p_data, const index_sequence & p_indexes) const;

    /**
    *
    * @brief    Reset (fill by nullptr) temporal points.
    *
    */
    void free_temporal_params() const;

    /**
    *
    * @brief    Calculates the first initial center using uniform distribution.
    *
    * @return   The first initialized center.
    *
    */
    center_description get_first_center() const;

    /**
    *
    * @brief    Calculates the next most probable center in line with weighted distribution.
    *
    * @return   The next initialized center.
    *
    */
    center_description get_next_center() const;

    /**
    *
    * @brief    Calculates distances from each point to closest center.
    *
    * @param[out] p_distances: the shortest distances from each point to center.
    *
    */
    void calculate_shortest_distances(std::vector<double> & p_distances) const;

    /**
    *
    * @brief    Calculates distance from the specified point to the closest center.
    *
    * @param[in]  p_point: point for that the shortest distance is calculated.
    *
    */
    double get_shortest_distance(const point & p_point) const;

    /**
    *
    * @brief    Calculates center probability for each point using distances to closest centers.
    *
    * @param[in]  p_distances: distances from each point to closest center.
    * @param[out] p_probabilities: probability of each point to be next center.
    *
    */
    void calculate_probabilities(const std::vector<double> & p_distances, std::vector<double> & p_probabilities) const;

    /**
    *
    * @brief    Calculates most probable center.
    *
    * @param[in]  p_distances: distances from each point to closest center.
    * @param[in] p_probabilities: probability of each point to be next center.
    *
    */
    std::size_t get_probable_center(const std::vector<double> & p_distances, const std::vector<double> & p_probabilities) const;
};


}

}