File: kmeans.hpp

package info (click to toggle)
python-pyclustering 0.10.1.2-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 11,128 kB
  • sloc: cpp: 38,888; python: 24,311; sh: 384; makefile: 105
file content (153 lines) | stat: -rwxr-xr-x 4,942 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/*!

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause

*/

#pragma once


#include <mutex>
#include <vector>

#include <pyclustering/cluster/kmeans_data.hpp>

#include <pyclustering/utils/metric.hpp>


using namespace pyclustering::utils::metric;


namespace pyclustering {

namespace clst {

/*!

@class    kmeans kmeans.hpp pyclustering/cluster/kmeans.hpp

@brief    Represents K-Means clustering algorithm for cluster analysis.
@details  The algorithm related to partitional class when input data is divided into groups.

*/
class kmeans {
public:
    const static double             DEFAULT_TOLERANCE;  /**< Default value of the tolerance stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing. */

    const static std::size_t        DEFAULT_ITERMAX;    /**< Default value of the step stop condition - maximum number of iterations that is used for clustering process. */

private:
    double                  m_tolerance             = DEFAULT_TOLERANCE;

    std::size_t             m_itermax               = DEFAULT_ITERMAX;

    dataset                 m_initial_centers       = { };

    kmeans_data             * m_ptr_result          = nullptr;      /* temporary pointer to output result */

    const dataset           * m_ptr_data            = nullptr;      /* used only during processing */

    const index_sequence    * m_ptr_indexes         = nullptr;      /* temporary pointer to indexes */

    distance_metric<point>  m_metric;

public:
    /*!
    
    @brief    Default constructor of clustering algorithm.
    
    */
    kmeans() = default;

    /*!
    
    @brief    Constructor of clustering algorithm where algorithm parameters for processing are
               specified.
    
    @param[in] p_initial_centers: initial centers that are used for processing.
    @param[in] p_tolerance: stop condition in following way: when maximum value of distance change of
                cluster centers is less than tolerance than algorithm will stop processing.
    @param[in] p_itermax: maximum number of iterations (by default kmeans::DEFAULT_ITERMAX).
    @param[in] p_metric: distance metric calculator for two points.
    
    */
    kmeans(const dataset & p_initial_centers, 
           const double p_tolerance = DEFAULT_TOLERANCE,
           const std::size_t p_itermax = DEFAULT_ITERMAX,
           const distance_metric<point> & p_metric = distance_metric_factory<point>::euclidean_square());

    /*!
    
    @brief    Default destructor of the algorithm.
    
    */
    ~kmeans() = default;

public:
    /*!
    
    @brief    Performs cluster analysis of an input data.
    
    @param[in]     p_data: input data for cluster analysis.
    @param[in,out] p_result: clustering result of an input data, it is also considered as an input argument to
                    where observer parameter can be set to collect changes of clusters and centers on each step of
                    processing.
    
    */
    void process(const dataset & p_data, kmeans_data & p_result);

    /*!
    
    @brief    Performs cluster analysis of an input data.
    
    @param[in]     p_data: input data for cluster analysis.
    @param[in]     p_indexes: specify indexes of objects in 'p_data' that should be used during clustering process.
    @param[in,out] p_result: clustering result of an input data, it is also considered as an input argument to
                    where observer parameter can be set to collect changes of clusters and centers on each step of
                    processing.
    
    */
    void process(const dataset & p_data, const index_sequence & p_indexes, kmeans_data & p_result);

private:
    void update_clusters(const dataset & p_centers, cluster_sequence & p_clusters);

    double update_centers(const cluster_sequence & clusters, dataset & centers);

    void assign_point_to_cluster(const std::size_t p_index_point, const dataset & p_centers, index_sequence & p_clusters);

    /*!
    
    @brief    Calculate new center for specified cluster.
    
    @param[in] p_cluster: cluster whose center should be calculated.
    @param[in,out] p_center: cluster's center that should calculated.
    
    @return Difference between old and new cluster's center.
    
    */
    double update_center(const cluster & p_cluster, point & p_center);

    /*!
    
    @brief    Calculates total within-cluster errors that is based on distance metric.
    
    */
    void calculate_total_wce();

    /*!
    
    @brief    Erases clusters that do not have any points.
    
    @param[in,out] p_clusters: clusters that should be analyzed and modified.
    
    */
    static void erase_empty_clusters(cluster_sequence & p_clusters);
};


}

}