File: xmeans.hpp

package info (click to toggle)
python-pyclustering 0.10.1.2-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 11,128 kB
  • sloc: cpp: 38,888; python: 24,311; sh: 384; makefile: 105
file content (168 lines) | stat: -rwxr-xr-x 6,557 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
/*!

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause

*/

#pragma once


#include <mutex>
#include <vector>

#include <pyclustering/cluster/xmeans_data.hpp>

#include <pyclustering/utils/metric.hpp>


using namespace pyclustering::utils::metric;


namespace pyclustering {

namespace clst {


/*!

@brief  Defines splitting types for clusters that are used by X-Means algorithm.

*/
enum class splitting_type {
    BAYESIAN_INFORMATION_CRITERION = 0,         /**< Bayesian information criterion (BIC) to approximate the correct number of clusters. */
    MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1,   /**< Minimum noiseless description length (MNDL) to approximate the correct number of clusters. */
};


/*!

@class      xmeans xmeans.hpp pyclustering/cluster/xmeans.hpp

@brief      Class represents clustering algorithm X-Means.
@details    X-means clustering method starts with the assumption of having a minimum number of clusters,
             and then dynamically increases them. X-means uses specified splitting criterion to control
             the process of splitting clusters. Method K-Means++ can be used for calculation of initial centers.

*/
class xmeans {
private:
    const static std::size_t        AMOUNT_CENTER_CANDIDATES;

    const static double             DEFAULT_SPLIT_DIFFERENCE;

public:
    const static double             DEFAULT_TOLERANCE;              /**< Default value of the tolerance stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing. */

    const static splitting_type     DEFAULT_SPLITTING_TYPE;         /**< Default splitting criteria that is used by the X-Means algorithm. */

    const static double             DEFAULT_MNDL_ALPHA_PROBABILISTIC_VALUE;     /**< Default MNDL alpha probabilistic value. */

    const static double             DEFAULT_MNDL_BETA_PROBABILISTIC_VALUE;      /**< Default MNDL beta probabilistic value. */

private:
    dataset                 m_initial_centers;

    xmeans_data             * m_ptr_result        = nullptr;   /* temporary pointer to output result */

    const dataset           * m_ptr_data          = nullptr;     /* used only during processing */

    double                  m_alpha               = DEFAULT_MNDL_ALPHA_PROBABILISTIC_VALUE;

    double                  m_beta                = DEFAULT_MNDL_BETA_PROBABILISTIC_VALUE;

    std::size_t             m_maximum_clusters;

    double                  m_tolerance           = DEFAULT_TOLERANCE;

    splitting_type          m_criterion           = splitting_type::BAYESIAN_INFORMATION_CRITERION;

    std::size_t             m_repeat              = 1;

    long long               m_random_state        = RANDOM_STATE_CURRENT_TIME;

    distance_metric<point>  m_metric;

public:
    /*!
    
    @brief    Constructor of X-Means clustering algorithm.
    
    @param[in] p_initial_centers: initial centers that are used for processing.
    @param[in] p_kmax: maximum number of clusters that can be allocated.
    @param[in] p_tolerance: stop condition in following way: when maximum value of distance change of
                cluster centers is less than tolerance than algorithm will stop processing.
    @param[in] p_criterion: splitting criterion that is used for making descision about cluster splitting (by default `splitting_type::BAYESIAN_INFORMATION_CRITERION`).
    @param[in] p_repeat: how many times K-Means should be run to improve parameters (by default is 1), 
                with larger 'repeat' values suggesting higher probability of finding global optimum.
    @param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
    @param[in] p_metric: distance metric calculator for two points (by default is Euclidean Square metric).
    
    */
    xmeans(const dataset & p_initial_centers, 
           const std::size_t p_kmax, 
           const double p_tolerance = DEFAULT_TOLERANCE, 
           const splitting_type p_criterion = DEFAULT_SPLITTING_TYPE,
           const std::size_t p_repeat = 1, 
           const long long p_random_state = RANDOM_STATE_CURRENT_TIME,
           const distance_metric<point> & p_metric = distance_metric_factory<point>::euclidean_square());

    /*!
    
    @brief    Default destructor of the algorithm.
    
    */
    ~xmeans() = default;

public:
    /*!
    
    @brief    Performs cluster analysis of an input data.
    
    @param[in]  p_data: input data for cluster analysis.
    @param[out] p_result: clustering result of an input data.
    
    */
    void process(const dataset & p_data, xmeans_data & p_result);

    /*!

    @brief    Set alpha based probabilistic bound \f$\Q\left(\alpha\right)\f$ that is distributed from [0, 1].
    @details  The alpha probabilistic bound is used only in case of MNDL splitting criteria, in all other cases this value is ignored.

    @param[in]  p_alpha: value distributed [0.0, 1.0] for alpha probabilistic bound \f$\Q\left(\alpha\right)\f$.

    */
    void set_mndl_alpha_bound(const double p_alpha);

    /*!

    @brief    Set beta based probabilistic bound \f$\Q\left(\beta\right)\f$ that is distributed from [0, 1].
    @details  The beta probabilistic bound is used only in case of MNDL splitting criteria, in all other cases this value is ignored.

    @param[in]  p_alpha: value distributed [0.0, 1.0] for beta probabilistic bound \f$\Q\left(\beta\right)\f$.

    */
    void set_mndl_beta_bound(const double p_beta);

private:
    void improve_structure();

    void improve_region_structure(const cluster & p_cluster, const point & p_center, dataset & p_allocated_centers) const;

    double search_optimal_parameters(cluster_sequence & improved_clusters, dataset & improved_centers, const index_sequence & available_indexes) const;

    double improve_parameters(cluster_sequence & improved_clusters, dataset & improved_centers, const index_sequence & available_indexes) const;

    double splitting_criterion(const cluster_sequence & analysed_clusters, const dataset & analysed_centers) const;

    double bayesian_information_criterion(const cluster_sequence & analysed_clusters, const dataset & analysed_centers) const;

    double minimum_noiseless_description_length(const cluster_sequence & clusters, const dataset & centers) const;
};


}

}