1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
|
/*!
@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause
*/
#pragma once
#include <mutex>
#include <vector>
#include <pyclustering/cluster/xmeans_data.hpp>
#include <pyclustering/utils/metric.hpp>
using namespace pyclustering::utils::metric;
namespace pyclustering {
namespace clst {
/*!
@brief Defines splitting types for clusters that are used by X-Means algorithm.
*/
enum class splitting_type {
BAYESIAN_INFORMATION_CRITERION = 0, /**< Bayesian information criterion (BIC) to approximate the correct number of clusters. */
MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1, /**< Minimum noiseless description length (MNDL) to approximate the correct number of clusters. */
};
/*!
@class xmeans xmeans.hpp pyclustering/cluster/xmeans.hpp
@brief Class represents clustering algorithm X-Means.
@details X-means clustering method starts with the assumption of having a minimum number of clusters,
and then dynamically increases them. X-means uses specified splitting criterion to control
the process of splitting clusters. Method K-Means++ can be used for calculation of initial centers.
*/
class xmeans {
private:
const static std::size_t AMOUNT_CENTER_CANDIDATES;
const static double DEFAULT_SPLIT_DIFFERENCE;
public:
const static double DEFAULT_TOLERANCE; /**< Default value of the tolerance stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing. */
const static splitting_type DEFAULT_SPLITTING_TYPE; /**< Default splitting criteria that is used by the X-Means algorithm. */
const static double DEFAULT_MNDL_ALPHA_PROBABILISTIC_VALUE; /**< Default MNDL alpha probabilistic value. */
const static double DEFAULT_MNDL_BETA_PROBABILISTIC_VALUE; /**< Default MNDL beta probabilistic value. */
private:
dataset m_initial_centers;
xmeans_data * m_ptr_result = nullptr; /* temporary pointer to output result */
const dataset * m_ptr_data = nullptr; /* used only during processing */
double m_alpha = DEFAULT_MNDL_ALPHA_PROBABILISTIC_VALUE;
double m_beta = DEFAULT_MNDL_BETA_PROBABILISTIC_VALUE;
std::size_t m_maximum_clusters;
double m_tolerance = DEFAULT_TOLERANCE;
splitting_type m_criterion = splitting_type::BAYESIAN_INFORMATION_CRITERION;
std::size_t m_repeat = 1;
long long m_random_state = RANDOM_STATE_CURRENT_TIME;
distance_metric<point> m_metric;
public:
/*!
@brief Constructor of X-Means clustering algorithm.
@param[in] p_initial_centers: initial centers that are used for processing.
@param[in] p_kmax: maximum number of clusters that can be allocated.
@param[in] p_tolerance: stop condition in following way: when maximum value of distance change of
cluster centers is less than tolerance than algorithm will stop processing.
@param[in] p_criterion: splitting criterion that is used for making descision about cluster splitting (by default `splitting_type::BAYESIAN_INFORMATION_CRITERION`).
@param[in] p_repeat: how many times K-Means should be run to improve parameters (by default is 1),
with larger 'repeat' values suggesting higher probability of finding global optimum.
@param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
@param[in] p_metric: distance metric calculator for two points (by default is Euclidean Square metric).
*/
xmeans(const dataset & p_initial_centers,
const std::size_t p_kmax,
const double p_tolerance = DEFAULT_TOLERANCE,
const splitting_type p_criterion = DEFAULT_SPLITTING_TYPE,
const std::size_t p_repeat = 1,
const long long p_random_state = RANDOM_STATE_CURRENT_TIME,
const distance_metric<point> & p_metric = distance_metric_factory<point>::euclidean_square());
/*!
@brief Default destructor of the algorithm.
*/
~xmeans() = default;
public:
/*!
@brief Performs cluster analysis of an input data.
@param[in] p_data: input data for cluster analysis.
@param[out] p_result: clustering result of an input data.
*/
void process(const dataset & p_data, xmeans_data & p_result);
/*!
@brief Set alpha based probabilistic bound \f$\Q\left(\alpha\right)\f$ that is distributed from [0, 1].
@details The alpha probabilistic bound is used only in case of MNDL splitting criteria, in all other cases this value is ignored.
@param[in] p_alpha: value distributed [0.0, 1.0] for alpha probabilistic bound \f$\Q\left(\alpha\right)\f$.
*/
void set_mndl_alpha_bound(const double p_alpha);
/*!
@brief Set beta based probabilistic bound \f$\Q\left(\beta\right)\f$ that is distributed from [0, 1].
@details The beta probabilistic bound is used only in case of MNDL splitting criteria, in all other cases this value is ignored.
@param[in] p_alpha: value distributed [0.0, 1.0] for beta probabilistic bound \f$\Q\left(\beta\right)\f$.
*/
void set_mndl_beta_bound(const double p_beta);
private:
void improve_structure();
void improve_region_structure(const cluster & p_cluster, const point & p_center, dataset & p_allocated_centers) const;
double search_optimal_parameters(cluster_sequence & improved_clusters, dataset & improved_centers, const index_sequence & available_indexes) const;
double improve_parameters(cluster_sequence & improved_clusters, dataset & improved_centers, const index_sequence & available_indexes) const;
double splitting_criterion(const cluster_sequence & analysed_clusters, const dataset & analysed_centers) const;
double bayesian_information_criterion(const cluster_sequence & analysed_clusters, const dataset & analysed_centers) const;
double minimum_noiseless_description_length(const cluster_sequence & clusters, const dataset & centers) const;
};
}
}
|