1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
|
/*!
@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause
*/
#pragma once
#include <mutex>
#include <vector>
#include <pyclustering/cluster/gmeans_data.hpp>
#include <pyclustering/utils/metric.hpp>
using namespace pyclustering::utils::metric;
namespace pyclustering {
namespace clst {
/*!
@class gmeans gmeans.hpp pyclustering/cluster/gmeans.hpp
@brief Represents G-Means clustering algorithm for cluster analysis.
@details The G-means algorithm starts with a small number of centers, and grows the number of centers.
Each iteration of the G-Means algorithm splits into two those centers whose data appear not to come from a
Gaussian distribution. G-means repeatedly makes decisions based on a statistical test for the data
assigned to each center.
@image html gmeans_example_clustering.png "G-Means clustering results on most common data-sets."
Example #1. In this example, G-Means starts analysis from single cluster.
@code
int main() {
// Read two-dimensional input data 'Lsun'.
dataset data = read_data("Lsun.txt");
// Create and run G-Means clustering algorithm.
// By default the algorithm starts search from a single cluster.
gmeans_data result;
gmeans().process(data, result);
// Obtain clustering results.
const cluster_sequence & clusters = result.clusters();
const dataset & centers = result.centers();
// Display results to console.
for (std::size_t i = 0; i < clusters.size(); i++) {
std::cout << "Cluster #" << i + 1 << std::endl;
std::cout << " - Center: ";
const auto & center = centers[i];
for (const auto coordinate : center) {
std::cout << coordinate << " ";
}
std::cout << std::endl;
std::cout << "- Size: " << clusters[i].size() << std::endl << std::endl;
}
return 0;
}
@endcode
Example #2. Sometimes G-Means may found local optimum. 'repeat' value can be used to increase probability to
find global optimum. Argument 'repeat' defines how many times K-Means clustering with K-Means++
initialization should be run to find optimal clusters.
@code
// Create and run G-Means clustering algorithm.
const std::size_t initial_k = 1;
const double tolerance = gmeans::DEFAULT_TOLERANCE;
const std::size_t repeat = 5; // Repeat each iteration 5 time to find optimum.
gmeans_data result;
gmeans(initial_k, tolerance, repeat).process(data, result);
@endcode
Implementation based on the paper @cite inproceedings::cluster::gmeans::1.
*/
class gmeans {
private:
using projection = std::vector<double>;
public:
const static long long IGNORE_KMAX; /**< Defines value that means to ignore K maximum value. */
const static std::size_t DEFAULT_AMOUNT_CENTERS; /**< Defaule value of amount of initial K - the value from that the search procedure is started. */
const static double DEFAULT_TOLERANCE; /**< Default value of the tolerance (stop condition): if the maximum value of cluster changes is less than tolerance then the algorithm stops processing. */
const static std::size_t DEFAULT_REPEAT; /**< Default value that defines how many times K-Means should be run to improve parameters. */
const static std::size_t DEFAULT_CANDIDATES; /**< Default value of amount of candidates to consider by K-Means++ to initialize initial centers for K-Means on each iteration. */
private:
std::size_t m_amount = DEFAULT_AMOUNT_CENTERS;
double m_tolerance = DEFAULT_TOLERANCE;
std::size_t m_repeat = DEFAULT_REPEAT;
long long m_kmax = IGNORE_KMAX;
long long m_random_state = RANDOM_STATE_CURRENT_TIME;
gmeans_data * m_ptr_result = nullptr; /* temporary pointer to output result */
const dataset * m_ptr_data = nullptr; /* used only during processing */
public:
/*!
@brief Default constructor of G-Means clustering algorithm.
*/
gmeans() = default;
/*!
@brief Constructor of clustering algorithm where algorithm parameters for processing are
specified.
@param[in] p_k_initial: initial amount of centers.
@param[in] p_tolerance: stop condition in following way: when maximum value of distance change of
cluster centers is less than tolerance then algorithm stops processing.
@param[in] p_repeat: how many times K-Means should be run to improve parameters (by default is 3),
with larger `repeat` values suggesting higher probability of finding global optimum.
@param[in] p_kmax: maximum amount of clusters that might be allocated. The argument is considered as a stop
condition. When the maximum amount is reached then algorithm stops processing. By default the maximum
amount of clusters is not restricted (`k_max` is `IGNORE_KMAX`).
@param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
*/
gmeans(const std::size_t p_k_initial,
const double p_tolerance = DEFAULT_TOLERANCE,
const std::size_t p_repeat = DEFAULT_REPEAT,
const long long p_kmax = IGNORE_KMAX,
const long long p_random_state = RANDOM_STATE_CURRENT_TIME);
/*!
@brief Default destructor of G-Means algorithm.
*/
~gmeans() = default;
public:
/*!
@brief Performs cluster analysis of an input data.
@param[in] p_data: input data for cluster analysis.
@param[in,out] p_result: clustering result of an input data, it is also considered as an input argument to
where observer parameter can be set to collect changes of clusters and centers on each step of
processing.
*/
void process(const dataset & p_data, gmeans_data & p_result);
private:
bool is_run_condition() const;
void search_optimal_parameters(const dataset & p_data, const std::size_t p_amount, cluster_sequence & p_clusters, dataset & p_centers) const;
void statistical_optimization();
void perform_clustering();
void split_and_search_optimal(const cluster & p_cluster, dataset & p_centers) const;
static bool is_null_hypothesis(const dataset & p_data, const point & p_center1, const point & p_center2);
static std::size_t get_amount_candidates(const dataset & p_data);
static projection calculate_projection(const dataset & p_data, const point & p_vector);
};
}
}
|