1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
|
/*!
@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause
*/
#pragma once
#include <memory>
#include <pyclustering/cluster/kmedoids_data.hpp>
#include <pyclustering/utils/metric.hpp>
using namespace pyclustering::utils::metric;
namespace pyclustering {
namespace clst {
/*!
@brief Defines data representation (point, distance matrix) that is used for processing by K-Medoids algorithm.
*/
enum class kmedoids_data_t {
POINTS,
DISTANCE_MATRIX
};
/*!
@brief Represents K-Medoids clustering algorithm (PAM algorithm) for cluster analysis.
@details PAM is a partitioning clustering algorithm that uses the medoids instead of centers like in case of K-Means
algorithm. Medoid is an object with the smallest dissimilarity to all others in the cluster. PAM algorithm
complexity is \f$O\left ( k\left ( n-k \right )^{2} \right )\f$.
Implementation based on paper @cite inproceedings::cluster::kmedoids::1.
*/
class kmedoids {
public:
static const double DEFAULT_TOLERANCE; /**< Default value of the tolerance stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing. */
static const std::size_t DEFAULT_ITERMAX; /**< Default value of the step stop condition - maximum number of iterations that is used for clustering process. */
private:
static const std::size_t OBJECT_ALREADY_CONTAINED;
static const std::size_t INVALID_INDEX;
static const double NOTHING_TO_SWAP;
private:
using distance_calculator = std::function<double(const std::size_t, const std::size_t)>;
struct appropriate_cluster {
public:
appropriate_cluster() = default;
appropriate_cluster(const std::size_t p_index, const double p_distance_first_medoid, const double p_distance_second_medoid);
public:
std::size_t m_index = INVALID_INDEX;
double m_distance_to_first_medoid = -1.0;
double m_distance_to_second_medoid = -1.0;
};
private:
const dataset * m_data_ptr = nullptr; /* temporary pointer to input data that is used only during processing */
kmedoids_data * m_result_ptr = nullptr; /* temporary pointer to clustering result that is used only during processing */
medoid_sequence m_initial_medoids = { };
double m_tolerance = DEFAULT_TOLERANCE;
std::size_t m_itermax = DEFAULT_ITERMAX;
index_sequence m_labels;
std::vector<double> m_distance_first_medoid;
std::vector<double> m_distance_second_medoid;
distance_metric<point> m_metric;
distance_calculator m_calculator;
public:
/*!
@brief Default constructor of clustering algorithm.
*/
kmedoids() = default;
/*!
@brief Constructor of clustering algorithm where algorithm parameters for processing are
specified.
@param[in] p_initial_medoids: initial medoids that are used for processing.
@param[in] p_tolerance: stop condition in following way: when maximum value of distance change of
medoids of clusters is less than tolerance than algorithm will stop processing.
@param[in] p_itermax: maximum amount of iterations (by default kmedoids::DEFAULT_ITERMAX).
@param[in] p_metric: distance metric calculator for two points.
*/
kmedoids(const medoid_sequence & p_initial_medoids,
const double p_tolerance = DEFAULT_TOLERANCE,
const std::size_t p_itermax = DEFAULT_ITERMAX,
const distance_metric<point> & p_metric = distance_metric_factory<point>::euclidean_square());
/*!
@brief Default destructor of the algorithm.
*/
~kmedoids();
public:
/*!
@brief Performs cluster analysis of an input data.
@param[in] p_data: input data for cluster analysis.
@param[out] p_result: clustering result of an input data.
*/
void process(const dataset & p_data, kmedoids_data & p_result);
/*!
@brief Performs cluster analysis of an input data.
@param[in] p_data: input data for cluster analysis.
@param[in] p_type: data type (points or distance matrix).
@param[out] p_result: clustering result of an input data.
*/
void process(const dataset & p_data, const kmedoids_data_t p_type, kmedoids_data & p_result);
private:
/*!
@brief Updates clusters in line with current medoids.
*/
double update_clusters();
/*!
@brief Creates distance calcultor in line with data type and distance metric metric.
@param[in] p_type: data type (points or distance matrix).
@return Distance calculator.
*/
distance_calculator create_distance_calculator(const kmedoids_data_t p_type);
/*!
@brief Find appropriate cluster for the particular point.
@param[in] p_index: Index of point that should be placed to cluster.
@param[in] p_medoids: Medoids that corresponds to clusters.
@return Index of cluster that is appropriate for the particular point and distance from this point to correspoding medoid.
If point is a medoid then OBJECT_ALREADY_CONTAINED value is returned.
*/
appropriate_cluster find_appropriate_cluster(const std::size_t p_index, medoid_sequence & p_medoids);
/*!
@brief Swap existed medoid with non-medoid points in order to find the most optimal medoid.
@return Cost that is needed to swap medoid and non-medoid point.
*/
double swap_medoids();
/*!
@brief Calculates cost to swap `p_index_candidate` with the current medoid `p_index_cluster`.
@param[in] p_index_candidate: index point that is considered as a medoid candidate.
@param[in] p_index_cluster: index of a cluster where the current medoid is used for calculation.
@return Cost that is needed to swap medoids.
*/
double calculate_swap_cost(const std::size_t p_index_candidate, const std::size_t p_index_cluster) const;
/*!
@brief Erase empty clusters and their medoids.
@details Data might have identical points and a lot of identical points and as a result medoids might correspond
to points that are totally identical.
*/
void erase_empty_clusters();
};
}
}
|