1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
|
/*!
@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause
*/
#pragma once
#include <pyclustering/cluster/cluster_data.hpp>
#include <pyclustering/cluster/silhouette_data.hpp>
#include <pyclustering/definitions.hpp>
#include <pyclustering/utils/metric.hpp>
using namespace pyclustering::utils::metric;
namespace pyclustering {
namespace clst {
/*!
@brief Defines types that are used for input data representation.
*/
enum class silhouette_data_t {
POINTS,
DISTANCE_MATRIX
};
/*!
@class silhouette silhouette.hpp pyclustering/cluster/silhouette.hpp
@brief Represents Silhouette method that is used interpretation and validation of consistency.
@details The silhouette value is a measure of how similar an object is to its own cluster compared to other clusters.
Be aware that silhouette method is applicable for K algorithm family, such as K-Means, K-Medians,
K-Medoids, X-Means, etc., not not applicable for DBSCAN, OPTICS, CURE, etc. The Silhouette value is
calculated using following formula:
\f[s\left ( i \right )=\frac{ b\left ( i \right ) - a\left ( i \right ) }{ max\left \{ a\left ( i \right ), b\left ( i \right ) \right \}}\f]
where \f$a\left ( i \right )\f$ - is average distance from object i to objects in its own cluster,
\f$b\left ( i \right )\f$ - is average distance from object i to objects in the nearest cluster (the appropriate among other clusters).
Here is an example where Silhouette score is calculated for K-Means's clustering result:
@code
#include <pyclustering/cluster/kmeans_plus_plus.hpp>
#include <pyclustering/cluster/kmeans.hpp>
#include <pyclustering/cluster/silhouette.hpp>
#include <iostream>
// ... `read_data` implementation to read sample ...
int main() {
// Read two-dimensional input data 'Simple03'.
dataset data = read_data("Simple03.txt");
// Prepare initial centers for K-Means algorithm.
dataset initial_centers;
const std::size_t amount_clusters = 4;
const std::size_t candidates_to_consider = 5;
kmeans_plus_plus(amount_clusters, candidates_to_consider).initialize(data, initial_centers);
// Perform cluster analysis.
auto kmeans_instance = kmeans(initial_centers);
kmeans_data clustering_result;
kmeans_instance.process(data, clustering_result);
// Obtain allocated clusters.
const auto & clusters = clustering_result.clusters();
// Calculate Silhouette score.
silhouette_data estimation_result;
silhouette().process(data, clusters, estimation_result);
// Print Silhouette score for each point.
for (const auto score : estimation_result.get_score()) {
std::cout << score << std::endl;
}
return 0;
}
@endcode
Here is an illustration where clustering has been performed using various `K` values (2, 4, 6 and 8) for
the same sample as before. `K = 4` is the optimal amount of clusters in line with Silhouette method because
the score for each point is close to `1.0` and the average score for `K = 4` is biggest value among others `K`.
@image html silhouette_score_for_various_K.png "Fig. 1. Silhouette scores for various K."
Implementation based on paper @cite article::cluster::silhouette::1.
@see kmeans, kmedoids, kmedians, xmeans, elbow
*/
class silhouette {
private:
const dataset * m_data = nullptr; /* temporary object, exists during processing */
const cluster_sequence * m_clusters = nullptr; /* temporary object, exists during processing */
silhouette_data * m_result = nullptr; /* temporary object, exists during processing */
silhouette_data_t m_type = silhouette_data_t::POINTS;
distance_metric<point> m_metric = distance_metric_factory<point>::euclidean_square();
public:
/*!
@brief Default constructor for Silhouette method.
*/
silhouette() = default;
/*!
@brief Constructor for Silhouette method with specific parameters.
@param[in] p_metric: metric that was used for cluster analysis and should be used for Silhouette
score calculation (by default Square Euclidean distance).
*/
explicit silhouette(const distance_metric<point> & p_metric);
/*!
@brief Default copy constructor for Silhouette method.
*/
silhouette(const silhouette & p_other) = default;
/*!
@brief Default move constructor for Silhouette method.
*/
silhouette(silhouette && p_other) = default;
/*!
@brief Default destructor for Silhouette method.
*/
~silhouette() = default;
public:
/*!
@brief Performs analysis of an input data in order to calculate score for each point where input data is represented by points.
@param[in] p_data: input data (points) for analysis.
@param[in] p_clusters: clusters that have been obtained after cluster analysis.
@param[out] p_result: silhouette input data processing result.
*/
void process(const dataset & p_data, const cluster_sequence & p_clusters, silhouette_data & p_result);
/*!
@brief Performs analysis of an input data in order to calculate score for each point.
@param[in] p_data: input data for analysis.
@param[in] p_clusters: clusters that have been obtained after cluster analysis.
@param[in] p_type: data type of input sample `p_data` that is processed by the method (`POINTS`, `DISTANCE_MATRIX`).
@param[out] p_result: silhouette input data processing result.
*/
void process(const dataset & p_data, const cluster_sequence & p_clusters, const silhouette_data_t & p_type, silhouette_data & p_result);
private:
double calculate_score(const std::size_t p_index_point, const std::size_t p_index_cluster) const;
void calculate_dataset_difference(const std::size_t p_index_point, std::vector<double> & p_dataset_difference) const;
double calculate_cluster_difference(const std::size_t p_index_cluster, const std::vector<double> & p_dataset_difference) const;
double calculate_within_cluster_score(const std::size_t p_index_cluster, const std::vector<double> & p_dataset_difference) const;
double calculate_cluster_score(const std::size_t p_index_cluster, const std::vector<double> & p_dataset_difference) const;
double caclulate_optimal_neighbor_cluster_score(const std::size_t p_index_cluster, const std::vector<double> & p_dataset_difference) const;
};
}
}
|