File: silhouette.hpp

package info (click to toggle)
python-pyclustering 0.10.1.2-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 11,128 kB
  • sloc: cpp: 38,888; python: 24,311; sh: 384; makefile: 105
file content (198 lines) | stat: -rwxr-xr-x 6,815 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
/*!

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause

*/

#pragma once


#include <pyclustering/cluster/cluster_data.hpp>
#include <pyclustering/cluster/silhouette_data.hpp>

#include <pyclustering/definitions.hpp>

#include <pyclustering/utils/metric.hpp>


using namespace pyclustering::utils::metric;


namespace pyclustering {

namespace clst {


/*!

@brief Defines types that are used for input data representation.

*/
enum class silhouette_data_t {
    POINTS,
    DISTANCE_MATRIX
};


/*!

@class  silhouette silhouette.hpp pyclustering/cluster/silhouette.hpp

@brief      Represents Silhouette method that is used interpretation and validation of consistency.
@details    The silhouette value is a measure of how similar an object is to its own cluster compared to other clusters.
             Be aware that silhouette method is applicable for K algorithm family, such as K-Means, K-Medians,
             K-Medoids, X-Means, etc., not not applicable for DBSCAN, OPTICS, CURE, etc. The Silhouette value is
             calculated using following formula:

            \f[s\left ( i \right )=\frac{ b\left ( i \right ) - a\left ( i \right ) }{ max\left \{ a\left ( i \right ), b\left ( i \right ) \right \}}\f]

            where \f$a\left ( i \right )\f$ - is average distance from object i to objects in its own cluster,
            \f$b\left ( i \right )\f$ - is average distance from object i to objects in the nearest cluster (the appropriate among other clusters).

Here is an example where Silhouette score is calculated for K-Means's clustering result:
@code
    #include <pyclustering/cluster/kmeans_plus_plus.hpp>
    #include <pyclustering/cluster/kmeans.hpp>
    #include <pyclustering/cluster/silhouette.hpp>

    #include <iostream>
    
    // ... `read_data` implementation to read sample ...

    int main() {
        // Read two-dimensional input data 'Simple03'.
        dataset data = read_data("Simple03.txt");

        // Prepare initial centers for K-Means algorithm.
        dataset initial_centers;

        const std::size_t amount_clusters = 4;
        const std::size_t candidates_to_consider = 5;
        kmeans_plus_plus(amount_clusters, candidates_to_consider).initialize(data, initial_centers);

        // Perform cluster analysis.
        auto kmeans_instance = kmeans(initial_centers);

        kmeans_data clustering_result;
        kmeans_instance.process(data, clustering_result);

        // Obtain allocated clusters.
        const auto & clusters = clustering_result.clusters();

        // Calculate Silhouette score.
        silhouette_data estimation_result;
        silhouette().process(data, clusters, estimation_result);

        // Print Silhouette score for each point.
        for (const auto score : estimation_result.get_score()) {
            std::cout << score << std::endl;
        }

        return 0;
    }
@endcode

Here is an illustration where clustering has been performed using various `K` values (2, 4, 6 and 8) for 
the same sample as before. `K = 4` is the optimal amount of clusters in line with Silhouette method because 
the score for each point is close to `1.0` and the average score for `K = 4` is biggest value among others `K`.

@image html silhouette_score_for_various_K.png "Fig. 1. Silhouette scores for various K."

Implementation based on paper @cite article::cluster::silhouette::1.

@see kmeans, kmedoids, kmedians, xmeans, elbow

*/
class silhouette {
private:
    const dataset *           m_data      = nullptr;  /* temporary object, exists during processing */
    const cluster_sequence *  m_clusters  = nullptr;  /* temporary object, exists during processing */
    silhouette_data *         m_result    = nullptr;  /* temporary object, exists during processing */

    silhouette_data_t         m_type      = silhouette_data_t::POINTS;

    distance_metric<point>    m_metric    = distance_metric_factory<point>::euclidean_square();

public:
    /*!
    
    @brief  Default constructor for Silhouette method.
    
    */
    silhouette() = default;

    /*!

    @brief  Constructor for Silhouette method with specific parameters.

    @param[in] p_metric: metric that was used for cluster analysis and should be used for Silhouette
                score calculation (by default Square Euclidean distance).

    */
    explicit silhouette(const distance_metric<point> & p_metric);

    /*!

    @brief  Default copy constructor for Silhouette method.

    */
    silhouette(const silhouette & p_other) = default;

    /*!

    @brief  Default move constructor for Silhouette method.

    */
    silhouette(silhouette && p_other) = default;

    /*!

    @brief  Default destructor for Silhouette method.

    */
    ~silhouette() = default;

public:
    /*!

    @brief    Performs analysis of an input data in order to calculate score for each point where input data is represented by points.

    @param[in]  p_data: input data (points) for analysis.
    @param[in]  p_clusters: clusters that have been obtained after cluster analysis.
    @param[out] p_result: silhouette input data processing result.

    */
    void process(const dataset & p_data, const cluster_sequence & p_clusters, silhouette_data & p_result);

    /*!

    @brief    Performs analysis of an input data in order to calculate score for each point.

    @param[in]  p_data: input data for analysis.
    @param[in]  p_clusters: clusters that have been obtained after cluster analysis.
    @param[in]  p_type: data type of input sample `p_data` that is processed by the method (`POINTS`, `DISTANCE_MATRIX`).
    @param[out] p_result: silhouette input data processing result.

    */
    void process(const dataset & p_data, const cluster_sequence & p_clusters, const silhouette_data_t & p_type, silhouette_data & p_result);

private:
    double calculate_score(const std::size_t p_index_point, const std::size_t p_index_cluster) const;

    void calculate_dataset_difference(const std::size_t p_index_point, std::vector<double> & p_dataset_difference) const;

    double calculate_cluster_difference(const std::size_t p_index_cluster, const std::vector<double> & p_dataset_difference) const;

    double calculate_within_cluster_score(const std::size_t p_index_cluster, const std::vector<double> & p_dataset_difference) const;

    double calculate_cluster_score(const std::size_t p_index_cluster, const std::vector<double> & p_dataset_difference) const;

    double caclulate_optimal_neighbor_cluster_score(const std::size_t p_index_cluster, const std::vector<double> & p_dataset_difference) const;
};


}

}