File: kmedians.hpp

package info (click to toggle)
python-pyclustering 0.10.1.2-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 11,128 kB
  • sloc: cpp: 38,888; python: 24,311; sh: 384; makefile: 105
file content (153 lines) | stat: -rwxr-xr-x 4,721 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/*!

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause

*/

#pragma once


#include <memory>

#include <pyclustering/cluster/kmedians_data.hpp>

#include <pyclustering/utils/metric.hpp>


using namespace pyclustering::utils::metric;


namespace pyclustering {

namespace clst {


/**
*
* @brief    Represents K-Medians clustering algorithm for cluster analysis.
* @details  The algorithm related to partitional class when input data is divided into groups.
*
*/
class kmedians {
public:
    const static double         DEFAULT_TOLERANCE;  /**< Default value of the tolerance stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing. */

    const static std::size_t    DEFAULT_ITERMAX;    /**< Default value of the step stop condition - maximum number of iterations that is used for clustering process. */

private:
    const static double         THRESHOLD_CHANGE;

private:
    double                  m_tolerance         = 0.0;

    std::size_t             m_max_iter          = 0;

    dataset                 m_initial_medians   = { };

    kmedians_data         * m_ptr_result        = nullptr;   /* temporary pointer to output result */

    const dataset         * m_ptr_data          = nullptr;     /* used only during processing */

    distance_metric<point>  m_metric;

public:
    /**
    *
    * @brief    Default constructor of clustering algorithm.
    *
    */
    kmedians() = default;

    /**
    *
    * @brief    Constructor of clustering algorithm where algorithm parameters for processing are
    *           specified.
    *
    * @param[in] p_initial_medians: initial medians that are used for processing.
    * @param[in] p_tolerance: stop condition in following way: when maximum value of distance change of
    *             medians of clusters is less than tolerance than algorithm will stop processing.
    * @param[in] p_max_iter: maximum amount of iteration for clustering.
    * @param[in] p_metric: distance metric for distance calculation between objects.
    *
    */
    kmedians(const dataset & p_initial_medians, 
             const double p_tolerance = DEFAULT_TOLERANCE,
             const std::size_t p_max_iter = DEFAULT_ITERMAX,
             const distance_metric<point> & p_metric = distance_metric_factory<point>::euclidean_square());

    /**
    *
    * @brief    Default destructor of the algorithm.
    *
    */
    ~kmedians() = default;

public:
    /**
    *
    * @brief    Performs cluster analysis of an input data.
    *
    * @param[in]  p_data: input data for cluster analysis.
    * @param[out] p_output_result: clustering result of an input data.
    *
    */
    void process(const dataset & p_data, kmedians_data & p_output_result);

private:
    /**
    *
    * @brief    Updates clusters in line with current medians.
    *
    * @param[in] p_medians: medians that are used for updating clusters.
    * @param[out] p_clusters: updated clusters in line with the specified medians.
    *
    */
    void update_clusters(const dataset & p_medians, cluster_sequence & p_clusters);

    /**
    *
    * @brief    Assign point to cluster by marking corresponding index in container 'p_lables'.
    *
    * @param[in] p_index_point: index of point that should be assigned to cluster.
    * @param[in] p_medians: medians that corresponds to clusters.
    * @param[out] p_lables: cluster labels for each point (cluster labels has the same size as an input data).
    *
    */
    void assign_point_to_cluster(const std::size_t p_index_point, const dataset & p_medians, index_sequence & p_lables);

    /**
    *
    * @brief    Updates medians in line with current clusters.
    *
    * @param[in,out] clusters: clusters that are sorted and used for updating medians.
    * @param[out] medians: updated medians in line with the specified clusters.
    *
    */
    double update_medians(cluster_sequence & clusters, dataset & medians);

    /**
    *
    * @brief    Calculate median for particular cluster.
    *
    * @param[in,out] current_cluster: cluster that is sorted and used for updating medians.
    * @param[out] median: calculate median for particular cluster.
    *
    */
    void calculate_median(cluster & current_cluster, point & median);

    /**
    *
    * @brief    Erases clusters that do not have any points.
    *
    * @param[in,out] p_clusters: clusters that should be analyzed and modified.
    *
    */
    static void erase_empty_clusters(cluster_sequence & p_clusters);
};


}

}