File: bsas.hpp

package info (click to toggle)
python-pyclustering 0.10.1.2-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 11,128 kB
  • sloc: cpp: 38,888; python: 24,311; sh: 384; makefile: 105
file content (162 lines) | stat: -rwxr-xr-x 5,083 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
/*!

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause

*/

#pragma once


#include <pyclustering/cluster/bsas_data.hpp>

#include <pyclustering/utils/metric.hpp>


using namespace pyclustering::utils::metric;


namespace pyclustering {

namespace clst {

/*!

@class    bsas bsas.hpp pyclustering/cluster/bsas.hpp

@brief    Class represents BSAS clustering algorithm - basic sequential algorithmic scheme.
@details  Algorithm has two mandatory parameters: maximum allowable number of clusters and threshold
           of dissimilarity or in other words maximum distance between points. Distance metric also can
           be specified using 'metric' parameters, by default 'Manhattan' distance is used.
           BSAS using following rule for updating cluster representative:

\f[
\vec{m}_{C_{k}}^{new}=\frac{ \left ( n_{C_{k}^{new}} - 1 \right )\vec{m}_{C_{k}}^{old} + \vec{x} }{n_{C_{k}^{new}}}
\f]

Clustering results of this algorithm depends on objects order in input data.

Example of cluster analysis using BSAS algorithm:
@code
    using namespace pyclustering;
    using namespace pyclustering::clst;

    int main() {
        dataset data = read_data("Simple02.txt");

        // Algorithm configuration: amount of clusters to allocate and threshold of dissimilarity.
        const std::size_t amount_clusters = 3;
        const double threshold = 1.0;

        // Create and run BSAS algorithm.
        bsas_data result;
        bsas(amount_clusters, threshold).process(data, result);

        // Extract clustering results: clusters and representative points.
        const cluster_sequence & clusters = result.clusters();
        const representative_sequence & representatives = result.representatives();

        // Display allocated clusters.
        for (const auto group : clusters) {
            for (const auto index : group) { std::cout << index << " "; }
            std::cout << std::endl;
        }

        return 0;
    }
@endcode

There is another one example where distance metric is specified:
@code
    // Create manhattan distance metric.
    auto metric = distance_metric_factory<point>::manhattan();

    // Create BSAS and run clustering algorithm using Manhattan distance.
    bsas_data result;
    bsas(amount_clusters, threshold, metric).process(data, result);
@endcode

Implementation based on paper @cite book::pattern_recognition::2009.

*/
class bsas {
protected:
    /*!

    @brief    Description of the nearest cluster to a point that is described by cluster index and distance from the cluster to the specified point.

    */
    struct nearest_cluster {
        std::size_t   m_index       = (std::size_t) -1;                     /**< Cluster index. */
        double        m_distance    = std::numeric_limits<double>::max();   /**< Distance between the cluster and a specific point. */
    };

protected:
    bsas_data       * m_result_ptr  = nullptr;  /**< Temporary pointer to clustering result that is used only during processing. */

    double          m_threshold     = 0.0;      /**< Threshold of dissimilarity (maximum distance) between points. */

    std::size_t     m_amount        = 0;        /**< Amount of clusters that should be allocated. */

    distance_metric<point>          m_metric;   /**< Metric for distance calculation between points. */

public:
    /*!

    @brief    Default constructor of the clustering algorithm.

    */
    bsas() = default;

    /*!

    @brief    Creates BSAS algorithm using specified parameters.

    @param[in] p_amount: amount of clusters that should be allocated.
    @param[in] p_threshold: threshold of dissimilarity (maximum distance) between points.
    @param[in] p_metric: metric for distance calculation between points.

    */
    bsas(const std::size_t p_amount,
         const double p_threshold,
         const distance_metric<point> & p_metric = distance_metric_factory<point>::euclidean());

public:
    /*!
    
    @brief    Performs cluster analysis of an input data.
    
    @param[in]  p_data: input data for cluster analysis.
    @param[out] p_result: clustering result of an input data.
    
    */
    virtual void process(const dataset & p_data, bsas_data & p_result);

protected:
    /*!

    @brief    Find nearest cluster to the specified point.

    @param[in] p_point: point for which nearest cluster is searched.

    @return   Description of nearest cluster that is defined by cluster index and distance to the point.

    */
    nearest_cluster find_nearest_cluster(const point & p_point) const;

    /*!

    @brief    Update cluster representative in line with new cluster size and added point to it.

    @param[in] p_index: index of cluster whose representative should be updated.
    @param[in] p_point: new point that was added to cluster.

    */
    void update_representative(const std::size_t p_index, const point & p_point);
};


}

}