1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
|
/*!
@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause
*/
#pragma once
#include <pyclustering/cluster/bsas_data.hpp>
#include <pyclustering/utils/metric.hpp>
using namespace pyclustering::utils::metric;
namespace pyclustering {
namespace clst {
/*!
@class bsas bsas.hpp pyclustering/cluster/bsas.hpp
@brief Class represents BSAS clustering algorithm - basic sequential algorithmic scheme.
@details Algorithm has two mandatory parameters: maximum allowable number of clusters and threshold
of dissimilarity or in other words maximum distance between points. Distance metric also can
be specified using 'metric' parameters, by default 'Manhattan' distance is used.
BSAS using following rule for updating cluster representative:
\f[
\vec{m}_{C_{k}}^{new}=\frac{ \left ( n_{C_{k}^{new}} - 1 \right )\vec{m}_{C_{k}}^{old} + \vec{x} }{n_{C_{k}^{new}}}
\f]
Clustering results of this algorithm depends on objects order in input data.
Example of cluster analysis using BSAS algorithm:
@code
using namespace pyclustering;
using namespace pyclustering::clst;
int main() {
dataset data = read_data("Simple02.txt");
// Algorithm configuration: amount of clusters to allocate and threshold of dissimilarity.
const std::size_t amount_clusters = 3;
const double threshold = 1.0;
// Create and run BSAS algorithm.
bsas_data result;
bsas(amount_clusters, threshold).process(data, result);
// Extract clustering results: clusters and representative points.
const cluster_sequence & clusters = result.clusters();
const representative_sequence & representatives = result.representatives();
// Display allocated clusters.
for (const auto group : clusters) {
for (const auto index : group) { std::cout << index << " "; }
std::cout << std::endl;
}
return 0;
}
@endcode
There is another one example where distance metric is specified:
@code
// Create manhattan distance metric.
auto metric = distance_metric_factory<point>::manhattan();
// Create BSAS and run clustering algorithm using Manhattan distance.
bsas_data result;
bsas(amount_clusters, threshold, metric).process(data, result);
@endcode
Implementation based on paper @cite book::pattern_recognition::2009.
*/
class bsas {
protected:
/*!
@brief Description of the nearest cluster to a point that is described by cluster index and distance from the cluster to the specified point.
*/
struct nearest_cluster {
std::size_t m_index = (std::size_t) -1; /**< Cluster index. */
double m_distance = std::numeric_limits<double>::max(); /**< Distance between the cluster and a specific point. */
};
protected:
bsas_data * m_result_ptr = nullptr; /**< Temporary pointer to clustering result that is used only during processing. */
double m_threshold = 0.0; /**< Threshold of dissimilarity (maximum distance) between points. */
std::size_t m_amount = 0; /**< Amount of clusters that should be allocated. */
distance_metric<point> m_metric; /**< Metric for distance calculation between points. */
public:
/*!
@brief Default constructor of the clustering algorithm.
*/
bsas() = default;
/*!
@brief Creates BSAS algorithm using specified parameters.
@param[in] p_amount: amount of clusters that should be allocated.
@param[in] p_threshold: threshold of dissimilarity (maximum distance) between points.
@param[in] p_metric: metric for distance calculation between points.
*/
bsas(const std::size_t p_amount,
const double p_threshold,
const distance_metric<point> & p_metric = distance_metric_factory<point>::euclidean());
public:
/*!
@brief Performs cluster analysis of an input data.
@param[in] p_data: input data for cluster analysis.
@param[out] p_result: clustering result of an input data.
*/
virtual void process(const dataset & p_data, bsas_data & p_result);
protected:
/*!
@brief Find nearest cluster to the specified point.
@param[in] p_point: point for which nearest cluster is searched.
@return Description of nearest cluster that is defined by cluster index and distance to the point.
*/
nearest_cluster find_nearest_cluster(const point & p_point) const;
/*!
@brief Update cluster representative in line with new cluster size and added point to it.
@param[in] p_index: index of cluster whose representative should be updated.
@param[in] p_point: new point that was added to cluster.
*/
void update_representative(const std::size_t p_index, const point & p_point);
};
}
}
|