1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
|
/*!
@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause
*/
#pragma once
#include <pyclustering/cluster/clique_data.hpp>
#include <list>
#include <unordered_map>
namespace pyclustering {
namespace clst {
/*!
@class coordinate_iterator clique.hpp pyclustering/cluster/clique.hpp
@brief Coordinate iterator is used to generate logical location description for each CLIQUE block.
@details This class is used by CLIQUE algorithm for clustering process.
*/
class coordinate_iterator {
private:
std::size_t m_dimension = 0;
std::size_t m_edge = 0;
clique_block_location m_coordinate;
public:
/*!
@brief Constructs coordinate iterator for CLIQUE algorithm.
@param[in] p_dimension: amount of dimensions in input data space.
@param[in] p_edge: amount of intervals in each dimension.
*/
coordinate_iterator(const std::size_t p_dimension, const std::size_t p_edge);
public:
/*!
@brief Returns constant reference to current block coordinate.
*/
const clique_block_location & get_coordinate() const noexcept;
/*!
@brief Returns reference to current block coordinate.
*/
clique_block_location & get_coordinate() noexcept;
public:
/*!
@brief Forms logical location for next block.
@details Method `get_coordinate` should be used to get new coordinates.
*/
coordinate_iterator & operator++();
};
/*!
@class clique clique.hpp pyclustering/cluster/clique.hpp
@brief Class implements CLIQUE grid based clustering algorithm.
@details CLIQUE automatically finds subspaces with high-density clusters. It produces identical results
irrespective of the order in which the input records are presented and it does not presume any canonical
distribution for input data @cite article::clique::1.
Here is an example where data in two-dimensional space is clustered using CLIQUE algorithm:
@code
using namespace pyclustering;
using namespace pyclustering::clst;
int main() {
// Read two-dimensional input data 'Target'.
dataset data = read_data("Target.txt");
// Prepare algorithm's parameters.
const std::size_t intervals = 10; // defines amount of cells in grid in each dimension
const std::size_t threshold = 0; // no outliers
// Create CLIQUE algorithm for processing.
clique clique_instance = clique(intervals, threshold);
// Run clustering process.
clique_data result;
clique_instance.process(data, result);
// Obtain results.
cluster_sequence & clusters = result.clusters();
clique_block_sequence & blocks = result.blocks();
noise & outliers = result.noise(); // in this case it is empty because threshold is 0.
// Display information about extracted clusters:
std::cout << "Amount of clusters: " << clusters.size() << std::endl;
std::cout << "Amount of outliers: " << outliers.size() << std::endl;
return 0;
}
@endcode
Here is one of the example how to implement read function to get input data:
@code
dataset read_data(const std::string & filename) {
dataset data;
std::ifstream file(filename);
std::string line;
while (std::getline(file, line)) {
std::stringstream stream(line);
point coordinates;
double value = 0.0;
while (stream >> value) {
coordinates.push_back(value);
}
data.push_back(coordinates);
}
file.close();
return data;
}
@endcode
In example above, 6 clusters are allocated including four small cluster where each such small cluster consists of
three points. There are visualized clustering results - grid that has been formed by CLIQUE algorithm with
density and clusters itself (see Python version of pyclustering library for visualization):
@image html clique_clustering_target.png "Fig. 1. CLIQUE clustering results (grid and clusters itself)."
Sometimes such small clusters should be considered as outliers taking into account fact that two clusters in the
central are relatively huge. To treat them as a noise threshold value should be increased:
@code
// Prepare algorithm's parameters.
const std::size_t intervals = 10; // defines amount of cells in grid in each dimension
const std::size_t threshold = 3; // block that contains 3 or less points is considered as a outlier as well as its points
@endcode
After execution following output is obtained:
@code
Amount of clusters: 2
Amount of outliers: 25
@endcode
Two clusters are allocated, but in this case some points in cluster-"circle" are also considered as outliers,
because CLIQUE operates with blocks, not with points:
@image html clique_clustering_with_noise.png "Fig. 2. Noise allocation by CLIQUE."
*/
class clique {
private:
struct data_info {
point m_min_corner;
point m_max_corner;
std::vector<double> m_sizes;
};
private:
using block_map = std::unordered_map<std::string, clique_block *>;
private:
std::size_t m_intervals = 0;
std::size_t m_density_threshold = 0;
const dataset * m_data_ptr = nullptr;
clique_data * m_result_ptr = nullptr;
block_map m_cells_map;
public:
/*!
@brief Create CLIQUE clustering algorithm.
@param[in] p_intervals: amount of intervals in each dimension that defines amount of CLIQUE blocks as \f[N_{ blocks } = intervals^{ dimensions }\f].
@param[in] p_threshold: minimum number of points that should be contained by CLIQUE block to consider its points as non-outliers.
*/
clique(const std::size_t p_intervals, const std::size_t p_threshold);
public:
/*!
@brief Performs cluster analysis of an input data.
@param[in] p_data: input data for cluster analysis.
@param[out] p_result: clustering result of an input data.
*/
void process(const dataset & p_data, clique_data & p_result);
private:
void create_grid();
void expand_cluster(clique_block & p_block);
void get_neighbors(const clique_block & p_block, std::list<clique_block *> & p_neighbors) const;
void get_spatial_location(const clique_block_location & p_location, const clique::data_info & p_info, clique_spatial_block & p_block) const;
void get_data_info(clique::data_info & p_info) const;
static std::string location_to_key(const clique_block_location & p_location);
};
}
}
|