1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
|
"""!
@brief Cluster generator.
@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause
"""
import collections.abc
import random
class data_generator:
    """!
    @brief Data generator provides services to generate data with clusters with normal distribution.
    @details Every coordinate of every point is drawn independently by `random.gauss`, using the corresponding
              coordinate of the cluster center as the mean and half of the cluster width as the standard deviation.

    """

    def __init__(self, amount_clusters, dimension, cluster_sizes, cluster_centers=None, cluster_width=1.0):
        """!
        @brief Constructs data generator for generating data-sets.

        @param[in] amount_clusters (uint): Amount of clusters that should be generated.
        @param[in] dimension (uint): Dimension of each generated point.
        @param[in] cluster_sizes (uint|array_like): Size of each cluster. In case of 'array_like' input clusters with
                    corresponding sizes are generated.
        @param[in] cluster_centers (array_like): Optional parameter that defines cluster centers (means). If it is
                    not specified then centers are generated randomly during construction.
        @param[in] cluster_width (float|array_like): Optional parameter that defines cluster width. Half of the width
                    is used as the standard deviation. In case of 'array_like' input each cluster has own width.

        """
        self.__amount_clusters = amount_clusters
        self.__dimension = dimension

        # Scalars are expanded so that sizes and widths can always be indexed by cluster.
        self.__cluster_sizes = self.__per_cluster(cluster_sizes, amount_clusters)
        self.__cluster_width = self.__per_cluster(cluster_width, amount_clusters)

        self.__cluster_centers = cluster_centers
        if self.__cluster_centers is None:
            self.__cluster_centers = self.__generate_cluster_centers(self.__cluster_width)

    def generate(self):
        """!
        @brief Generates data in line with generator parameters.

        @return (list) Generated points (each point is a list of coordinates) ordered by cluster index: all points
                 of the first cluster, then all points of the second one, and so on.

        """
        return [self.__generate_point(index_cluster)
                for index_cluster in range(self.__amount_clusters)
                for _ in range(self.__cluster_sizes[index_cluster])]

    @staticmethod
    def __per_cluster(value, amount_clusters):
        """!
        @brief Expands a scalar parameter to a list with one entry per cluster.

        @param[in] value (any): Scalar that is shared by all clusters or iterable container with per-cluster values.
        @param[in] amount_clusters (uint): Amount of clusters.

        @return (array_like) The same object if it is iterable, otherwise list where `value` is repeated
                 `amount_clusters` times.

        """
        if isinstance(value, collections.abc.Iterable):
            return value

        return [value] * amount_clusters

    def __generate_point(self, index_cluster):
        """!
        @brief Generates point in line with parameters of specified cluster.

        @param[in] index_cluster (uint): Index of cluster whose parameters are used for point generation.

        @return (list) New generated point in line with normal distribution and cluster parameters.

        """
        center = self.__cluster_centers[index_cluster]
        deviation = self.__cluster_width[index_cluster] / 2.0   # width is treated as two standard deviations

        return [random.gauss(center[index_dimension], deviation) for index_dimension in range(self.__dimension)]

    def __generate_cluster_centers(self, width):
        """!
        @brief Generates centers (means in statistical term) for clusters.
        @details Mean of every coordinate of the i-th center is `i * 4 * max(width)`, so the expected centers of
                  neighbor clusters are separated by four times the largest cluster width along every dimension.

        @param[in] width (list): Width of generated clusters.

        @return (list) Generated centers in line with normal distribution. Empty list if there are no clusters.

        """
        cluster_indexes = range(self.__amount_clusters)
        if not cluster_indexes:
            # Nothing to place; this also avoids `max()` failure on the empty width list that is built when a
            # scalar width is expanded for zero clusters.
            return []

        default_offset = max(width) * 4.0

        return [[random.gauss(index * default_offset, width[index] / 2.0) for _ in range(self.__dimension)]
                for index in cluster_indexes]
|