File: generator.py

package info (click to toggle)
python-pyclustering 0.10.1.2-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 11,128 kB
  • sloc: cpp: 38,888; python: 24,311; sh: 384; makefile: 105
file content (96 lines) | stat: -rwxr-xr-x 3,475 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""!

@brief Cluster generator.

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause

"""


import collections.abc
import random


class data_generator:
    """!
    @brief Data generator provides services to generate data with clusters with normal distribution.
    @details Every coordinate of every point is drawn independently using `random.gauss`, where the mean is the
              corresponding coordinate of the cluster center and the standard deviation is half of the cluster width.

    """

    def __init__(self, amount_clusters, dimension, cluster_sizes, cluster_centers=None, cluster_width=1.0):
        """!
        @brief Constructs data generator for generating data-sets.

        @param[in] amount_clusters (uint): Amount of clusters that should be generated.
        @param[in] dimension (uint): Dimension of each generated point.
        @param[in] cluster_sizes (uint|array_like): Size of each cluster. In case of 'array_like' input clusters with
                    corresponding sizes are generated. A scalar value is applied to every cluster.
        @param[in] cluster_centers (array_like): Optional parameter that defines cluster centers (means). If it is not
                    specified then centers are generated randomly, see `__generate_cluster_centers`.
        @param[in] cluster_width (float|array_like): Optional parameter that defines cluster width. The standard
                    deviation that is used for generation is half of the width. In case of 'array_like' input each
                    cluster has own width. A scalar value is applied to every cluster.

        """

        self.__amount_clusters = amount_clusters
        self.__dimension = dimension

        self.__cluster_sizes = self.__per_cluster_values(cluster_sizes, amount_clusters)
        self.__cluster_width = self.__per_cluster_values(cluster_width, amount_clusters)

        self.__cluster_centers = cluster_centers
        if self.__cluster_centers is None:
            self.__cluster_centers = self.__generate_cluster_centers(self.__cluster_width)


    @staticmethod
    def __per_cluster_values(value, amount_clusters):
        """!
        @brief Converts parameter that may be scalar or iterable to container that can be indexed by cluster index.

        @param[in] value (any): Scalar value that is shared by all clusters or iterable with value for each cluster.
        @param[in] amount_clusters (uint): Amount of clusters, used to replicate scalar value.

        @return (array_like) Container with values that supports access by cluster index.

        """
        if not isinstance(value, collections.abc.Iterable):
            return [value] * amount_clusters

        # One-shot iterables (generators, iterators) cannot be accessed by index - materialize them.
        # Containers that already support indexing (list, tuple, arrays) are kept untouched.
        if not hasattr(value, "__getitem__"):
            return list(value)

        return value


    def generate(self):
        """!
        @brief Generates data in line with generator parameters.

        @return (list) Generated points, where each point is represented by list of coordinates. Points are ordered
                 by cluster: points of the first cluster go first, then points of the second cluster, and so on.

        """
        data_points = []

        for index_cluster in range(self.__amount_clusters):
            cluster_size = self.__cluster_sizes[index_cluster]
            data_points.extend(self.__generate_point(index_cluster) for _ in range(cluster_size))

        return data_points


    def __generate_point(self, index_cluster):
        """!
        @brief Generates point in line with parameters of specified cluster.

        @param[in] index_cluster (uint): Index of cluster whose parameters are used for point generation.

        @return (list) New generated point in line with normal distribution and cluster parameters.

        """
        center = self.__cluster_centers[index_cluster]

        # Standard deviation is half of the cluster width.
        deviation = self.__cluster_width[index_cluster] / 2.0

        return [random.gauss(center[index_dimension], deviation) for index_dimension in range(self.__dimension)]


    def __generate_cluster_centers(self, width):
        """!
        @brief Generates centers (means in statistical term) for clusters.
        @details Mean of each coordinate of center with index `i` is `i * offset`, where `offset` is four times bigger
                  than the largest cluster width. Standard deviation is half of the width of the cluster.

        @param[in] width (list): Width of generated clusters.

        @return (list) Generated centers in line with normal distribution. Empty list if there are no clusters.

        """
        # 'default' protects from ValueError when there are no clusters (width is empty).
        default_offset = max(width, default=0.0) * 4.0

        return [[random.gauss(index_cluster * default_offset, width[index_cluster] / 2.0)
                 for _ in range(self.__dimension)]
                for index_cluster in range(self.__amount_clusters)]