File: kmeans.py

package info (click to toggle)
python-pyclustering 0.10.1.2-2
links: PTS, VCS
area: main
in suites: bookworm, forky, sid, trixie
size: 11,128 kB
sloc: cpp: 38,888; python: 24,311; sh: 384; makefile: 105
file content (615 lines) | stat: -rwxr-xr-x 23,824 bytes
parent folder | download | duplicates (2)
"""!

@brief The module contains K-Means algorithm and other related services.
@details Implementation based on paper @cite inproceedings::kmeans::1.

@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause

"""


import copy
import numpy

import matplotlib.pyplot as plt
import matplotlib.animation as animation

import pyclustering.core.kmeans_wrapper as wrapper

from pyclustering.core.wrapper import ccore_library
from pyclustering.core.metric_wrapper import metric_wrapper

from pyclustering.cluster.encoder import type_encoding
from pyclustering.cluster import cluster_visualizer

from pyclustering.utils.metric import distance_metric, type_metric


class kmeans_observer:
    """!
    @brief Observer of K-Means algorithm that is used to collect information about clustering process on each iteration of the algorithm.
    
    @see kmeans
    
    """
    
    def __init__(self):
        """!
        @brief Initializer of observer of K-Means algorithm.
        
        """
        self.__evolution_clusters = []
        self.__evolution_centers = []
        self.__initial_centers = []


    def __len__(self):
        """!
        @brief Returns amount of steps that were observer during clustering process in K-Means algorithm.
        
        """
        return len(self.__evolution_clusters)


    def notify(self, clusters, centers):
        """!
        @brief This method is called by K-Means algorithm to notify about changes.
        
        @param[in] clusters (array_like): Allocated clusters by K-Means algorithm.
        @param[in] centers (array_like): Allocated centers by K-Means algorithm.
        
        """
        self.__evolution_clusters.append(clusters)
        self.__evolution_centers.append(centers)


    def set_evolution_centers(self, evolution_centers):
        """!
        @brief Set evolution of changes of centers during clustering process.
        
        @param[in] evolution_centers (array_like): Evolution of changes of centers during clustering process.
        
        """
        self.__evolution_centers = evolution_centers


    def get_centers(self, iteration):
        """!
        @brief Get method to return centers at specific iteration of clustering process.
        
        @param[in] iteration (uint): Clustering process iteration at which centers are required.
        
        @return (array_like) Centers at specific iteration.
        
        """
        return self.__evolution_centers[iteration]


    def set_evolution_clusters(self, evolution_clusters):
        """!
        @brief Set evolution of changes of centers during clustering process.
        
        @param[in] evolution_clusters (array_like): Evolution of changes of clusters during clustering process.
        
        """
        self.__evolution_clusters = evolution_clusters


    def get_clusters(self, iteration):
        """!
        @brief Get method to return allocated clusters at specific iteration of clustering process.
        
        @param[in] iteration (uint): Clustering process iteration at which clusters are required.
        
        @return (array_like) Clusters at specific iteration.
        
        """
        return self.__evolution_clusters[iteration]



class kmeans_visualizer:
    """!
    @brief Visualizer of K-Means algorithm's results.
    @details K-Means visualizer provides visualization services that are specific for K-Means algorithm.
    
    """
    
    __default_2d_marker_size = 15
    __default_3d_marker_size = 70
    
    
    @staticmethod
    def show_clusters(sample, clusters, centers, initial_centers = None, **kwargs):
        """!
        @brief Display K-Means clustering results.
        
        @param[in] sample (list): Dataset that was used for clustering.
        @param[in] clusters (array_like): Clusters that were allocated by the algorithm.
        @param[in] centers (array_like): Centers that were allocated by the algorithm.
        @param[in] initial_centers (array_like): Initial centers that were used by the algorithm, if 'None' then initial centers are not displyed.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'figure', 'display', 'offset').
        
        <b>Keyword Args:</b><br>
            - figure (figure): If 'None' then new is figure is created, otherwise specified figure is used for visualization.
            - display (bool): If 'True' then figure will be shown by the method, otherwise it should be shown manually using matplotlib function 'plt.show()'.
            - offset (uint): Specify axes index on the figure where results should be drawn (only if argument 'figure' is specified).
        
        @return (figure) Figure where clusters were drawn.
        
        """

        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        
        offset = kwargs.get('offset', 0)
        figure = kwargs.get('figure', None)
        display = kwargs.get('display', True)

        if figure is None:
            figure = visualizer.show(display=False)
        else:
            visualizer.show(figure=figure, display=False)
        
        kmeans_visualizer.__draw_centers(figure, offset, visualizer, centers, initial_centers)
        kmeans_visualizer.__draw_rays(figure, offset, visualizer, sample, clusters, centers)
        
        if display is True:
            plt.show()

        return figure


    @staticmethod
    def __draw_rays(figure, offset, visualizer, sample, clusters, centers):
        ax = figure.get_axes()[offset]
        
        for index_cluster in range(len(clusters)):
            color = visualizer.get_cluster_color(index_cluster, 0)
            kmeans_visualizer.__draw_cluster_rays(ax, color, sample, clusters[index_cluster], centers[index_cluster])


    @staticmethod
    def __draw_cluster_rays(ax, color, sample, cluster, center):
        dimension = len(sample[0])
        
        for index_point in cluster:
            point = sample[index_point]
            if dimension == 1:
                ax.plot([point[0], center[0]], [0.0, 0.0], '-', color=color, linewidth=0.5)
            elif dimension == 2:
                ax.plot([point[0], center[0]], [point[1], center[1]], '-', color=color, linewidth=0.5)
            elif dimension == 3:
                ax.plot([point[0], center[0]], [point[1], center[1]], [point[2], center[2]], '-', color=color, linewidth=0.5)


    @staticmethod
    def __draw_center(ax, center, color, marker, alpha):
        dimension = len(center)
        
        if dimension == 1:
            ax.plot(center[0], 0.0, color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)
        elif dimension == 2:
            ax.plot(center[0], center[1], color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size)
        elif dimension == 3:
            ax.scatter(center[0], center[1], center[2], c=color, alpha=alpha, marker=marker, s=kmeans_visualizer.__default_3d_marker_size)


    @staticmethod
    def __draw_centers(figure, offset, visualizer, centers, initial_centers):
        ax = figure.get_axes()[offset]
        
        for index_center in range(len(centers)):
            color = visualizer.get_cluster_color(index_center, 0)
            kmeans_visualizer.__draw_center(ax, centers[index_center], color, '*', 1.0)
            
            if initial_centers is not None:
                kmeans_visualizer.__draw_center(ax, initial_centers[index_center], color, '*', 0.4)


    @staticmethod
    def animate_cluster_allocation(data, observer, animation_velocity=500, movie_fps=1, save_movie=None):
        """!
        @brief Animates clustering process that is performed by K-Means algorithm.

        @param[in] data (list): Dataset that is used for clustering.
        @param[in] observer (kmeans_observer): EM observer that was used for collection information about clustering process.
        @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
        @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
        @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter.

        """
        figure = plt.figure()

        def init_frame():
            return frame_generation(0)

        def frame_generation(index_iteration):
            figure.clf()

            figure.suptitle("K-Means algorithm (iteration: " + str(index_iteration) + ")", fontsize=18, fontweight='bold')

            clusters = observer.get_clusters(index_iteration)
            centers = observer.get_centers(index_iteration)
            kmeans_visualizer.show_clusters(data, clusters, centers, None, figure=figure, display=False)

            figure.subplots_adjust(top=0.85)

            return [figure.gca()]

        iterations = len(observer)
        cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval=animation_velocity,
                                                    init_func=init_frame, repeat_delay=5000)

        if save_movie is not None:
            cluster_animation.save(save_movie, writer='ffmpeg', fps=movie_fps, bitrate=3000)
        else:
            plt.show()



class kmeans:
    """!
    @brief Class implements K-Means clustering algorithm.
    @details K-Means clustering aims to partition n observations into k clusters in which each observation belongs to
              the cluster with the nearest mean, serving as a prototype of the cluster. This results in a partitioning
              of the data space into Voronoi cells.

    K-Means clustering results depend on initial centers. Algorithm K-Means++ can used for initialization of
    initial centers - see module 'pyclustering.cluster.center_initializer'.

    CCORE implementation (C/C++ part of the library) of the algorithm performs parallel processing to ensure maximum
    performance.

    Implementation based on the paper @cite inproceedings::kmeans::1.

    @image html kmeans_example_clustering.png "Fig. 1. K-Means clustering results. At the left - 'Simple03.data' sample, at the right - 'Lsun.data' sample."

    Example #1 - Clustering using K-Means++ for center initialization:
    @code
        from pyclustering.cluster.kmeans import kmeans, kmeans_visualizer
        from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
        from pyclustering.samples.definitions import FCPS_SAMPLES
        from pyclustering.utils import read_sample

        # Load list of points for cluster analysis.
        sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)

        # Prepare initial centers using K-Means++ method.
        initial_centers = kmeans_plusplus_initializer(sample, 2).initialize()

        # Create instance of K-Means algorithm with prepared centers.
        kmeans_instance = kmeans(sample, initial_centers)

        # Run cluster analysis and obtain results.
        kmeans_instance.process()
        clusters = kmeans_instance.get_clusters()
        final_centers = kmeans_instance.get_centers()

        # Visualize obtained results
        kmeans_visualizer.show_clusters(sample, clusters, final_centers)
    @endcode

    Example #2 - Clustering using specific distance metric, for example, Manhattan distance:
    @code
        # prepare input data and initial centers for cluster analysis using K-Means

        # create metric that will be used for clustering
        manhattan_metric = distance_metric(type_metric.MANHATTAN)

        # create instance of K-Means using specific distance metric:
        kmeans_instance = kmeans(sample, initial_centers, metric=manhattan_metric)

        # run cluster analysis and obtain results
        kmeans_instance.process()
        clusters = kmeans_instance.get_clusters()
    @endcode

    @see center_initializer
    
    """
    
    def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
        """!
        @brief Constructor of clustering algorithm K-Means.
        @details Center initializer can be used for creating initial centers, for example, K-Means++ method.
        
        @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
        @param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
        @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing.
        @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'observer', 'metric', 'itermax').
        
        <b>Keyword Args:</b><br>
            - observer (kmeans_observer): Observer of the algorithm to collect information about clustering process on each iteration.
            - metric (distance_metric): Metric that is used for distance calculation between two points (by default euclidean square distance).
            - itermax (uint): Maximum number of iterations that is used for clustering process (by default: 200).
        
        @see center_initializer
        
        """
        self.__pointer_data = numpy.array(data)
        self.__clusters = []
        self.__centers = numpy.array(initial_centers)
        self.__tolerance = tolerance
        self.__total_wce = 0.0

        self.__observer = kwargs.get('observer', None)
        self.__metric = copy.copy(kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE)))
        self.__itermax = kwargs.get('itermax', 100)

        if self.__metric.get_type() != type_metric.USER_DEFINED:
            self.__metric.enable_numpy_usage()
        else:
            self.__metric.disable_numpy_usage()

        self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
        if self.__ccore is True:
            self.__ccore = ccore_library.workable()

        self.__verify_arguments()


    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of K-Means algorithm.

        @return (kmeans) Returns itself (K-Means instance).
        
        @see get_clusters()
        @see get_centers()
        
        """

        if len(self.__pointer_data[0]) != len(self.__centers[0]):
            raise ValueError("Dimension of the input data and dimension of the initial cluster centers must be equal.")

        if self.__ccore is True:
            self.__process_by_ccore()
        else:
            self.__process_by_python()

        return self


    def __process_by_ccore(self):
        """!
        @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).

        """
        ccore_metric = metric_wrapper.create_instance(self.__metric)

        results = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance, self.__itermax,
                                 (self.__observer is not None), ccore_metric.get_pointer())

        self.__clusters = results[0]
        self.__centers = results[1]

        if self.__observer is not None:
            self.__observer.set_evolution_clusters(results[2])
            self.__observer.set_evolution_centers(results[3])

        self.__total_wce = results[4][0]


    def __process_by_python(self):
        """!
        @brief Performs cluster analysis using python code.

        """

        maximum_change = float('inf')
        iteration = 0

        if self.__observer is not None:
            initial_clusters = self.__update_clusters()
            self.__observer.notify(initial_clusters, self.__centers.tolist())

        while maximum_change > self.__tolerance and iteration < self.__itermax:
            self.__clusters = self.__update_clusters()
            updated_centers = self.__update_centers()  # changes should be calculated before assignment

            if self.__observer is not None:
                self.__observer.notify(self.__clusters, updated_centers.tolist())

            maximum_change = self.__calculate_changes(updated_centers)

            self.__centers = updated_centers    # assign center after change calculation
            iteration += 1

        self.__calculate_total_wce()


    def predict(self, points):
        """!
        @brief Calculates the closest cluster to each point.

        @param[in] points (array_like): Points for which closest clusters are calculated.

        @return (list) List of closest clusters for each point. Each cluster is denoted by index. Return empty
                 collection if 'process()' method was not called.

        """

        nppoints = numpy.array(points)
        if len(self.__clusters) == 0:
            return []

        differences = numpy.zeros((len(nppoints), len(self.__centers)))
        for index_point in range(len(nppoints)):
            if self.__metric.get_type() != type_metric.USER_DEFINED:
                differences[index_point] = self.__metric(nppoints[index_point], self.__centers)
            else:
                differences[index_point] = [self.__metric(nppoints[index_point], center) for center in self.__centers]

        return numpy.argmin(differences, axis=1)


    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @see process()
        @see get_centers()
        
        """
        
        return self.__clusters


    def get_centers(self):
        """!
        @brief Returns list of centers of allocated clusters.
        
        @see process()
        @see get_clusters()
        
        """

        if isinstance(self.__centers, list):
            return self.__centers

        return self.__centers.tolist()


    def get_total_wce(self):
        """!
        @brief Returns sum of metric errors that depends on metric that was used for clustering (by default SSE - Sum of Squared Errors).
        @details Sum of metric errors is calculated using distance between point and its center:
                 \f[error=\sum_{i=0}^{N}distance(x_{i}-center(x_{i}))\f]

        @see process()
        @see get_clusters()

        """

        return self.__total_wce


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION


    def __update_clusters(self):
        """!
        @brief Calculate distance (in line with specified metric) to each point from the each cluster. Nearest points
                are captured by according clusters and as a result clusters are updated.
        
        @return (list) Updated clusters as list of clusters. Each cluster contains indexes of objects from data.
        
        """
        
        clusters = [[] for _ in range(len(self.__centers))]
        
        dataset_differences = self.__calculate_dataset_difference(len(clusters))
        
        optimum_indexes = numpy.argmin(dataset_differences, axis=0)
        for index_point in range(len(optimum_indexes)):
            index_cluster = optimum_indexes[index_point]
            clusters[index_cluster].append(index_point)
        
        clusters = [cluster for cluster in clusters if len(cluster) > 0]

        return clusters


    def __update_centers(self):
        """!
        @brief Calculate centers of clusters in line with contained objects.
        
        @return (numpy.array) Updated centers.
        
        """
        
        dimension = self.__pointer_data.shape[1]
        centers = numpy.zeros((len(self.__clusters), dimension))
        
        for index in range(len(self.__clusters)):
            cluster_points = self.__pointer_data[self.__clusters[index], :]
            centers[index] = cluster_points.mean(axis=0)

        return numpy.array(centers)


    def __calculate_total_wce(self):
        """!
        @brief Calculate total within cluster errors that is depend on metric that was chosen for K-Means algorithm.

        """

        dataset_differences = self.__calculate_dataset_difference(len(self.__clusters))

        self.__total_wce = 0.0
        for index_cluster in range(len(self.__clusters)):
            for index_point in self.__clusters[index_cluster]:
                self.__total_wce += dataset_differences[index_cluster][index_point]


    def __calculate_dataset_difference(self, amount_clusters):
        """!
        @brief Calculate distance from each point to each cluster center.

        """
        dataset_differences = numpy.zeros((amount_clusters, len(self.__pointer_data)))
        for index_center in range(amount_clusters):
            if self.__metric.get_type() != type_metric.USER_DEFINED:
                dataset_differences[index_center] = self.__metric(self.__pointer_data, self.__centers[index_center])
            else:
                dataset_differences[index_center] = [self.__metric(point, self.__centers[index_center])
                                                     for point in self.__pointer_data]

        return dataset_differences


    def __calculate_changes(self, updated_centers):
        """!
        @brief Calculates changes estimation between previous and current iteration using centers for that purpose.

        @param[in] updated_centers (array_like): New cluster centers.

        @return (float) Maximum changes between centers.

        """
        if len(self.__centers) != len(updated_centers):
            maximum_change = float('inf')

        else:
            if self.__metric.get_type() != type_metric.USER_DEFINED:
                changes = self.__metric(self.__centers, updated_centers)
            else:
                changes = [self.__metric(center, updated_center) for center, updated_center in zip(self.__centers, updated_centers)]

            maximum_change = numpy.max(changes)

        return maximum_change


    def __verify_arguments(self):
        """!
        @brief Verify input parameters for the algorithm and throw exception in case of incorrectness.

        """
        if len(self.__pointer_data) == 0:
            raise ValueError("Input data is empty (size: '%d')." % len(self.__pointer_data))

        if len(self.__centers) == 0:
            raise ValueError("Initial centers are empty (size: '%d')." % len(self.__pointer_data))

        if self.__tolerance < 0:
            raise ValueError("Tolerance (current value: '%d') should be greater or equal to 0." %
                             self.__tolerance)

        if self.__itermax < 0:
            raise ValueError("Maximum iterations (current value: '%d') should be greater or equal to 0." %
                             self.__tolerance)