File: kmeans.py

package info (click to toggle)
python-cluster 1.4.1.post3-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 412 kB
  • sloc: python: 812; makefile: 146; sh: 4
file content (168 lines) | stat: -rw-r--r-- 6,614 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#
# This is part of "python-cluster". A library to group similar items together.
# Copyright (C) 2006    Michel Albert
#
# This library is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the
# Free Software Foundation; either version 2.1 of the License, or (at your
# option) any later version.
# This library is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
# for more details.
# You should have received a copy of the GNU Lesser General Public License
# along with this library; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#


from cluster.util import ClusteringError, centroid, minkowski_distance


class KMeansClustering(object):
    """
    Implementation of the kmeans clustering method as explained in a tutorial_
    by *matteucc*.

    .. _tutorial: http://www.elet.polimi.it/upload/matteucc/Clustering/tutorial_html/kmeans.html

    Example:

      >>> from cluster import KMeansClustering
      >>> cl = KMeansClustering([(1,1), (2,1), (5,3), ...])
      >>> clusters = cl.getclusters(2)

    :param data: A list of tuples or integers.
    :param distance: A function determining the distance between two items.
        Default (if ``None`` is passed): It assumes the tuples contain numeric
        values and appiles a generalised form of the euclidian-distance
        algorithm on them.
    :param equality: A function to test equality of items. By default the
        standard python equality operator (``==``) is applied.
    :raises ValueError: if the list contains heterogeneous items or if the
        distance between items cannot be determined.
    """

    def __init__(self, data, distance=None, equality=None):
        self.__clusters = []
        self.__data = data
        self.distance = distance
        self.__initial_length = len(data)
        self.equality = equality

        # test if each item is of same dimensions
        if len(data) > 1 and isinstance(data[0], tuple):
            control_length = len(data[0])
            for item in data[1:]:
                if len(item) != control_length:
                    raise ValueError("Each item in the data list must have "
                                     "the same amount of dimensions. Item "
                                     "%r was out of line!" % (item,))
        # now check if we need and have a distance function
        if (len(data) > 1 and not isinstance(data[0], tuple) and
                distance is None):
            raise ValueError("You supplied non-standard items but no "
                             "distance function! We cannot continue!")
        # we now know that we have tuples, and assume therefore that it's
        # items are numeric
        elif distance is None:
            self.distance = minkowski_distance

    def getclusters(self, count):
        """
        Generates *count* clusters.

        :param count: The amount of clusters that should be generated.  count
            must be greater than ``1``.
        :raises ClusteringError: if *count* is out of bounds.
        """

        # only proceed if we got sensible input
        if count <= 1:
            raise ClusteringError("When clustering, you need to ask for at "
                                  "least two clusters! "
                                  "You asked for %d" % count)

        # return the data straight away if there is nothing to cluster
        if (self.__data == [] or len(self.__data) == 1 or
                count == self.__initial_length):
            return self.__data

        # It makes no sense to ask for more clusters than data-items available
        if count > self.__initial_length:
            raise ClusteringError(
                "Unable to generate more clusters than "
                "items available. You supplied %d items, and asked for "
                "%d clusters." % (self.__initial_length, count))

        self.initialise_clusters(self.__data, count)

        items_moved = True  # tells us if any item moved between the clusters,
                            # as we initialised the clusters, we assume that
                            # is the case

        while items_moved is True:
            items_moved = False
            for cluster in self.__clusters:
                for item in cluster:
                    res = self.assign_item(item, cluster)
                    if items_moved is False:
                        items_moved = res
        return self.__clusters

    def assign_item(self, item, origin):
        """
        Assigns an item from a given cluster to the closest located cluster.

        :param item: the item to be moved.
        :param origin: the originating cluster.
        """
        closest_cluster = origin
        for cluster in self.__clusters:
            if self.distance(item, centroid(cluster)) < self.distance(
                    item, centroid(closest_cluster)):
                closest_cluster = cluster

        if id(closest_cluster) != id(origin):
            self.move_item(item, origin, closest_cluster)
            return True
        else:
            return False

    def move_item(self, item, origin, destination):
        """
        Moves an item from one cluster to anoter cluster.

        :param item: the item to be moved.
        :param origin: the originating cluster.
        :param destination: the target cluster.
        """
        if self.equality:
            item_index = 0
            for i, element in enumerate(origin):
                if self.equality(element, item):
                    item_index = i
                    break
        else:
            item_index = origin.index(item)

        destination.append(origin.pop(item_index))

    def initialise_clusters(self, input_, clustercount):
        """
        Initialises the clusters by distributing the items from the data.
        evenly across n clusters

        :param input_: the data set (a list of tuples).
        :param clustercount: the amount of clusters (n).
        """
        # initialise the clusters with empty lists
        self.__clusters = []
        for _ in range(clustercount):
            self.__clusters.append([])

        # distribute the items into the clusters
        count = 0
        for item in input_:
            self.__clusters[count % clustercount].append(item)
            count += 1