"""Algorithms for spectral clustering"""
# Author: Gael Varoquaux gael.varoquaux@normalesup.org
# License: BSD
import warnings
import numpy as np
from ..base import BaseEstimator
from ..utils import check_random_state
from ..utils.graph import graph_laplacian
from .k_means_ import k_means
def spectral_embedding(adjacency, n_components=8, mode=None,
                       random_state=None):
    """Project the sample on the first eigen vectors of the graph Laplacian

    The adjacency matrix is used to compute a normalized graph Laplacian
    whose spectrum (especially the eigen vectors associated to the
    smallest eigen values) has an interpretation in terms of minimal
    number of cuts necessary to split the graph into comparably sized
    components.

    This embedding can also 'work' even if the ``adjacency`` variable is
    not strictly the adjacency matrix of a graph but more generally
    an affinity or similarity matrix between samples (for instance the
    heat kernel of a euclidean distance matrix or a k-NN matrix).

    However care must taken to always make the affinity matrix symmetric
    so that the eigen vector decomposition works as expected.

    Parameters
    ----------
    adjacency: array-like or sparse matrix, shape: (n_samples, n_samples)
        The adjacency matrix of the graph to embed.

    n_components: integer, optional
        The dimension of the projection subspace.

    mode: {None, 'arpack' or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities.

    random_state: int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization of the
        lobpcg eigen vectors decomposition when mode == 'amg'. By default
        arpack is used.

    Returns
    -------
    embedding: array, shape: (n_samples, n_components)
        The reduced samples.

    Raises
    ------
    ValueError
        If ``mode`` is not one of {None, 'arpack', 'amg'}, if 'amg' is
        requested but pyamg is not installed, or if lobpcg returns a
        degenerate embedding.

    Notes
    -----
    The graph should contain only one connected component, elsewhere the
    results make little sense.
    """
    from scipy import sparse
    from scipy.sparse.linalg import lobpcg
    from ..utils.arpack import eigsh
    try:
        from pyamg import smoothed_aggregation_solver
        amg_loaded = True
    except ImportError:
        amg_loaded = False

    random_state = check_random_state(random_state)
    n_nodes = adjacency.shape[0]
    # XXX: Should we check that the matrices given is symmetric
    if not amg_loaded:
        warnings.warn('pyamg not available, using scipy.sparse')
    if mode is None:
        mode = 'arpack'
    elif mode == 'amg' and not amg_loaded:
        # BUGFIX: previously this fell through to a confusing NameError on
        # smoothed_aggregation_solver; fail early with an explicit message.
        raise ValueError("The 'amg' mode requires the pyamg package, "
                         "which could not be imported")
    laplacian, dd = graph_laplacian(adjacency,
                                    normed=True, return_diag=True)
    if (mode == 'arpack'
            or not sparse.isspmatrix(laplacian)
            or n_nodes < 5 * n_components):
        # lobpcg used with mode='amg' has bugs for low number of nodes

        # We need to put the diagonal at zero
        if not sparse.isspmatrix(laplacian):
            # BUGFIX: index the flat view -- ``laplacian[::n_nodes + 1] = 0``
            # on a 2D array assigns whole *rows* (with step n_nodes + 1),
            # not the diagonal the comment above describes.
            laplacian.flat[::n_nodes + 1] = 0
        else:
            laplacian = laplacian.tocoo()
            diag_idx = (laplacian.row == laplacian.col)
            laplacian.data[diag_idx] = 0
            # If the matrix has a small number of diagonals (as in the
            # case of structured matrices coming from images), the
            # dia format might be best suited for matvec products:
            n_diags = np.unique(laplacian.row - laplacian.col).size
            if n_diags <= 7:
                # 3 or less outer diagonals on each side
                laplacian = laplacian.todia()
            else:
                # csr has the fastest matvec and is thus best suited to
                # arpack
                laplacian = laplacian.tocsr()

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        # for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1. ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0. This leads to slow convergence. So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0. This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        lambdas, diffusion_map = eigsh(-laplacian, k=n_components,
                                       sigma=1.0, which='LM')
        embedding = diffusion_map.T[::-1] * dd
    elif mode == 'amg':
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        # np.float was removed from recent NumPy: use the explicit
        # float64 dtype (lobpcg needs native floats).
        laplacian = laplacian.astype(np.float64)
        ml = smoothed_aggregation_solver(laplacian.tocsr())
        X = random_state.rand(laplacian.shape[0], n_components)
        # Seed the first vector with the (inverse) diagonal scaling so the
        # constant eigen vector of the normalized Laplacian is represented.
        X[:, 0] = 1. / dd.ravel()
        M = ml.aspreconditioner()
        lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-12,
                                        largest=False)
        embedding = diffusion_map.T * dd
        if embedding.shape[0] == 1:
            # BUGFIX: was a bare ``raise ValueError`` with no message.
            raise ValueError("lobpcg returned a degenerate (rank 1) "
                             "embedding")
    else:
        raise ValueError("Unknown value for mode: '%s'."
                         "Should be 'amg' or 'arpack'" % mode)
    return embedding
def spectral_clustering(affinity, k=8, n_components=None, mode=None,
                        random_state=None, n_init=10):
    """Apply k-means to a projection to the normalized laplacian

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance when clusters are
    nested circles on the 2D plan.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts.

    Parameters
    ----------
    affinity: array-like or sparse matrix, shape: (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symetric**.

        Possible examples:
            - adjacency matrix of a graph,
            - heat kernel of the pairwise distance matrix of the samples,
            - symmetic k-nearest neighbours connectivity matrix of the samples.

    k: integer, optional
        Number of clusters to extract.

    n_components: integer, optional, default is k
        Number of eigen vectors to use for the spectral embedding.

    mode: {None, 'arpack' or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities.

    random_state: int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization
        of the lobpcg eigen vectors decomposition when mode == 'amg'
        and by the K-Means initialization.

    n_init: int, optional, default: 10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    Returns
    -------
    labels: array of integers, shape: n_samples
        The labels of the clusters.

    References
    ----------
    - Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324

    - A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323

    Notes
    -----
    The graph should contain only one connect component, elsewhere
    the results make little sense.

    This algorithm solves the normalized cut for k=2: it is a
    normalized spectral clustering.
    """
    random_state = check_random_state(random_state)
    if n_components is None:
        n_components = k
    embedding = spectral_embedding(affinity, n_components=n_components,
                                   mode=mode, random_state=random_state)
    # Drop the first eigen vector and cluster the samples (one row per
    # sample after transposition) with k-means.
    features = embedding[1:].T
    _, labels, _ = k_means(features, k, random_state=random_state,
                           n_init=n_init)
    return labels
class SpectralClustering(BaseEstimator):
    """Apply k-means to a projection to the normalized laplacian

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance when clusters are
    nested circles on the 2D plan.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts.

    Parameters
    ----------
    k: integer, optional
        The dimension of the projection subspace.

    mode: {None, 'arpack' or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities.

    random_state: int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization
        of the lobpcg eigen vectors decomposition when mode == 'amg'
        and by the K-Means initialization.

    n_init: int, optional, default: 10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    Attributes
    ----------
    `labels_` :
        Labels of each point

    References
    ----------
    - Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324

    - A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323
    """

    def __init__(self, k=8, mode=None, random_state=None, n_init=10):
        self.k = k
        self.mode = mode
        self.random_state = random_state
        self.n_init = n_init

    def fit(self, X):
        """Compute the spectral clustering from the affinity matrix

        Parameters
        ----------
        X: array-like or sparse matrix, shape: (n_samples, n_samples)
            An affinity matrix describing the pairwise similarity of the
            data. If can also be an adjacency matrix of the graph to embed.
            X must be symmetric and its entries must be positive or
            zero. Zero means that elements have nothing in common,
            whereas high values mean that elements are strongly similar.

        Returns
        -------
        self

        Notes
        -----
        If you have an affinity matrix, such as a distance matrix,
        for which 0 means identical elements, and high values means
        very dissimilar elements, it can be transformed in a
        similarity matrix that is well suited for the algorithm by
        applying the gaussian (heat) kernel::

            np.exp(- X ** 2 / (2. * delta ** 2))

        Another alternative is to take a symmetric version of the k
        nearest neighbors connectivity matrix of the points.

        If the pyamg package is installed, it is used: this greatly
        speeds up computation.
        """
        # BUGFIX: do not overwrite self.random_state -- mutating a
        # constructor parameter in fit breaks the estimator contract
        # (the original seed would be replaced by a RandomState object).
        random_state = check_random_state(self.random_state)
        self.labels_ = spectral_clustering(X, k=self.k, mode=self.mode,
                                           random_state=random_state,
                                           n_init=self.n_init)
        return self