1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
|
""" Unsupervised evaluation metrics. """
# Authors: Robert Layton <robertlayton@gmail.com>
#
# License: BSD Style.
import numpy as np
from ...utils import check_random_state
from ..pairwise import pairwise_distances
def silhouette_score(X, labels, metric='euclidean',
sample_size=None, random_state=None, **kwds):
"""Compute the mean Silhouette Coefficient of all samples.
The Silhouette Coefficient is calculated using the mean intra-cluster
distance (a) and the mean nearest-cluster distance (b) for each sample.
The Silhouette Coefficient for a sample is ``(b - a) / max(a, b)``.
To clarrify, b is the distance between a sample and the nearest cluster
that b is not a part of.
This function returns the mean Silhoeutte Coefficient over all samples.
To obtain the values for each sample, use silhouette_samples
The best value is 1 and the worst value is -1. Values near 0 indicate
overlapping clusters. Negative values generally indicate that a sample has
been assigned to the wrong cluster, as a different cluster is more similar.
Parameters
----------
X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
[n_samples_a, n_features] otherwise
Array of pairwise distances between samples, or a feature array.
labels : array, shape = [n_samples]
label values for each sample
metric : string, or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options
allowed by metrics.pairwise.pairwise_distances. If X is the distance
array itself, use "precomputed" as the metric.
sample_size : int or None
The size of the sample to use when computing the Silhouette
Coefficient. If sample_size is None, no sampling is used.
random_state : integer or numpy.RandomState, optional
The generator used to initialize the centers. If an integer is
given, it fixes the seed. Defaults to the global numpy random
number generator.
`**kwds` : optional keyword parameters
Any further parameters are passed directly to the distance function.
If using a scipy.spatial.distance metric, the parameters are still
metric dependent. See the scipy docs for usage examples.
Returns
-------
silhouette : float
Mean Silhouette Coefficient for all samples.
References
----------
Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
Interpretation and Validation of Cluster Analysis". Computational
and Applied Mathematics 20: 53-65. doi:10.1016/0377-0427(87)90125-7.
http://en.wikipedia.org/wiki/Silhouette_(clustering)
"""
if sample_size is not None:
random_state = check_random_state(random_state)
indices = random_state.permutation(X.shape[0])[:sample_size]
if metric == "precomputed":
X, labels = X[indices].T[indices].T, labels[indices]
else:
X, labels = X[indices], labels[indices]
return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
def silhouette_samples(X, labels, metric='euclidean', **kwds):
"""Compute the Silhouette Coefficient for each sample.
The Silhoeutte Coefficient is a measure of how well samples are clustered
with samples that are similar to themselves. Clustering models with a high
Silhouette Coefficient are said to be dense, where samples in the same
cluster are similar to each other, and well separated, where samples in
different clusters are not very similar to each other.
The Silhouette Coefficient is calculated using the mean intra-cluster
distance (a) and the mean nearest-cluster distance (b) for each sample.
The Silhouette Coefficient for a sample is ``(b - a) / max(a, b)``.
This function returns the Silhoeutte Coefficient for each sample.
The best value is 1 and the worst value is -1. Values near 0 indicate
overlapping clusters.
Parameters
----------
X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
[n_samples_a, n_features] otherwise
Array of pairwise distances between samples, or a feature array.
labels : array, shape = [n_samples]
label values for each sample
metric : string, or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options
allowed by metrics.pairwise.pairwise_distances. If X is the distance
array itself, use "precomputed" as the metric.
`**kwds` : optional keyword parameters
Any further parameters are passed directly to the distance function.
If using a scipy.spatial.distance metric, the parameters are still
metric dependent. See the scipy docs for usage examples.
Returns
-------
silhouette : array, shape = [n_samples]
Silhouette Coefficient for each samples.
References
----------
Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
Interpretation and Validation of Cluster Analysis". Computational
and Applied Mathematics 20: 53-65. doi:10.1016/0377-0427(87)90125-7.
http://en.wikipedia.org/wiki/Silhouette_(clustering)
"""
distances = pairwise_distances(X, metric=metric, **kwds)
n = labels.shape[0]
A = np.array([_intra_cluster_distance(distances[i], labels, i)
for i in range(n)])
B = np.array([_nearest_cluster_distance(distances[i], labels, i)
for i in range(n)])
return (B - A) / np.maximum(A, B)
def _intra_cluster_distance(distances_row, labels, i):
"""Calculate the mean intra-cluster distance for sample i.
Parameters
----------
distances_row : array, shape = [n_samples]
Pairwise distance matrix between sample i and each sample.
labels : array, shape = [n_samples]
label values for each sample
i : int
Sample index being calculated. It is excluded from calculation and
used to determine the current label
Returns
-------
a : float
Mean intra-cluster distance for sample i
"""
mask = labels == labels[i]
mask[i] = False
a = np.mean(distances_row[mask])
return a
def _nearest_cluster_distance(distances_row, labels, i):
"""Calculate the mean nearest-cluster distance for sample i.
Parameters
----------
distances_row : array, shape = [n_samples]
Pairwise distance matrix between sample i and each sample.
labels : array, shape = [n_samples]
label values for each sample
i : int
Sample index being calculated. It is used to determine the current
label.
Returns
-------
b : float
Mean nearest-cluster distance for sample i
"""
label = labels[i]
b = np.min([np.mean(distances_row[labels == cur_label])
for cur_label in set(labels) if not cur_label == label])
return b
|