1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
|
import warnings
import numpy as np
from sklearn.metrics import silhouette_score, adjusted_mutual_info_score, silhouette_samples
from Orange.data import Table
from Orange.evaluation.testing import Results, Validation
from Orange.evaluation.scoring import Score
# Only the evaluation class is exported by a star-import; the score classes
# and graph_silhouette defined in this module must be imported by name.
__all__ = ['ClusteringEvaluation']
class ClusteringResults(Results):
    """Evaluation results for clustering, sliceable by fold (run)."""

    def get_fold(self, fold):
        """
        Build a plain `Results` object holding the data of a single fold.

        :param fold: index of the fold; used to index the first axis of
            `self.models` and the second axis of `self.predicted`.
        :return: a new `Results` instance sharing `data`, `row_indices`,
            `actual` and `domain` with this object.
        :raises ValueError: if this instance has no folds.
        """
        single = Results()
        single.data = self.data

        if self.folds is None:
            raise ValueError("This 'Results' instance does not have folds.")

        # Models are optional; they are copied only when they were stored.
        if self.models is not None:
            single.models = self.models[fold]

        # These attributes are not fold-specific and are passed on unchanged.
        for shared in ("row_indices", "actual"):
            setattr(single, shared, getattr(self, shared))

        # Keep the first and last axes, select the requested fold in between.
        single.predicted = self.predicted[:, fold, :]
        single.domain = self.domain
        return single
class ClusteringScore(Score):
    """
    Base class for clustering scores.

    Subclasses set `considers_actual` to choose what the scoring function
    receives as its first argument: the actual labels (`True`) or the data
    matrix `results.data.X` (`False`).
    """
    considers_actual = False

    @staticmethod
    def is_compatible(domain) -> bool:
        """Return True for every domain."""
        return True

    # pylint: disable=arguments-differ
    def from_predicted(self, results, score_function):
        """
        Apply `score_function` to every entry of `results.predicted`.

        :param results: object providing `predicted` and, depending on
            `considers_actual`, either `actual` or `data.X`.
        :param score_function: callable taking (reference, labels) and
            returning a number.
        :return: float64 array with one score per entry along the first
            axis of `results.predicted`.
        """
        if self.considers_actual:
            # Clustering scores from labels
            def score_one(labels):
                return score_function(results.actual.flatten(),
                                      labels.flatten())
        else:
            # Clustering scores from data only
            def score_one(labels):
                return score_function(results.data.X, labels.flatten())

        n_scores = len(results.predicted)
        return np.fromiter(map(score_one, results.predicted),
                           dtype=np.float64, count=n_scores)
class Silhouette(ClusteringScore):
    """Silhouette score computed from the data matrix and predicted labels."""
    # NOTE(review): presumably makes the base Score evaluate each fold
    # separately -- confirm against Orange.evaluation.scoring.Score.
    separate_folds = True

    def compute_score(self, results):
        """Return the silhouette score for each entry of `results.predicted`."""
        scores = self.from_predicted(results, silhouette_score)
        return scores
class AdjustedMutualInfoScore(ClusteringScore):
    """Adjusted mutual information between actual and predicted labels."""
    # NOTE(review): presumably makes the base Score evaluate each fold
    # separately -- confirm against Orange.evaluation.scoring.Score.
    separate_folds = True
    # The scoring function is given the actual labels instead of the data.
    considers_actual = True

    def compute_score(self, results):
        """Return the adjusted mutual information for each entry of
        `results.predicted`."""
        scores = self.from_predicted(results, adjusted_mutual_info_score)
        return scores
# Class overrides __call__ and doesn't need to define the abstract get_indices
# pylint: disable=abstract-method
class ClusteringEvaluation(Validation):
    """
    Clustering evaluation.

    Every learner is fitted on the complete data `k` times and the cluster
    labels of each run are collected.

    .. attribute:: k

        The number of runs.
    """

    def __init__(self, k=1, store_data=False, store_models=False):
        super().__init__(store_data=store_data, store_models=store_models)
        self.k = k

    def __call__(self, data, learners, preprocessor=None, *, callback=None):
        """
        Run all learners `k` times on `data` and gather their labels.

        :param data: data to cluster; it is passed to `learner.get_model`.
        :param learners: sequence of learners providing `get_model(data)`;
            the returned model must expose `labels`.
        :param preprocessor: accepted but not used by this method.
        :param callback: accepted but not used by this method.
        :return: `ClusteringResults` whose `predicted` array has the shape
            (number of learners, k, number of data rows).
        """
        n_rows = len(data)
        n_learners = len(learners)

        res = ClusteringResults()
        res.data = data
        res.predicted = np.empty((n_learners, self.k, n_rows))
        res.folds = range(self.k)
        res.row_indices = np.arange(n_rows)
        if hasattr(data, "Y"):
            res.actual = data.Y.flatten()
        else:
            res.actual = None

        keep_models = self.store_models
        if keep_models:
            # Object array indexed as [run, learner], pre-filled with None.
            res.models = np.tile(None, (self.k, n_learners))

        for run in range(self.k):
            for learner_idx, learner in enumerate(learners):
                fitted = learner.get_model(data)
                if self.store_models:
                    res.models[run, learner_idx] = fitted
                res.predicted[learner_idx, run, :] = fitted.labels

        return res
def graph_silhouette(X, y, xlim=None, colors=None, figsize=None, filename=None):
    """
    Silhouette plot.

    One horizontal bar is drawn per sample. Samples are grouped by cluster
    (clusters in ascending label order) and, within a cluster, ordered by
    increasing silhouette coefficient.

    :param X Orange.data.Table or numpy.ndarray:
        Data table.
    :param y Orange.data.Table or numpy.ndarray:
        Cluster labels (integers). The labels do not need to be the
        consecutive values 0..N-1.
    :param xlim tuple (float, float):
        Limit x-axis values.
    :param colors list, optional (default = None):
        List of colors. If provided, its length must equal the number of
        clusters; otherwise a message is written to stderr and nothing
        is plotted.
    :param figsize tuple (float, float):
        Figure size (width, height) in inches.
    :param filename:
        Output file name. If given, the figure is saved and closed;
        otherwise it is shown.
    """
    # If the module is not there, let the user install it
    # pylint: disable=import-error
    import matplotlib.pyplot as plt

    if isinstance(X, Table):
        X = X.X
    if isinstance(y, Table):
        y = y.X
    # Accept any array-like and always work on a flat label vector.
    y = np.asarray(y).ravel()

    # Detect the clusters and set colors
    labels = np.unique(y)
    N = len(labels)
    if colors is None:
        colors = ["g" if i % 2 else "b" for i in range(N)]
    elif len(colors) != N:
        import sys
        sys.stderr.write("Number of colors does not match the number of clusters. \n")
        return

    # Silhouette coefficients
    s = silhouette_samples(X, y)

    # Pick every cluster with a boolean mask and sort its scores. The mask
    # avoids computing slice offsets into a label-sorted array, which
    # previously used only the preceding cluster's size as the start offset
    # and therefore selected wrong samples for the third and later clusters.
    parts = [(np.sort(s[y == label]), label) for label in labels]

    # Plot data
    if figsize:
        plt.figure(figsize=figsize)
    else:
        plt.figure()
    plt.title("Silhouette score")

    total = 0
    centers = []
    for color, (scores, _) in zip(colors, parts):
        plt.barh(range(total, total + len(scores)),
                 scores, color=color, edgecolor=color)
        centers.append(total + len(scores) / 2)
        total += len(scores)

    if xlim is not None:
        plt.xlim(xlim)

    # Show integral float labels (e.g. taken from a Table) as integers.
    tick_labels = labels
    if np.issubdtype(labels.dtype, np.floating) \
            and np.all(np.mod(labels, 1) == 0):
        tick_labels = labels.astype(int)

    plt.yticks(centers)
    plt.gca().set_yticklabels([str(label) for label in tick_labels])
    plt.ylabel("Cluster label")

    if filename:
        plt.savefig(filename)
        plt.close()
    else:
        plt.show()
|