File: test_nearest_centroid.py

"""
Testing for the nearest centroid module.
"""

import numpy as np
import pytest

from sklearn import datasets
from sklearn.neighbors import NearestCentroid
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS

# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
true_result = [-1, 1, 1]
true_result_prior1 = [-1, 1, 1]

true_discriminant_scores = [-32, 64, 80]
true_proba = [[1, 1.26642e-14], [1.60381e-28, 1], [1.80485e-35, 1]]
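# Informal sanity check (an observation about the toy data, not the estimator's
# documented formula): with equal priors the discriminant scores above match
#   (||x - centroid_neg||**2 - ||x - centroid_pos||**2) / s**2
# with class centroids (-4/3, -4/3) and (4/3, 4/3) and pooled within-class
# variance s**2 = 1/3, giving -32, 64 and 80 for the three test points;
# `true_proba` is then the logistic sigmoid of these scores.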


# also load the iris dataset
# and randomly permute it
iris = datasets.load_iris()
rng = np.random.RandomState(1)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
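# (load_iris returns the samples grouped by class; the permutation above simply
# shuffles them.)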


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_classification_toy(csr_container):
    # Check classification on a toy dataset, including sparse versions.
    X_csr = csr_container(X)
    T_csr = csr_container(T)

    clf = NearestCentroid()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_array_almost_equal(clf.decision_function(T), true_discriminant_scores)
    assert_array_almost_equal(clf.predict_proba(T), true_proba)

    # Test uniform priors
    clf = NearestCentroid(priors="uniform")
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_array_almost_equal(clf.decision_function(T), true_discriminant_scores)
    assert_array_almost_equal(clf.predict_proba(T), true_proba)
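    # With a balanced toy set (three samples per class), the "empirical" priors
    # used below coincide with the "uniform" ones, so the expected outputs are
    # identical.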

    clf = NearestCentroid(priors="empirical")
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_array_almost_equal(clf.decision_function(T), true_discriminant_scores)
    assert_array_almost_equal(clf.predict_proba(T), true_proba)

    # Test custom priors
    clf = NearestCentroid(priors=[0.25, 0.75])
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result_prior1)

    # Same test, but with a sparse matrix to fit and test.
    clf = NearestCentroid()
    clf.fit(X_csr, y)
    assert_array_equal(clf.predict(T_csr), true_result)

    # Fit with sparse, test with non-sparse
    clf = NearestCentroid()
    clf.fit(X_csr, y)
    assert_array_equal(clf.predict(T), true_result)

    # Fit with non-sparse, test with sparse
    clf = NearestCentroid()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T_csr), true_result)

    # Fit and predict with non-CSR sparse matrices
    clf = NearestCentroid()
    clf.fit(X_csr.tocoo(), y)
    assert_array_equal(clf.predict(T_csr.tolil()), true_result)


def test_iris():
    # Check consistency on the iris dataset.
    for metric in ("euclidean", "manhattan"):
        clf = NearestCentroid(metric=metric).fit(iris.data, iris.target)
        score = np.mean(clf.predict(iris.data) == iris.target)
        assert score > 0.9, "Failed with score = " + str(score)


def test_iris_shrinkage():
    # Check consistency on the iris dataset when using shrinkage.
    for metric in ("euclidean", "manhattan"):
        for shrink_threshold in [None, 0.1, 0.5]:
            clf = NearestCentroid(metric=metric, shrink_threshold=shrink_threshold)
            clf = clf.fit(iris.data, iris.target)
            score = np.mean(clf.predict(iris.data) == iris.target)
            assert score > 0.8, "Failed with score = " + str(score)


def test_pickle():
    import pickle

    # classification
    obj = NearestCentroid()
    obj.fit(iris.data, iris.target)
    score = obj.score(iris.data, iris.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert type(obj2) is obj.__class__
    score2 = obj2.score(iris.data, iris.target)
    assert_array_equal(
        score,
        score2,
        "Failed to generate same score after pickling (classification).",
    )


def test_shrinkage_correct():
    # Ensure that the shrinking is correct.
    # The expected result was computed with the R package pamr, which was
    # implemented by the author of the original paper.
    # (One needs to modify pamr.predict to output the new centroids.)
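    # Roughly, shrinking soft-thresholds the standardized deviation of each
    # class centroid from the overall centroid by `shrink_threshold`, pulling
    # the per-class centroids toward the overall mean (Tibshirani et al. 2002,
    # "Diagnosis of multiple cancer types by shrunken centroids of gene
    # expression").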

    X = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]])
    y = np.array([1, 1, 2, 2, 2])
    clf = NearestCentroid(shrink_threshold=0.1)
    clf.fit(X, y)
    expected_result = np.array([[0.7787310, 0.8545292], [2.814179, 2.763647]])
    np.testing.assert_array_almost_equal(clf.centroids_, expected_result)


def test_shrinkage_threshold_decoded_y():
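    # The fitted centroids should not depend on how the labels are encoded:
    # {0, 1} labels and the original {-1, 1} labels must give the same result.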
    clf = NearestCentroid(shrink_threshold=0.01)
    y_ind = np.asarray(y)
    y_ind[y_ind == -1] = 0
    clf.fit(X, y_ind)
    centroid_encoded = clf.centroids_
    clf.fit(X, y)
    assert_array_equal(centroid_encoded, clf.centroids_)


def test_predict_translated_data():
    # Test that NearestCentroid gives the same results on translated data.

    rng = np.random.RandomState(0)
    X = rng.rand(50, 50)
    y = rng.randint(0, 3, 50)
    noise = rng.rand(50)
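    # noise has shape (50,), so adding it to X broadcasts over the rows and
    # shifts every feature by a constant; centroid-based predictions should be
    # unaffected by such a translation.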
    clf = NearestCentroid(shrink_threshold=0.1)
    clf.fit(X, y)
    y_init = clf.predict(X)
    clf = NearestCentroid(shrink_threshold=0.1)
    X_noise = X + noise
    clf.fit(X_noise, y)
    y_translate = clf.predict(X_noise)
    assert_array_equal(y_init, y_translate)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_manhattan_metric(csr_container):
    # Test the manhattan metric.
    X_csr = csr_container(X)

    clf = NearestCentroid(metric="manhattan")
    clf.fit(X, y)
    dense_centroid = clf.centroids_
    clf.fit(X_csr, y)
    assert_array_equal(clf.centroids_, dense_centroid)
    assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])
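    # The expected centroids checked above are the feature-wise class medians:
    # with the manhattan metric the centroid of each class is its median, which
    # for the toy data is [-1, -1] and [1, 1] for both dense and sparse inputs.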


def test_features_zero_var():
    # Test that features with zero variance raise an error.

    X = np.empty((10, 2))
    X[:, 0] = -0.13725701
    X[:, 1] = -0.9853293
    y = np.zeros(10)
    y[0] = 1

    clf = NearestCentroid(shrink_threshold=0.1)
    with pytest.raises(ValueError):
        clf.fit(X, y)


def test_negative_priors_error():
    """Check that we raise an error when the user-defined priors are negative."""
    clf = NearestCentroid(priors=[-2, 4])
    with pytest.raises(ValueError, match="priors must be non-negative"):
        clf.fit(X, y)


def test_warn_non_normalized_priors():
    """Check that we raise a warning and normalize the user-defined priors when they
    don't sum to 1.
    """
    priors = [2, 4]
    clf = NearestCentroid(priors=priors)
    with pytest.warns(
        UserWarning,
        match="The priors do not sum to 1. Normalizing such that it sums to one.",
    ):
        clf.fit(X, y)

    assert_allclose(clf.class_prior_, np.asarray(priors) / np.asarray(priors).sum())


@pytest.mark.parametrize(
    "response_method", ["decision_function", "predict_proba", "predict_log_proba"]
)
def test_method_not_available_with_manhattan(response_method):
    """Check that we raise an AttributeError with Manhattan metric when trying
    to call a non-thresholded response method.
    """
    clf = NearestCentroid(metric="manhattan").fit(X, y)
    with pytest.raises(AttributeError):
        getattr(clf, response_method)(T)


@pytest.mark.parametrize("array_constructor", [np.array] + CSR_CONTAINERS)
def test_error_zero_variances(array_constructor):
    """Check that we raise an error when the variance for all features is zero."""
    X = np.ones((len(y), 2))
    X[:, 1] *= 2
    X = array_constructor(X)

    clf = NearestCentroid()
    with pytest.raises(ValueError, match="All features have zero variance"):
        clf.fit(X, y)