File: test_pairwise.py

package info (click to toggle)
imbalanced-learn 0.12.4-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,160 kB
  • sloc: python: 17,221; sh: 481; makefile: 187; javascript: 50
file content (172 lines) | stat: -rw-r--r-- 6,395 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""Test for the metrics that perform pairwise distance computation."""

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

import numpy as np
import pytest
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.utils._testing import _convert_container

from imblearn.metrics.pairwise import ValueDifferenceMetric


@pytest.fixture
def data():
    """Build a shuffled categorical dataset with a random binary target.

    Returns
    -------
    X : ndarray of shape (60, 3), dtype=object
        Three string-valued categorical features holding respectively 3, 2
        and 4 distinct categories.
    y : ndarray of shape (60,), dtype=object
        Labels drawn at random among "not apple" and "apple".
    """
    rng = np.random.RandomState(0)

    # one inner list per feature; transposed below to get the usual
    # (n_samples, n_features) layout
    columns = [
        ["A"] * 10 + ["B"] * 20 + ["C"] * 30,
        ["A"] * 40 + ["B"] * 20,
        ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10,
    ]
    X = np.array(columns, dtype=object).T

    # shuffle the rows first and draw the labels afterwards: both operations
    # consume the same random state, so their order matters
    rng.shuffle(X)
    class_indices = rng.randint(low=0, high=2, size=X.shape[0])

    class_names = np.array(["not apple", "apple"], dtype=object)
    return X, class_names[class_indices]


@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
@pytest.mark.parametrize("k, r", [(1, 1), (1, 2), (2, 1), (2, 2)])
@pytest.mark.parametrize("y_type", ["list", "array"])
@pytest.mark.parametrize("encode_label", [True, False])
def test_value_difference_metric(data, dtype, k, r, y_type, encode_label):
    # Sanity checks on the metric:
    # * the distance matrix has shape (n_samples, n_samples);
    # * calling `pairwise` with X alone gives the same result as explicitly
    #   passing X twice.
    X, y = data
    n_samples = X.shape[0]

    # exercise the different target containers, optionally integer-encoded
    target = _convert_container(y, y_type)
    if encode_label:
        target = LabelEncoder().fit_transform(target)

    # the metric is fitted on ordinal-encoded categories
    X_encoded = OrdinalEncoder(dtype=dtype).fit_transform(X)

    vdm = ValueDifferenceMetric(k=k, r=r)
    vdm.fit(X_encoded, target)

    implicit_dist = vdm.pairwise(X_encoded)
    explicit_dist = vdm.pairwise(X_encoded, X_encoded)

    np.testing.assert_allclose(implicit_dist, explicit_dist)
    for dist in (implicit_dist, explicit_dist):
        assert dist.shape == (n_samples, n_samples)


@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
@pytest.mark.parametrize("k, r", [(1, 1), (1, 2), (2, 1), (2, 2)])
@pytest.mark.parametrize("y_type", ["list", "array"])
@pytest.mark.parametrize("encode_label", [True, False])
def test_value_difference_metric_property(dtype, k, r, y_type, encode_label):
    # Check the behaviour of the VDM distance on the example given in
    # "Improved Heterogeneous Distance Functions", D.R. Wilson and
    # T.R. Martinez, Journal of Artificial Intelligence Research 6 (1997) 1-34
    # https://arxiv.org/pdf/cs/9701101.pdf
    #
    # "if an attribute color has three values red, green and blue, and the
    # application is to identify whether or not an object is an apple, red and
    # green would be considered closer than red and blue because the former two
    # both have similar correlations with the output class apple."

    colors = ("green", "red", "blue")

    # a single "color" feature with 10 samples per color
    X = np.repeat(colors, 10).reshape(-1, 1)
    # 0 - not an apple / 1 - an apple
    # green: 8 apples out of 10, red: 7 out of 10, blue: 1 out of 10
    class_indices = np.repeat([1, 0, 1, 0, 1], [8, 5, 7, 9, 1])
    class_names = np.array(["not apple", "apple"], dtype=object)
    y = _convert_container(class_names[class_indices], y_type)
    if encode_label:
        y = LabelEncoder().fit_transform(y)

    encoder = OrdinalEncoder(dtype=dtype)
    X_encoded = encoder.fit_transform(X)

    vdm = ValueDifferenceMetric(k=k, r=r)
    vdm.fit(X_encoded, y)

    # one encoded single-sample array per color
    samples = {color: encoder.transform([[color]]) for color in colors}

    for sample in samples.values():
        # the distance between a sample and itself (i.e. the same category)
        # should be null
        self_dist = vdm.pairwise(sample).squeeze()
        assert self_dist == pytest.approx(0)

    # check the property explained in the introduction example
    green_red = vdm.pairwise(samples["green"], samples["red"]).squeeze()
    blue_red = vdm.pairwise(samples["blue"], samples["red"]).squeeze()
    blue_green = vdm.pairwise(samples["blue"], samples["green"]).squeeze()

    # green and red are the closest pair
    assert green_red < blue_red
    assert green_red < blue_green
    # blue is closer to red than it is to green
    assert blue_red < blue_green


def test_value_difference_metric_categories(data):
    # Letting the metric infer the number of categories (default parameters)
    # must be equivalent to passing the number of categories explicitly.
    X, y = data

    encoder = OrdinalEncoder(dtype=np.int32)
    X_encoded = encoder.fit_transform(X)
    # number of categories seen by the encoder for each feature
    expected_n_categories = np.array(list(map(len, encoder.categories_)))

    vdm_auto = ValueDifferenceMetric().fit(X_encoded, y)
    vdm_explicit = ValueDifferenceMetric(n_categories=expected_n_categories)
    vdm_explicit.fit(X_encoded, y)

    inferred = vdm_auto.n_categories_
    np.testing.assert_array_equal(inferred, expected_n_categories)
    np.testing.assert_array_equal(inferred, vdm_explicit.n_categories_)


def test_value_difference_metric_categories_error(data):
    # `fit` must raise a ValueError when the length of `n_categories` does not
    # match the number of features in X.
    X, y = data
    X_encoded = OrdinalEncoder(dtype=np.int32).fit_transform(X)

    # only two entries whereas X has three features
    vdm = ValueDifferenceMetric(n_categories=[1, 2])

    expected_msg = "The length of n_categories is not consistent with the number"
    with pytest.raises(ValueError, match=expected_msg):
        vdm.fit(X_encoded, y)


def test_value_difference_metric_missing_categories(data):
    # Check that fitting does not fail when a category between 0 and
    # n_categories - 1 never occurs in the training data, and that the
    # per-feature probability tables keep one row per declared category.
    X, y = data

    encoder = OrdinalEncoder(dtype=np.int32)
    X_encoded = encoder.fit_transform(X)
    # number of categories per feature, computed before removing a category
    n_categories = np.array([len(cat) for cat in encoder.categories_])

    # Remove category 1 from the last feature only. The column index is
    # required: a row-only boolean mask would overwrite every feature of the
    # selected rows instead of just the last one.
    X_encoded[X_encoded[:, -1] == 1, -1] = 0
    np.testing.assert_array_equal(np.unique(X_encoded[:, -1]), [0, 2, 3])

    vdm = ValueDifferenceMetric(n_categories=n_categories)
    vdm.fit(X_encoded, y)

    # one (n_categories, n_classes) table is expected per feature, including
    # the feature with a missing category
    for n_cats, proba in zip(n_categories, vdm.proba_per_class_):
        assert proba.shape == (n_cats, len(np.unique(y)))


def test_value_difference_value_unfitted(data):
    # `pairwise` must raise a NotFittedError when `fit` has not been called
    # beforehand.
    X, _ = data
    X_encoded = OrdinalEncoder(dtype=np.int32).fit_transform(X)

    unfitted_vdm = ValueDifferenceMetric()
    with pytest.raises(NotFittedError):
        unfitted_vdm.pairwise(X_encoded)