File: test_stats.py

package info (click to toggle)
scikit-learn 1.7.2%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: forky
  • size: 25,752 kB
  • sloc: python: 219,120; cpp: 5,790; ansic: 846; makefile: 190; javascript: 110
file content (352 lines) | stat: -rw-r--r-- 12,576 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from pytest import approx

from sklearn._config import config_context
from sklearn.utils._array_api import (
    _convert_to_numpy,
    get_namespace,
    yield_namespace_device_dtype_combinations,
)
from sklearn.utils._array_api import device as array_device
from sklearn.utils.estimator_checks import _array_api_for_tests
from sklearn.utils.fixes import np_version, parse_version
from sklearn.utils.stats import _averaged_weighted_percentile, _weighted_percentile


def test_averaged_weighted_median():
    y = np.array([0, 1, 2, 3, 4, 5])
    sw = np.array([1, 1, 1, 1, 1, 1])

    score = _averaged_weighted_percentile(y, sw, 50)

    assert score == np.median(y)


def test_averaged_weighted_percentile(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    y = rng.randint(20, size=10)

    sw = np.ones(10)

    score = _averaged_weighted_percentile(y, sw, 20)

    assert score == np.percentile(y, 20, method="averaged_inverted_cdf")


def test_averaged_and_weighted_percentile():
    y = np.array([0, 1, 2])
    sw = np.array([5, 1, 5])
    q = 50

    score_averaged = _averaged_weighted_percentile(y, sw, q)
    score = _weighted_percentile(y, sw, q)

    assert score_averaged == score


def test_weighted_percentile():
    """Check `weighted_percentile` on artificial data with obvious median."""
    y = np.empty(102, dtype=np.float64)
    y[:50] = 0
    y[-51:] = 2
    y[-1] = 100000
    y[50] = 1
    sw = np.ones(102, dtype=np.float64)
    sw[-1] = 0.0
    value = _weighted_percentile(y, sw, 50)
    assert approx(value) == 1


def test_weighted_percentile_equal():
    """Check `weighted_percentile` with all weights equal to 1."""
    y = np.empty(102, dtype=np.float64)
    y.fill(0.0)
    sw = np.ones(102, dtype=np.float64)
    score = _weighted_percentile(y, sw, 50)
    assert approx(score) == 0


def test_weighted_percentile_zero_weight():
    """Check `weighted_percentile` with all weights equal to 0."""
    y = np.empty(102, dtype=np.float64)
    y.fill(1.0)
    sw = np.ones(102, dtype=np.float64)
    sw.fill(0.0)
    value = _weighted_percentile(y, sw, 50)
    assert approx(value) == 1.0


def test_weighted_percentile_zero_weight_zero_percentile():
    """Check `weighted_percentile(percentile_rank=0)` behaves correctly.

    Ensures that (leading)zero-weight observations ignored when `percentile_rank=0`.
    See #20528 for details.
    """
    y = np.array([0, 1, 2, 3, 4, 5])
    sw = np.array([0, 0, 1, 1, 1, 0])
    value = _weighted_percentile(y, sw, 0)
    assert approx(value) == 2

    value = _weighted_percentile(y, sw, 50)
    assert approx(value) == 3

    value = _weighted_percentile(y, sw, 100)
    assert approx(value) == 4


def test_weighted_median_equal_weights(global_random_seed):
    """Checks `_weighted_percentile(percentile_rank=50)` is the same as `np.median`.

    `sample_weights` are all 1s and the number of samples is odd.
    When number of samples is odd, `_weighted_percentile` always falls on a single
    observation (not between 2 values, in which case the lower value would be taken)
    and is thus equal to `np.median`.
    For an even number of samples, this check will not always hold as (note that
    for some other percentile methods it will always hold). See #17370 for details.
    """
    rng = np.random.RandomState(global_random_seed)
    x = rng.randint(10, size=11)
    weights = np.ones(x.shape)
    median = np.median(x)
    w_median = _weighted_percentile(x, weights)
    assert median == approx(w_median)


def test_weighted_median_integer_weights(global_random_seed):
    # Checks average weighted percentile_rank=0.5 is same as median when manually weight
    # data
    rng = np.random.RandomState(global_random_seed)
    x = rng.randint(20, size=10)
    weights = rng.choice(5, size=10)
    x_manual = np.repeat(x, weights)
    median = np.median(x_manual)
    w_median = _averaged_weighted_percentile(x, weights)
    assert median == approx(w_median)


def test_weighted_percentile_2d(global_random_seed):
    # Check for when array 2D and sample_weight 1D
    rng = np.random.RandomState(global_random_seed)
    x1 = rng.randint(10, size=10)
    w1 = rng.choice(5, size=10)

    x2 = rng.randint(20, size=10)
    x_2d = np.vstack((x1, x2)).T

    w_median = _weighted_percentile(x_2d, w1)
    p_axis_0 = [_weighted_percentile(x_2d[:, i], w1) for i in range(x_2d.shape[1])]
    assert_allclose(w_median, p_axis_0)
    # Check when array and sample_weight both 2D
    w2 = rng.choice(5, size=10)
    w_2d = np.vstack((w1, w2)).T

    w_median = _weighted_percentile(x_2d, w_2d)
    p_axis_0 = [
        _weighted_percentile(x_2d[:, i], w_2d[:, i]) for i in range(x_2d.shape[1])
    ]
    assert_allclose(w_median, p_axis_0)


@pytest.mark.parametrize(
    "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize(
    "data, weights, percentile",
    [
        # NumPy scalars input (handled as 0D arrays on array API)
        (np.float32(42), np.int32(1), 50),
        # Random 1D array, constant weights
        (lambda rng: rng.rand(50), np.ones(50).astype(np.int32), 50),
        # Random 2D array and random 1D weights
        (lambda rng: rng.rand(50, 3), lambda rng: rng.rand(50).astype(np.float32), 75),
        # Random 2D array and random 2D weights
        (
            lambda rng: rng.rand(20, 3),
            lambda rng: rng.rand(20, 3).astype(np.float32),
            25,
        ),
        # zero-weights and `rank_percentile=0` (#20528) (`sample_weight` dtype: int64)
        (np.array([0, 1, 2, 3, 4, 5]), np.array([0, 0, 1, 1, 1, 0]), 0),
        # np.nan's in data and some zero-weights (`sample_weight` dtype: int64)
        (np.array([np.nan, np.nan, 0, 3, 4, 5]), np.array([0, 1, 1, 1, 1, 0]), 0),
        # `sample_weight` dtype: int32
        (
            np.array([0, 1, 2, 3, 4, 5]),
            np.array([0, 1, 1, 1, 1, 0], dtype=np.int32),
            25,
        ),
    ],
)
def test_weighted_percentile_array_api_consistency(
    global_random_seed, array_namespace, device, dtype_name, data, weights, percentile
):
    """Check `_weighted_percentile` gives consistent results with array API."""
    if array_namespace == "array_api_strict":
        try:
            import array_api_strict
        except ImportError:
            pass
        else:
            if device == array_api_strict.Device("device1"):
                # See https://github.com/data-apis/array-api-strict/issues/134
                pytest.xfail(
                    "array_api_strict has bug when indexing with tuple of arrays "
                    "on non-'CPU_DEVICE' devices."
                )

    xp = _array_api_for_tests(array_namespace, device)

    # Skip test for percentile=0 edge case (#20528) on namespace/device where
    # xp.nextafter is broken. This is the case for torch with MPS device:
    # https://github.com/pytorch/pytorch/issues/150027
    zero = xp.zeros(1, device=device)
    one = xp.ones(1, device=device)
    if percentile == 0 and xp.all(xp.nextafter(zero, one) == zero):
        pytest.xfail(f"xp.nextafter is broken on {device}")

    rng = np.random.RandomState(global_random_seed)
    X_np = data(rng) if callable(data) else data
    weights_np = weights(rng) if callable(weights) else weights
    # Ensure `data` of correct dtype
    X_np = X_np.astype(dtype_name)

    result_np = _weighted_percentile(X_np, weights_np, percentile)
    # Convert to Array API arrays
    X_xp = xp.asarray(X_np, device=device)
    weights_xp = xp.asarray(weights_np, device=device)

    with config_context(array_api_dispatch=True):
        result_xp = _weighted_percentile(X_xp, weights_xp, percentile)
        assert array_device(result_xp) == array_device(X_xp)
        assert get_namespace(result_xp)[0] == get_namespace(X_xp)[0]
        result_xp_np = _convert_to_numpy(result_xp, xp=xp)

    assert result_xp_np.dtype == result_np.dtype
    assert result_xp_np.shape == result_np.shape
    assert_allclose(result_np, result_xp_np)

    # Check dtype correct (`sample_weight` should follow `array`)
    if dtype_name == "float32":
        assert result_xp_np.dtype == result_np.dtype == np.float32
    else:
        assert result_xp_np.dtype == np.float64


@pytest.mark.parametrize("sample_weight_ndim", [1, 2])
def test_weighted_percentile_nan_filtered(sample_weight_ndim, global_random_seed):
    """Test that calling _weighted_percentile on an array with nan values returns
    the same results as calling _weighted_percentile on a filtered version of the data.
    We test both with sample_weight of the same shape as the data and with
    one-dimensional sample_weight."""

    rng = np.random.RandomState(global_random_seed)
    array_with_nans = rng.rand(100, 10)
    array_with_nans[rng.rand(*array_with_nans.shape) < 0.5] = np.nan
    nan_mask = np.isnan(array_with_nans)

    if sample_weight_ndim == 2:
        sample_weight = rng.randint(1, 6, size=(100, 10))
    else:
        sample_weight = rng.randint(1, 6, size=(100,))

    # Find the weighted percentile on the array with nans:
    results = _weighted_percentile(array_with_nans, sample_weight, 30)

    # Find the weighted percentile on the filtered array:
    filtered_array = [
        array_with_nans[~nan_mask[:, col], col]
        for col in range(array_with_nans.shape[1])
    ]
    if sample_weight.ndim == 1:
        sample_weight = np.repeat(sample_weight, array_with_nans.shape[1]).reshape(
            array_with_nans.shape[0], array_with_nans.shape[1]
        )
    filtered_weights = [
        sample_weight[~nan_mask[:, col], col] for col in range(array_with_nans.shape[1])
    ]

    expected_results = np.array(
        [
            _weighted_percentile(filtered_array[col], filtered_weights[col], 30)
            for col in range(array_with_nans.shape[1])
        ]
    )

    assert_array_equal(expected_results, results)


def test_weighted_percentile_all_nan_column():
    """Check that nans are ignored in general, except for all NaN columns."""

    array = np.array(
        [
            [np.nan, 5],
            [np.nan, 1],
            [np.nan, np.nan],
            [np.nan, np.nan],
            [np.nan, 2],
            [np.nan, np.nan],
        ]
    )
    weights = np.ones_like(array)
    percentile_rank = 90

    values = _weighted_percentile(array, weights, percentile_rank)

    # The percentile of the second column should be `5` even though there are many nan
    # values present; the percentile of the first column can only be nan, since there
    # are no other possible values:
    assert np.array_equal(values, np.array([np.nan, 5]), equal_nan=True)


@pytest.mark.skipif(
    np_version < parse_version("2.0"),
    reason="np.quantile only accepts weights since version 2.0",
)
@pytest.mark.parametrize("percentile", [66, 10, 50])
def test_weighted_percentile_like_numpy_quantile(percentile, global_random_seed):
    """Check that _weighted_percentile delivers equivalent results as np.quantile
    with weights."""

    rng = np.random.RandomState(global_random_seed)
    array = rng.rand(10, 100)
    sample_weight = rng.randint(1, 6, size=(10, 100))

    percentile_weighted_percentile = _weighted_percentile(
        array, sample_weight, percentile
    )
    percentile_numpy_quantile = np.quantile(
        array, percentile / 100, weights=sample_weight, axis=0, method="inverted_cdf"
    )

    assert_array_equal(percentile_weighted_percentile, percentile_numpy_quantile)


@pytest.mark.skipif(
    np_version < parse_version("2.0"),
    reason="np.nanquantile only accepts weights since version 2.0",
)
@pytest.mark.parametrize("percentile", [66, 10, 50])
def test_weighted_percentile_like_numpy_nanquantile(percentile, global_random_seed):
    """Check that _weighted_percentile delivers equivalent results as np.nanquantile
    with weights."""

    rng = np.random.RandomState(global_random_seed)
    array_with_nans = rng.rand(10, 100)
    array_with_nans[rng.rand(*array_with_nans.shape) < 0.5] = np.nan
    sample_weight = rng.randint(1, 6, size=(10, 100))

    percentile_weighted_percentile = _weighted_percentile(
        array_with_nans, sample_weight, percentile
    )
    percentile_numpy_nanquantile = np.nanquantile(
        array_with_nans,
        percentile / 100,
        weights=sample_weight,
        axis=0,
        method="inverted_cdf",
    )

    assert_array_equal(percentile_weighted_percentile, percentile_numpy_nanquantile)