File: test_stats.py

package info (click to toggle)
scikit-learn 1.7.2%2Bdfsg-4
links: PTS, VCS
area: main
in suites: forky
size: 25,752 kB
sloc: python: 219,120; cpp: 5,790; ansic: 846; makefile: 190; javascript: 110
file content (352 lines) | stat: -rw-r--r-- 12,576 bytes
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from pytest import approx

from sklearn._config import config_context
from sklearn.utils._array_api import (
    _convert_to_numpy,
    get_namespace,
    yield_namespace_device_dtype_combinations,
)
from sklearn.utils._array_api import device as array_device
from sklearn.utils.estimator_checks import _array_api_for_tests
from sklearn.utils.fixes import np_version, parse_version
from sklearn.utils.stats import _averaged_weighted_percentile, _weighted_percentile


def test_averaged_weighted_median():
    """Check the averaged weighted median matches `np.median` for unit weights."""
    values = np.arange(6)
    unit_weights = np.ones_like(values)

    expected = np.median(values)
    observed = _averaged_weighted_percentile(values, unit_weights, 50)

    assert observed == expected


def test_averaged_weighted_percentile(global_random_seed):
    """Check the averaged percentile with unit weights against `np.percentile`."""
    n_samples = 10
    random_state = np.random.RandomState(global_random_seed)
    values = random_state.randint(20, size=n_samples)
    unit_weights = np.ones(n_samples)

    # NumPy's "averaged_inverted_cdf" method serves as the unweighted reference.
    expected = np.percentile(values, 20, method="averaged_inverted_cdf")
    observed = _averaged_weighted_percentile(values, unit_weights, 20)

    assert observed == expected


def test_averaged_and_weighted_percentile():
    """Check both percentile variants agree on a small, symmetrically weighted sample."""
    values = np.array([0, 1, 2])
    weights = np.array([5, 1, 5])
    rank = 50

    plain = _weighted_percentile(values, weights, rank)
    averaged = _averaged_weighted_percentile(values, weights, rank)

    assert averaged == plain


def test_weighted_percentile():
    """Check `_weighted_percentile` on artificial data with an obvious median."""
    # 50 zeros, a single 1, 50 twos and one huge outlier in the last position.
    y = np.concatenate(
        (np.zeros(50), np.ones(1), np.full(50, 2.0), np.full(1, 100000.0))
    )
    # Unit weights everywhere except the outlier, which carries zero weight.
    sw = np.append(np.ones(y.shape[0] - 1), 0.0)

    median = _weighted_percentile(y, sw, 50)

    assert approx(median) == 1


def test_weighted_percentile_equal():
    """Check `_weighted_percentile` on constant data with all weights equal to 1."""
    n_samples = 102
    y = np.zeros(n_samples, dtype=np.float64)
    sw = np.ones(n_samples, dtype=np.float64)

    assert approx(_weighted_percentile(y, sw, 50)) == 0


def test_weighted_percentile_zero_weight():
    """Check `_weighted_percentile` when every sample has a weight of 0."""
    n_samples = 102
    y = np.ones(n_samples, dtype=np.float64)
    sw = np.zeros(n_samples, dtype=np.float64)

    assert approx(_weighted_percentile(y, sw, 50)) == 1.0


def test_weighted_percentile_zero_weight_zero_percentile():
    """Check `_weighted_percentile(percentile_rank=0)` behaves correctly.

    Leading zero-weight observations must be ignored when `percentile_rank=0`.
    Ranks 50 and 100 are checked on the same data as well.
    See #20528 for details.
    """
    y = np.array([0, 1, 2, 3, 4, 5])
    sw = np.array([0, 0, 1, 1, 1, 0])

    # (percentile rank, expected value) pairs, all evaluated on the same inputs.
    for percentile_rank, expected in ((0, 2), (50, 3), (100, 4)):
        assert approx(_weighted_percentile(y, sw, percentile_rank)) == expected


def test_weighted_median_equal_weights(global_random_seed):
    """Check `_weighted_percentile` with unit weights agrees with `np.median`.

    The sample weights are all ones and the number of samples is odd. With an
    odd number of samples, `_weighted_percentile` always lands on a single
    observation (rather than between two values, where the lower one would be
    taken), so it equals `np.median`.
    With an even number of samples this equality is not guaranteed (although
    some other percentile methods would always satisfy it). See #17370 for
    details.
    """
    random_state = np.random.RandomState(global_random_seed)
    samples = random_state.randint(10, size=11)
    unit_weights = np.ones(samples.shape)

    expected = np.median(samples)
    # The percentile rank is deliberately left at the function's default.
    observed = _weighted_percentile(samples, unit_weights)

    assert expected == approx(observed)


def test_weighted_median_integer_weights(global_random_seed):
    """Check integer weights act like repeating each sample that many times.

    `_averaged_weighted_percentile` (called without an explicit percentile rank)
    is compared with `np.median` of the data where every value is repeated
    according to its weight.
    """
    random_state = np.random.RandomState(global_random_seed)
    samples = random_state.randint(20, size=10)
    counts = random_state.choice(5, size=10)

    # Manually "apply" the weights by duplicating observations.
    expected = np.median(np.repeat(samples, counts))
    observed = _averaged_weighted_percentile(samples, counts)

    assert expected == approx(observed)


def test_weighted_percentile_2d(global_random_seed):
    """Check 2D input is handled column by column for 1D and 2D weights."""
    random_state = np.random.RandomState(global_random_seed)
    first_column = random_state.randint(10, size=10)
    first_weights = random_state.choice(5, size=10)
    second_column = random_state.randint(20, size=10)

    stacked_rows = np.vstack((first_column, second_column))
    x_2d = stacked_rows.T

    # Case 1: 2D array with 1D sample weights shared by every column.
    observed = _weighted_percentile(x_2d, first_weights)
    expected = [_weighted_percentile(column, first_weights) for column in x_2d.T]
    assert_allclose(observed, expected)

    # Case 2: 2D array with 2D sample weights (one weight column per data column).
    second_weights = random_state.choice(5, size=10)
    w_2d = np.vstack((first_weights, second_weights)).T

    observed = _weighted_percentile(x_2d, w_2d)
    expected = [
        _weighted_percentile(column, column_weights)
        for column, column_weights in zip(x_2d.T, w_2d.T)
    ]
    assert_allclose(observed, expected)


@pytest.mark.parametrize(
    "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize(
    "data, weights, percentile",
    [
        # NumPy scalars input (handled as 0D arrays on array API)
        (np.float32(42), np.int32(1), 50),
        # Random 1D array, constant weights
        (lambda rng: rng.rand(50), np.ones(50).astype(np.int32), 50),
        # Random 2D array and random 1D weights
        (lambda rng: rng.rand(50, 3), lambda rng: rng.rand(50).astype(np.float32), 75),
        # Random 2D array and random 2D weights
        (
            lambda rng: rng.rand(20, 3),
            lambda rng: rng.rand(20, 3).astype(np.float32),
            25,
        ),
        # zero-weights and `rank_percentile=0` (#20528) (`sample_weight` dtype: int64)
        (np.array([0, 1, 2, 3, 4, 5]), np.array([0, 0, 1, 1, 1, 0]), 0),
        # np.nan's in data and some zero-weights (`sample_weight` dtype: int64)
        (np.array([np.nan, np.nan, 0, 3, 4, 5]), np.array([0, 1, 1, 1, 1, 0]), 0),
        # `sample_weight` dtype: int32
        (
            np.array([0, 1, 2, 3, 4, 5]),
            np.array([0, 1, 1, 1, 1, 0], dtype=np.int32),
            25,
        ),
    ],
)
def test_weighted_percentile_array_api_consistency(
    global_random_seed, array_namespace, device, dtype_name, data, weights, percentile
):
    """Check `_weighted_percentile` gives consistent results with array API.

    A reference result is first computed from the NumPy inputs. The same inputs
    are then converted with `xp.asarray(..., device=device)` and the computation
    is repeated under `config_context(array_api_dispatch=True)`. The array API
    result must live on the same device and namespace as its input and, once
    converted back to NumPy, must match the reference in dtype, shape and values.

    `data` and `weights` are either ready-made NumPy values or callables that
    take the test's `RandomState` and return the value to use. `data` is cast to
    `dtype_name` before any computation.
    """
    if array_namespace == "array_api_strict":
        try:
            import array_api_strict
        except ImportError:
            # Package not importable: there is no device to compare against here.
            pass
        else:
            if device == array_api_strict.Device("device1"):
                # See https://github.com/data-apis/array-api-strict/issues/134
                pytest.xfail(
                    "array_api_strict has bug when indexing with tuple of arrays "
                    "on non-'CPU_DEVICE' devices."
                )

    xp = _array_api_for_tests(array_namespace, device)

    # Skip test for percentile=0 edge case (#20528) on namespace/device where
    # xp.nextafter is broken. This is the case for torch with MPS device:
    # https://github.com/pytorch/pytorch/issues/150027
    zero = xp.zeros(1, device=device)
    one = xp.ones(1, device=device)
    # `nextafter(0, 1)` returning exactly 0 is taken as the sign of a broken
    # implementation.
    if percentile == 0 and xp.all(xp.nextafter(zero, one) == zero):
        pytest.xfail(f"xp.nextafter is broken on {device}")

    rng = np.random.RandomState(global_random_seed)
    # Materialize the parametrized inputs; callables draw from the seeded `rng`.
    X_np = data(rng) if callable(data) else data
    weights_np = weights(rng) if callable(weights) else weights
    # Ensure `data` of correct dtype
    X_np = X_np.astype(dtype_name)

    # Reference result, computed outside of the array API dispatch context.
    result_np = _weighted_percentile(X_np, weights_np, percentile)
    # Convert to Array API arrays
    X_xp = xp.asarray(X_np, device=device)
    weights_xp = xp.asarray(weights_np, device=device)

    with config_context(array_api_dispatch=True):
        result_xp = _weighted_percentile(X_xp, weights_xp, percentile)
        # The output must stay on the input's device and in the input's namespace.
        assert array_device(result_xp) == array_device(X_xp)
        assert get_namespace(result_xp)[0] == get_namespace(X_xp)[0]
        result_xp_np = _convert_to_numpy(result_xp, xp=xp)

    assert result_xp_np.dtype == result_np.dtype
    assert result_xp_np.shape == result_np.shape
    assert_allclose(result_np, result_xp_np)

    # Check dtype correct (`sample_weight` should follow `array`)
    if dtype_name == "float32":
        assert result_xp_np.dtype == result_np.dtype == np.float32
    else:
        assert result_xp_np.dtype == np.float64


@pytest.mark.parametrize("sample_weight_ndim", [1, 2])
def test_weighted_percentile_nan_filtered(sample_weight_ndim, global_random_seed):
    """Check NaN entries are handled as if they had been filtered out beforehand.

    Calling `_weighted_percentile` on an array containing NaN values must give,
    for every column, the same result as calling it on that column with the NaN
    entries (and their weights) removed. Both one-dimensional `sample_weight`
    and `sample_weight` with the same shape as the data are covered.
    """
    percentile_rank = 30
    rng = np.random.RandomState(global_random_seed)
    array_with_nans = rng.rand(100, 10)
    array_with_nans[rng.rand(*array_with_nans.shape) < 0.5] = np.nan
    nan_mask = np.isnan(array_with_nans)
    n_rows, n_cols = array_with_nans.shape

    weight_shape = (n_rows, n_cols) if sample_weight_ndim == 2 else (n_rows,)
    sample_weight = rng.randint(1, 6, size=weight_shape)

    # Weighted percentile computed directly on the array with NaNs.
    results = _weighted_percentile(array_with_nans, sample_weight, percentile_rank)

    if sample_weight.ndim == 1:
        # Expand the shared per-sample weights so each column has its own copy.
        sample_weight = np.repeat(sample_weight, n_cols).reshape(n_rows, n_cols)

    # Weighted percentile computed column by column on the NaN-free entries.
    per_column = []
    for col in range(n_cols):
        keep = ~nan_mask[:, col]
        per_column.append(
            _weighted_percentile(
                array_with_nans[keep, col], sample_weight[keep, col], percentile_rank
            )
        )
    expected_results = np.array(per_column)

    assert_array_equal(expected_results, results)


def test_weighted_percentile_all_nan_column():
    """Check NaNs are skipped, unless a column contains nothing but NaNs."""
    nan = np.nan
    all_nan_column = np.full(6, nan)
    mostly_nan_column = np.array([5, 1, nan, nan, 2, nan])
    array = np.column_stack((all_nan_column, mostly_nan_column))
    weights = np.ones(array.shape)

    values = _weighted_percentile(array, weights, 90)

    # The first column has no usable value, so its percentile can only be NaN.
    # The second column must yield `5` despite the many NaN values it contains.
    expected = np.array([nan, 5])
    assert np.array_equal(values, expected, equal_nan=True)


@pytest.mark.skipif(
    np_version < parse_version("2.0"),
    reason="np.quantile only accepts weights since version 2.0",
)
@pytest.mark.parametrize("percentile", [66, 10, 50])
def test_weighted_percentile_like_numpy_quantile(percentile, global_random_seed):
    """Check `_weighted_percentile` agrees with weighted `np.quantile`.

    NumPy is called column-wise (`axis=0`) with `method="inverted_cdf"` and the
    percentile rank rescaled to a quantile in [0, 1].
    """
    shape = (10, 100)
    rng = np.random.RandomState(global_random_seed)
    array = rng.rand(*shape)
    sample_weight = rng.randint(1, 6, size=shape)

    expected = np.quantile(
        array, percentile / 100, weights=sample_weight, axis=0, method="inverted_cdf"
    )
    observed = _weighted_percentile(array, sample_weight, percentile)

    assert_array_equal(observed, expected)


@pytest.mark.skipif(
    np_version < parse_version("2.0"),
    reason="np.nanquantile only accepts weights since version 2.0",
)
@pytest.mark.parametrize("percentile", [66, 10, 50])
def test_weighted_percentile_like_numpy_nanquantile(percentile, global_random_seed):
    """Check `_weighted_percentile` agrees with weighted `np.nanquantile`.

    Entries of the random input are replaced by NaN wherever a second random
    draw falls below 0.5. NumPy is called column-wise (`axis=0`) with
    `method="inverted_cdf"`.
    """
    shape = (10, 100)
    rng = np.random.RandomState(global_random_seed)
    array_with_nans = rng.rand(*shape)
    drop = rng.rand(*array_with_nans.shape) < 0.5
    array_with_nans[drop] = np.nan
    sample_weight = rng.randint(1, 6, size=shape)

    nanquantile_kwargs = {
        "weights": sample_weight,
        "axis": 0,
        "method": "inverted_cdf",
    }
    expected = np.nanquantile(array_with_nans, percentile / 100, **nanquantile_kwargs)
    observed = _weighted_percentile(array_with_nans, sample_weight, percentile)

    assert_array_equal(observed, expected)