File: test_bounds.py (scikit-learn 1.2.1+dfsg-1)
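
"""Tests for sklearn.svm._bounds (l1_min_c) and the `newrand` RNG wrappers."""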

import numpy as np
from scipy import sparse as sp
from scipy import stats

import pytest

from sklearn.svm._bounds import l1_min_c
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap


# A tiny 4-sample, 2-feature design matrix, in dense and sparse (CSR) form.
dense_X = [[-1, 0], [0, 1], [1, 1], [1, 1]]
sparse_X = sp.csr_matrix(dense_X)

Y1 = [0, 1, 1, 1]  # binary target
Y2 = [2, 1, 0, 0]  # three-class target


@pytest.mark.parametrize("loss", ["squared_hinge", "log"])
@pytest.mark.parametrize("X_label", ["sparse", "dense"])
@pytest.mark.parametrize("Y_label", ["two-classes", "multi-class"])
@pytest.mark.parametrize("intercept_label", ["no-intercept", "fit-intercept"])
def test_l1_min_c(loss, X_label, Y_label, intercept_label):
    Xs = {"sparse": sparse_X, "dense": dense_X}
    Ys = {"two-classes": Y1, "multi-class": Y2}
    intercepts = {
        "no-intercept": {"fit_intercept": False},
        "fit-intercept": {"fit_intercept": True, "intercept_scaling": 10},
    }

    X = Xs[X_label]
    Y = Ys[Y_label]
    intercept_params = intercepts[intercept_label]
    check_l1_min_c(X, Y, loss, **intercept_params)


def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=1.0):
    min_c = l1_min_c(
        X,
        y,
        loss=loss,
        fit_intercept=fit_intercept,
        intercept_scaling=intercept_scaling,
    )

    clf = {
        "log": LogisticRegression(penalty="l1", solver="liblinear"),
        "squared_hinge": LinearSVC(loss="squared_hinge", penalty="l1", dual=False),
    }[loss]

    clf.fit_intercept = fit_intercept
    clf.intercept_scaling = intercept_scaling

    clf.C = min_c
    clf.fit(X, y)
    assert (np.asarray(clf.coef_) == 0).all()
    assert (np.asarray(clf.intercept_) == 0).all()

    clf.C = min_c * 1.01
    clf.fit(X, y)
    assert (np.asarray(clf.coef_) != 0).any() or (np.asarray(clf.intercept_) != 0).any()
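

# A minimal usage sketch (illustrative, not a test): `l1_min_c` is typically
# used to anchor a log-spaced regularization path, so the smallest C in the
# grid sits exactly at the all-zero-coefficients boundary exercised by
# `check_l1_min_c` above. The grid shape here is an arbitrary choice.
def _l1_path_sketch():
    cs = l1_min_c(dense_X, Y1, loss="log") * np.logspace(0, 3, 10)
    return [
        LogisticRegression(penalty="l1", solver="liblinear", C=c).fit(dense_X, Y1)
        for c in cs
    ]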


def test_ill_posed_min_c():
    # With an all-zero X, no coefficient can ever become active, so the
    # l1_min_c computation has no finite answer and must raise.
    X = [[0, 0], [0, 0]]
    y = [0, 1]
    with pytest.raises(ValueError):
        l1_min_c(X, y)


_MAX_UNSIGNED_INT = 4294967295  # 2**32 - 1, the largest 32-bit unsigned int


@pytest.mark.parametrize("seed, val", [(None, 81), (0, 54), (_MAX_UNSIGNED_INT, 9)])
def test_newrand_set_seed(seed, val):
    """Test that `set_seed` produces deterministic results"""
    if seed is not None:
        set_seed_wrap(seed)
    x = bounded_rand_int_wrap(100)
    assert x == val, f"Expected {val} but got {x} instead"


@pytest.mark.parametrize("seed", [-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_set_seed_overflow(seed):
    """Test that `set_seed_wrap` is defined for unsigned 32bits ints"""
    with pytest.raises(OverflowError):
        set_seed_wrap(seed)


@pytest.mark.parametrize("range_, n_pts", [(_MAX_UNSIGNED_INT, 10000), (100, 25)])
def test_newrand_bounded_rand_int(range_, n_pts):
    """Test that `bounded_rand_int` follows a uniform distribution"""
    n_iter = 100
    ks_pvals = []
    uniform_dist = stats.uniform(loc=0, scale=range_)
    # perform multiple samplings so that the chance of an outlying sample is negligible
    for _ in range(n_iter):
        # Deterministic random sampling
        sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)]
        res = stats.kstest(sample, uniform_dist.cdf)
        ks_pvals.append(res.pvalue)
    # Null hypothesis = samples come from a uniform distribution.
    # Under the null hypothesis, p-values should be uniformly distributed
    # and not concentrated on low values
    # (this may seem counter-intuitive but is backed by multiple refs;
    # see also the sketch after this test).
    # So we can do two checks:

    # (1) check uniformity of p-values
    uniform_p_vals_dist = stats.uniform(loc=0, scale=1)
    res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf)
    assert res_pvals.pvalue > 0.05, (
        "Null hypothesis rejected: generated random numbers are not uniform."
        " Details: the (meta) p-value of the test of uniform distribution"
        f" of p-values is {res_pvals.pvalue} which is not > 0.05"
    )

    # (2) (safety belt) check that 90% of p-values are above 0.05
    min_10pct_pval = np.percentile(ks_pvals, q=10)
    # lower 10th quantile pvalue <= 0.05 means that the test rejects the
    # null hypothesis that the sample came from the uniform distribution
    assert min_10pct_pval > 0.05, (
        "Null hypothesis rejected: generated random numbers are not uniform. "
        f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05."
    )
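

# A minimal sketch (not a test) of the claim above: when samples really do
# come from the hypothesized distribution, KS p-values collected over
# repeated experiments are themselves uniform on [0, 1]. This uses numpy's
# own RNG rather than `newrand`, and the sizes are arbitrary choices.
def _pvalue_uniformity_sketch(n_iter=1000, n_pts=100, seed=0):
    rng = np.random.default_rng(seed)
    unit_uniform = stats.uniform(loc=0, scale=1)
    pvals = [
        stats.kstest(rng.uniform(size=n_pts), unit_uniform.cdf).pvalue
        for _ in range(n_iter)
    ]
    # Testing the collected p-values for uniformity should itself
    # produce a large (non-rejecting) p-value.
    return stats.kstest(pvals, unit_uniform.cdf).pvalue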


@pytest.mark.parametrize("range_", [-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_bounded_rand_int_limits(range_):
    """Test that `bounded_rand_int_wrap` is defined for unsigned 32bits ints"""
    with pytest.raises(OverflowError):
        bounded_rand_int_wrap(range_)