1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
|
# Authors: Manoj Kumar mks542@nyu.edu
# License: BSD 3 clause
import numpy as np
from scipy import optimize, sparse
import pytest
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_false
from sklearn.datasets import make_regression
from sklearn.linear_model import (
HuberRegressor, LinearRegression, SGDRegressor, Ridge)
from sklearn.linear_model.huber import _huber_loss_and_gradient
def make_regression_with_outliers(n_samples=50, n_features=20):
rng = np.random.RandomState(0)
# Generate data with outliers by replacing 10% of the samples with noise.
X, y = make_regression(
n_samples=n_samples, n_features=n_features,
random_state=0, noise=0.05)
# Replace 10% of the sample with noise.
num_noise = int(0.1 * n_samples)
random_samples = rng.randint(0, n_samples, num_noise)
X[random_samples, :] = 2.0 * rng.normal(0, 1, (num_noise, X.shape[1]))
return X, y
def test_huber_equals_lr_for_high_epsilon():
# Test that Ridge matches LinearRegression for large epsilon
X, y = make_regression_with_outliers()
lr = LinearRegression(fit_intercept=True)
lr.fit(X, y)
huber = HuberRegressor(fit_intercept=True, epsilon=1e3, alpha=0.0)
huber.fit(X, y)
assert_almost_equal(huber.coef_, lr.coef_, 3)
assert_almost_equal(huber.intercept_, lr.intercept_, 2)
def test_huber_max_iter():
X, y = make_regression_with_outliers()
huber = HuberRegressor(max_iter=1)
huber.fit(X, y)
assert huber.n_iter_ == huber.max_iter
def test_huber_gradient():
# Test that the gradient calculated by _huber_loss_and_gradient is correct
rng = np.random.RandomState(1)
X, y = make_regression_with_outliers()
sample_weight = rng.randint(1, 3, (y.shape[0]))
loss_func = lambda x, *args: _huber_loss_and_gradient(x, *args)[0]
grad_func = lambda x, *args: _huber_loss_and_gradient(x, *args)[1]
# Check using optimize.check_grad that the gradients are equal.
for _ in range(5):
# Check for both fit_intercept and otherwise.
for n_features in [X.shape[1] + 1, X.shape[1] + 2]:
w = rng.randn(n_features)
w[-1] = np.abs(w[-1])
grad_same = optimize.check_grad(
loss_func, grad_func, w, X, y, 0.01, 0.1, sample_weight)
assert_almost_equal(grad_same, 1e-6, 4)
def test_huber_sample_weights():
# Test sample_weights implementation in HuberRegressor"""
X, y = make_regression_with_outliers()
huber = HuberRegressor(fit_intercept=True)
huber.fit(X, y)
huber_coef = huber.coef_
huber_intercept = huber.intercept_
# Rescale coefs before comparing with assert_array_almost_equal to make sure
# that the number of decimal places used is somewhat insensitive to the
# amplitude of the coefficients and therefore to the scale of the data
# and the regularization parameter
scale = max(np.mean(np.abs(huber.coef_)),
np.mean(np.abs(huber.intercept_)))
huber.fit(X, y, sample_weight=np.ones(y.shape[0]))
assert_array_almost_equal(huber.coef_ / scale, huber_coef / scale)
assert_array_almost_equal(huber.intercept_ / scale,
huber_intercept / scale)
X, y = make_regression_with_outliers(n_samples=5, n_features=20)
X_new = np.vstack((X, np.vstack((X[1], X[1], X[3]))))
y_new = np.concatenate((y, [y[1]], [y[1]], [y[3]]))
huber.fit(X_new, y_new)
huber_coef = huber.coef_
huber_intercept = huber.intercept_
sample_weight = np.ones(X.shape[0])
sample_weight[1] = 3
sample_weight[3] = 2
huber.fit(X, y, sample_weight=sample_weight)
assert_array_almost_equal(huber.coef_ / scale, huber_coef / scale)
assert_array_almost_equal(huber.intercept_ / scale,
huber_intercept / scale)
# Test sparse implementation with sample weights.
X_csr = sparse.csr_matrix(X)
huber_sparse = HuberRegressor(fit_intercept=True)
huber_sparse.fit(X_csr, y, sample_weight=sample_weight)
assert_array_almost_equal(huber_sparse.coef_ / scale,
huber_coef / scale)
def test_huber_sparse():
X, y = make_regression_with_outliers()
huber = HuberRegressor(fit_intercept=True, alpha=0.1)
huber.fit(X, y)
X_csr = sparse.csr_matrix(X)
huber_sparse = HuberRegressor(fit_intercept=True, alpha=0.1)
huber_sparse.fit(X_csr, y)
assert_array_almost_equal(huber_sparse.coef_, huber.coef_)
assert_array_equal(huber.outliers_, huber_sparse.outliers_)
def test_huber_scaling_invariant():
# Test that outliers filtering is scaling independent.
X, y = make_regression_with_outliers()
huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100)
huber.fit(X, y)
n_outliers_mask_1 = huber.outliers_
assert_false(np.all(n_outliers_mask_1))
huber.fit(X, 2. * y)
n_outliers_mask_2 = huber.outliers_
assert_array_equal(n_outliers_mask_2, n_outliers_mask_1)
huber.fit(2. * X, 2. * y)
n_outliers_mask_3 = huber.outliers_
assert_array_equal(n_outliers_mask_3, n_outliers_mask_1)
# 0.23. warning about tol not having its correct default value.
@pytest.mark.filterwarnings('ignore:max_iter and tol parameters have been')
def test_huber_and_sgd_same_results():
# Test they should converge to same coefficients for same parameters
X, y = make_regression_with_outliers(n_samples=10, n_features=2)
# Fit once to find out the scale parameter. Scale down X and y by scale
# so that the scale parameter is optimized to 1.0
huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100,
epsilon=1.35)
huber.fit(X, y)
X_scale = X / huber.scale_
y_scale = y / huber.scale_
huber.fit(X_scale, y_scale)
assert_almost_equal(huber.scale_, 1.0, 3)
sgdreg = SGDRegressor(
alpha=0.0, loss="huber", shuffle=True, random_state=0, max_iter=10000,
fit_intercept=False, epsilon=1.35, tol=None)
sgdreg.fit(X_scale, y_scale)
assert_array_almost_equal(huber.coef_, sgdreg.coef_, 1)
def test_huber_warm_start():
X, y = make_regression_with_outliers()
huber_warm = HuberRegressor(
fit_intercept=True, alpha=1.0, max_iter=10000, warm_start=True, tol=1e-1)
huber_warm.fit(X, y)
huber_warm_coef = huber_warm.coef_.copy()
huber_warm.fit(X, y)
# SciPy performs the tol check after doing the coef updates, so
# these would be almost same but not equal.
assert_array_almost_equal(huber_warm.coef_, huber_warm_coef, 1)
assert huber_warm.n_iter_ == 0
def test_huber_better_r2_score():
# Test that huber returns a better r2 score than non-outliers"""
X, y = make_regression_with_outliers()
huber = HuberRegressor(fit_intercept=True, alpha=0.01, max_iter=100)
huber.fit(X, y)
linear_loss = np.dot(X, huber.coef_) + huber.intercept_ - y
mask = np.abs(linear_loss) < huber.epsilon * huber.scale_
huber_score = huber.score(X[mask], y[mask])
huber_outlier_score = huber.score(X[~mask], y[~mask])
# The Ridge regressor should be influenced by the outliers and hence
# give a worse score on the non-outliers as compared to the huber regressor.
ridge = Ridge(fit_intercept=True, alpha=0.01)
ridge.fit(X, y)
ridge_score = ridge.score(X[mask], y[mask])
ridge_outlier_score = ridge.score(X[~mask], y[~mask])
assert_greater(huber_score, ridge_score)
# The huber model should also fit poorly on the outliers.
assert_greater(ridge_outlier_score, huber_outlier_score)
|