File: plot_lasso_dense_vs_sparse_data.py

package info (click to toggle)
scikit-learn 1.7.2%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 25,748 kB
  • sloc: python: 219,120; cpp: 5,790; ansic: 846; makefile: 189; javascript: 110
file content (89 lines) | stat: -rw-r--r-- 2,906 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""
==============================
Lasso on dense and sparse data
==============================

We show that linear_model.Lasso provides the same results for dense and sparse
data and that in the case of sparse data the speed is improved.

"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from time import time

from scipy import linalg, sparse

from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso

# %%
# Comparing the two Lasso implementations on Dense data
# -----------------------------------------------------
#
# We create a linear regression problem that is suitable for the Lasso,
# that is to say, with more features than samples. We then store the data
# matrix in both dense (the usual) and sparse format, and train a Lasso on
# each. We compute the runtime of both and check that they learned the
# same model by computing the Euclidean norm of the difference between the
# coefficients they learned. Because the data is dense, we expect better
# runtime with a dense data format.

X, y = make_regression(n_samples=200, n_features=5000, random_state=0)
# create a copy of X in sparse format
X_sp = sparse.coo_matrix(X)

alpha = 1
sparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)
dense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)

t0 = time()
sparse_lasso.fit(X_sp, y)
print(f"Sparse Lasso done in {(time() - t0):.3f}s")

t0 = time()
dense_lasso.fit(X, y)
print(f"Dense Lasso done in {(time() - t0):.3f}s")

# compare the regression coefficients
coeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
print(f"Distance between coefficients : {coeff_diff:.2e}")

#
# %%
# Comparing the two Lasso implementations on Sparse data
# ------------------------------------------------------
#
# We make the previous problem sparse by replacing all small values with 0
# and run the same comparisons as above. Because the data is now sparse, we
# expect the implementation that uses the sparse data format to be faster.

# make a copy of the previous data
Xs = X.copy()
# make Xs sparse by replacing the values lower than 2.5 with 0s
Xs[Xs < 2.5] = 0.0
# create a copy of Xs in sparse format
Xs_sp = sparse.coo_matrix(Xs)
Xs_sp = Xs_sp.tocsc()

# compute the proportion of non-zero coefficient in the data matrix
print(f"Matrix density : {(Xs_sp.nnz / float(X.size) * 100):.3f}%")

alpha = 0.1
sparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)
dense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=10000)

t0 = time()
sparse_lasso.fit(Xs_sp, y)
print(f"Sparse Lasso done in {(time() - t0):.3f}s")

t0 = time()
dense_lasso.fit(Xs, y)
print(f"Dense Lasso done in  {(time() - t0):.3f}s")

# compare the regression coefficients
coeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
print(f"Distance between coefficients : {coeff_diff:.2e}")

# %%