1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294
|
"""
==========================
Model Complexity Influence
==========================
Demonstrate how model complexity influences both prediction accuracy and
computational performance.
We will be using two datasets:
- :ref:`diabetes_dataset` for regression.
This dataset consists of 10 measurements taken from diabetes patients.
The task is to predict disease progression;
- :ref:`20newsgroups_dataset` for classification. This dataset consists of
newsgroup posts. The task is to predict on which topic (out of 20 topics)
the post is written about.
We will model the complexity influence on three different estimators:
- :class:`~sklearn.linear_model.SGDClassifier` (for classification data)
which implements stochastic gradient descent learning;
- :class:`~sklearn.svm.NuSVR` (for regression data) which implements
Nu support vector regression;
- :class:`~sklearn.ensemble.GradientBoostingRegressor` builds an additive
model in a forward stage-wise fashion. Notice that
:class:`~sklearn.ensemble.HistGradientBoostingRegressor` is much faster
than :class:`~sklearn.ensemble.GradientBoostingRegressor` starting with
intermediate datasets (`n_samples >= 10_000`), which is not the case for
this example.
We make the model complexity vary through the choice of relevant model
parameters in each of our selected models. Next, we will measure the influence
on both computational performance (latency) and predictive power (MSE or
Hamming Loss).
"""
# Authors: Eustache Diemert <eustache@diemert.fr>
# Maria Telenczuk <https://github.com/maikia>
# Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: BSD 3 clause
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import hamming_loss, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import NuSVR
# Initialize random generator
np.random.seed(0)
##############################################################################
# Load the data
# -------------
#
# First we load both datasets.
#
# .. note:: We are using
# :func:`~sklearn.datasets.fetch_20newsgroups_vectorized` to download 20
# newsgroups dataset. It returns ready-to-use features.
#
# .. note:: ``X`` of the 20 newsgroups dataset is a sparse matrix while ``X``
# of diabetes dataset is a numpy array.
#
def generate_data(case):
"""Generate regression/classification data."""
if case == "regression":
X, y = datasets.load_diabetes(return_X_y=True)
train_size = 0.8
elif case == "classification":
X, y = datasets.fetch_20newsgroups_vectorized(subset="all", return_X_y=True)
train_size = 0.4 # to make the example run faster
X_train, X_test, y_train, y_test = train_test_split(
X, y, train_size=train_size, random_state=0
)
data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
return data
regression_data = generate_data("regression")
classification_data = generate_data("classification")
##############################################################################
# Benchmark influence
# -------------------
# Next, we can calculate the influence of the parameters on the given
# estimator. In each round, we will set the estimator with the new value of
# ``changing_param`` and we will be collecting the prediction times, prediction
# performance and complexities to see how those changes affect the estimator.
# We will calculate the complexity using ``complexity_computer`` passed as a
# parameter.
#
def benchmark_influence(conf):
"""
Benchmark influence of `changing_param` on both MSE and latency.
"""
prediction_times = []
prediction_powers = []
complexities = []
for param_value in conf["changing_param_values"]:
conf["tuned_params"][conf["changing_param"]] = param_value
estimator = conf["estimator"](**conf["tuned_params"])
print("Benchmarking %s" % estimator)
estimator.fit(conf["data"]["X_train"], conf["data"]["y_train"])
conf["postfit_hook"](estimator)
complexity = conf["complexity_computer"](estimator)
complexities.append(complexity)
start_time = time.time()
for _ in range(conf["n_samples"]):
y_pred = estimator.predict(conf["data"]["X_test"])
elapsed_time = (time.time() - start_time) / float(conf["n_samples"])
prediction_times.append(elapsed_time)
pred_score = conf["prediction_performance_computer"](
conf["data"]["y_test"], y_pred
)
prediction_powers.append(pred_score)
print(
"Complexity: %d | %s: %.4f | Pred. Time: %fs\n"
% (
complexity,
conf["prediction_performance_label"],
pred_score,
elapsed_time,
)
)
return prediction_powers, prediction_times, complexities
##############################################################################
# Choose parameters
# -----------------
#
# We choose the parameters for each of our estimators by making
# a dictionary with all the necessary values.
# ``changing_param`` is the name of the parameter which will vary in each
# estimator.
# Complexity will be defined by the ``complexity_label`` and calculated using
# `complexity_computer`.
# Also note that depending on the estimator type we are passing
# different data.
#
def _count_nonzero_coefficients(estimator):
a = estimator.coef_.toarray()
return np.count_nonzero(a)
configurations = [
{
"estimator": SGDClassifier,
"tuned_params": {
"penalty": "elasticnet",
"alpha": 0.001,
"loss": "modified_huber",
"fit_intercept": True,
"tol": 1e-1,
"n_iter_no_change": 2,
},
"changing_param": "l1_ratio",
"changing_param_values": [0.25, 0.5, 0.75, 0.9],
"complexity_label": "non_zero coefficients",
"complexity_computer": _count_nonzero_coefficients,
"prediction_performance_computer": hamming_loss,
"prediction_performance_label": "Hamming Loss (Misclassification Ratio)",
"postfit_hook": lambda x: x.sparsify(),
"data": classification_data,
"n_samples": 5,
},
{
"estimator": NuSVR,
"tuned_params": {"C": 1e3, "gamma": 2**-15},
"changing_param": "nu",
"changing_param_values": [0.05, 0.1, 0.2, 0.35, 0.5],
"complexity_label": "n_support_vectors",
"complexity_computer": lambda x: len(x.support_vectors_),
"data": regression_data,
"postfit_hook": lambda x: x,
"prediction_performance_computer": mean_squared_error,
"prediction_performance_label": "MSE",
"n_samples": 15,
},
{
"estimator": GradientBoostingRegressor,
"tuned_params": {
"loss": "squared_error",
"learning_rate": 0.05,
"max_depth": 2,
},
"changing_param": "n_estimators",
"changing_param_values": [10, 25, 50, 75, 100],
"complexity_label": "n_trees",
"complexity_computer": lambda x: x.n_estimators,
"data": regression_data,
"postfit_hook": lambda x: x,
"prediction_performance_computer": mean_squared_error,
"prediction_performance_label": "MSE",
"n_samples": 15,
},
]
##############################################################################
# Run the code and plot the results
# ---------------------------------
#
# We defined all the functions required to run our benchmark. Now, we will loop
# over the different configurations that we defined previously. Subsequently,
# we can analyze the plots obtained from the benchmark:
# Relaxing the `L1` penalty in the SGD classifier reduces the prediction error
# but leads to an increase in the training time.
# We can draw a similar analysis regarding the training time which increases
# with the number of support vectors with a Nu-SVR. However, we observed that
# there is an optimal number of support vectors which reduces the prediction
# error. Indeed, too few support vectors lead to an under-fitted model while
# too many support vectors lead to an over-fitted model.
# The exact same conclusion can be drawn for the gradient-boosting model. The
# only the difference with the Nu-SVR is that having too many trees in the
# ensemble is not as detrimental.
#
def plot_influence(conf, mse_values, prediction_times, complexities):
"""
Plot influence of model complexity on both accuracy and latency.
"""
fig = plt.figure()
fig.subplots_adjust(right=0.75)
# first axes (prediction error)
ax1 = fig.add_subplot(111)
line1 = ax1.plot(complexities, mse_values, c="tab:blue", ls="-")[0]
ax1.set_xlabel("Model Complexity (%s)" % conf["complexity_label"])
y1_label = conf["prediction_performance_label"]
ax1.set_ylabel(y1_label)
ax1.spines["left"].set_color(line1.get_color())
ax1.yaxis.label.set_color(line1.get_color())
ax1.tick_params(axis="y", colors=line1.get_color())
# second axes (latency)
ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)
line2 = ax2.plot(complexities, prediction_times, c="tab:orange", ls="-")[0]
ax2.yaxis.tick_right()
ax2.yaxis.set_label_position("right")
y2_label = "Time (s)"
ax2.set_ylabel(y2_label)
ax1.spines["right"].set_color(line2.get_color())
ax2.yaxis.label.set_color(line2.get_color())
ax2.tick_params(axis="y", colors=line2.get_color())
plt.legend(
(line1, line2), ("prediction error", "prediction latency"), loc="upper center"
)
plt.title(
"Influence of varying '%s' on %s"
% (conf["changing_param"], conf["estimator"].__name__)
)
for conf in configurations:
prediction_performances, prediction_times, complexities = benchmark_influence(conf)
plot_influence(conf, prediction_performances, prediction_times, complexities)
plt.show()
##############################################################################
# Conclusion
# ----------
#
# As a conclusion, we can deduce the following insights:
#
# * a model which is more complex (or expressive) will require a larger
# training time;
# * a more complex model does not guarantee to reduce the prediction error.
#
# These aspects are related to model generalization and avoiding model
# under-fitting or over-fitting.
|