File: plot_cv_predict.py

package info (click to toggle)
scikit-learn 1.4.2%2Bdfsg-8
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 25,036 kB
  • sloc: python: 201,105; cpp: 5,790; ansic: 854; makefile: 304; sh: 56; javascript: 20
file content (79 lines) | stat: -rw-r--r-- 2,551 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
====================================
Plotting Cross-Validated Predictions
====================================

This example shows how to use
:func:`~sklearn.model_selection.cross_val_predict` together with
:class:`~sklearn.metrics.PredictionErrorDisplay` to visualize prediction
errors.
"""

# %%
# We will load the diabetes dataset and create an instance of a linear
# regression model.
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

# return_X_y=True returns the (data, target) pair, unpacked into X and y.
X, y = load_diabetes(return_X_y=True)
# Default-parameter estimator; it is left unfitted at this point.
lr = LinearRegression()

# %%
# :func:`~sklearn.model_selection.cross_val_predict` returns an array of the
# same size as `y` where each entry is a prediction obtained by cross
# validation.
from sklearn.model_selection import cross_val_predict

# Collect one out-of-fold prediction per sample, using 10 folds (cv=10).
y_pred = cross_val_predict(lr, X, y, cv=10)

# %%
# Since `cv=10`, it means that we trained 10 models and each model was
# used to predict on one of the 10 folds. We can now use the
# :class:`~sklearn.metrics.PredictionErrorDisplay` to visualize the
# prediction errors.
#
# On the left axis, we plot the observed values :math:`y` vs. the predicted
# values :math:`\hat{y}` given by the models. On the right axis, we plot the
# residuals (i.e. the difference between the observed values and the predicted
# values) vs. the predicted values.
import matplotlib.pyplot as plt

from sklearn.metrics import PredictionErrorDisplay

# Each panel pairs a display kind with the title drawn above its axis:
# left panel first, right panel second.
panels = (
    ("actual_vs_predicted", "Actual vs. Predicted values"),
    ("residual_vs_predicted", "Residuals vs. Predicted Values"),
)

fig, axs = plt.subplots(ncols=2, figsize=(8, 4))
for ax, (kind, title) in zip(axs, panels):
    # Same data, subsample size and seed for both panels; only the kind of
    # display and the target axis differ.
    PredictionErrorDisplay.from_predictions(
        y,
        y_pred=y_pred,
        kind=kind,
        subsample=100,
        ax=ax,
        random_state=0,
    )
    ax.set_title(title)

fig.suptitle("Plotting cross-validated predictions")
plt.tight_layout()
plt.show()

# %%
# It is important to note that we used
# :func:`~sklearn.model_selection.cross_val_predict` for visualization
# purposes only in this example.
#
# It would be problematic to
# quantitatively assess the model performance by computing a single
# performance metric from the concatenated predictions returned by
# :func:`~sklearn.model_selection.cross_val_predict`
# when the different CV folds vary in size and distribution.
#
# It is recommended to compute per-fold performance metrics using:
# :func:`~sklearn.model_selection.cross_val_score` or
# :func:`~sklearn.model_selection.cross_validate` instead.