"""
===============================================================
Visualizing the probabilistic predictions of a VotingClassifier
===============================================================
.. currentmodule:: sklearn
Plot the predicted class probabilities in a toy dataset predicted by three
different classifiers and averaged by the :class:`~ensemble.VotingClassifier`.
First, three linear classifiers are initialized. Two are spline models with
interaction terms, one using constant extrapolation and the other using periodic
extrapolation. The third classifier is a :class:`~kernel_approximation.Nystroem`
with the default "rbf" kernel.
In the first part of this example, these three classifiers are used to
demonstrate soft-voting using :class:`~ensemble.VotingClassifier` with weighted
average. We set `weights=[2, 1, 3]`, meaning the constant extrapolation spline
model's predictions are weighted twice as much as the periodic spline model's,
and the Nystroem model's predictions are weighted three times as much as the
periodic spline.
The second part demonstrates how soft predictions can be converted into hard
predictions.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# %%
# We first generate a noisy XOR dataset, which is a binary classification task.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import ListedColormap
# Build a noisy XOR problem: two uniform features on [-1, 1); the label is
# the XOR of each feature's sign after adding Gaussian noise.
n_samples = 500
rng = np.random.default_rng(0)
feature_names = ["Feature #0", "Feature #1"]
xor = pd.DataFrame(
    np.random.RandomState(0).uniform(low=-1, high=1, size=(n_samples, 2)),
    columns=feature_names,
)
noise = rng.normal(loc=0, scale=0.1, size=(n_samples, 2))
target_xor = np.logical_xor(
    xor["Feature #0"] + noise[:, 0] > 0, xor["Feature #1"] + noise[:, 1] > 0
)
X = xor[feature_names]
y = target_xor.astype(np.int32)
# Scatter-plot styling shared by every figure in this example: red/blue for
# the two classes, white edges for readability.
common_scatter_plot_params = {
    "cmap": ListedColormap(["tab:red", "tab:blue"]),
    "edgecolor": "white",
    "linewidth": 1,
}
# Show the raw training data so the XOR structure is visible before modeling.
fig, ax = plt.subplots()
ax.set_title("The XOR dataset")
ax.scatter(
    X["Feature #0"],
    X["Feature #1"],
    c=y,
    **common_scatter_plot_params,
)
plt.show()
# %%
# Due to the inherent non-linear separability of the XOR dataset, tree-based
# models would often be preferred. However, appropriate feature engineering
# combined with a linear model can yield effective results, with the added
# benefit of producing better-calibrated probabilities for samples located in
# the transition regions affected by noise.
#
# We define and fit the models on the whole dataset.
from sklearn.ensemble import VotingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer, StandardScaler
# Three pipelines that all end in the same linear classifier but differ in
# their feature engineering: constant-extrapolation splines, periodic
# splines, and an approximate RBF feature map (Nystroem).
clf1 = make_pipeline(
    SplineTransformer(degree=2, n_knots=2),
    PolynomialFeatures(interaction_only=True),
    LogisticRegression(C=10),
)
clf2 = make_pipeline(
    SplineTransformer(degree=2, n_knots=4, extrapolation="periodic", include_bias=True),
    PolynomialFeatures(interaction_only=True),
    LogisticRegression(C=10),
)
clf3 = make_pipeline(
    StandardScaler(),
    Nystroem(gamma=2, random_state=0),
    LogisticRegression(C=10),
)
# Soft-voting ensemble: predicted probabilities are averaged using these
# per-estimator weights (constant splines x2, periodic x1, Nystroem x3).
weights = [2, 1, 3]
eclf = VotingClassifier(
    estimators=[
        ("constant splines model", clf1),
        ("periodic splines model", clf2),
        ("nystroem model", clf3),
    ],
    voting="soft",
    weights=weights,
)
# Fit the standalone pipelines (for the individual panels below) as well as
# the ensemble on the full dataset.
for model in (clf1, clf2, clf3, eclf):
    model.fit(X, y)
# %%
# Finally we use :class:`~inspection.DecisionBoundaryDisplay` to plot the
# predicted probabilities. By using a diverging colormap (such as `"RdBu"`), we
# can ensure that darker colors correspond to `predict_proba` close to either 0
# or 1, and white corresponds to `predict_proba` of 0.5.
from itertools import product
from sklearn.inspection import DecisionBoundaryDisplay
# One panel per model: a pcolormesh of predict_proba overlaid with the
# training points. Panels share axes along rows and columns.
fig, axarr = plt.subplots(2, 2, sharex="col", sharey="row", figsize=(10, 8))
panel_titles = [
    "Splines with\nconstant extrapolation",
    "Splines with\nperiodic extrapolation",
    "RBF Nystroem",
    "Soft Voting",
]
for (row, col), model, title in zip(
    product([0, 1], [0, 1]), [clf1, clf2, clf3, eclf], panel_titles
):
    ax = axarr[row, col]
    disp = DecisionBoundaryDisplay.from_estimator(
        model,
        X,
        response_method="predict_proba",
        plot_method="pcolormesh",
        cmap="RdBu",
        alpha=0.8,
        ax=ax,
    )
    ax.scatter(
        X["Feature #0"],
        X["Feature #1"],
        c=y,
        **common_scatter_plot_params,
    )
    ax.set_title(title)
    fig.colorbar(disp.surface_, ax=ax, label="Probability estimate")
plt.show()
# %%
# As a sanity check, we can verify for a given sample that the probability
# predicted by the :class:`~ensemble.VotingClassifier` is indeed the weighted
# average of the individual classifiers' soft-predictions.
#
# In the case of binary classification such as in the present example, the
# :term:`predict_proba` arrays contain the probability of belonging to class 0
# (here in red) as the first entry, and the probability of belonging to class 1
# (here in blue) as the second entry.
# Probe a single sample: collect each fitted base estimator's soft
# predictions ([P(class 0), P(class 1)] for this binary task) for reuse below.
test_sample = pd.DataFrame({"Feature #0": [-0.5], "Feature #1": [1.5]})
predict_probas = [est.predict_proba(test_sample).ravel() for est in eclf.estimators_]
for est_idx, (name, _) in enumerate(eclf.estimators):
    print(f"{name}'s predicted probabilities: {predict_probas[est_idx]}")
# %%
# Reproduce the ensemble's soft prediction by hand: the weighted mean of the
# base estimators' probabilities.
weighted_average = np.dot(weights, predict_probas) / np.sum(weights)
print(f"Weighted average of soft-predictions: {weighted_average}")
# %%
# We can see that manual calculation of predicted probabilities above is
# equivalent to that produced by the `VotingClassifier`:
# The ensemble's own predict_proba should agree with the manual average.
voting_probas = eclf.predict_proba(test_sample).ravel()
print(f"Predicted probability of VotingClassifier: {voting_probas}")
# %%
# To convert soft predictions into hard predictions when weights are provided,
# the weighted average predicted probabilities are computed for each class.
# The final class label is then derived from the class with the
# highest average probability, which corresponds to the default threshold at
# `predict_proba=0.5` in the case of binary classification.
# Hard prediction by hand: take the argmax over the weighted mean of the
# per-estimator probabilities.
manual_soft_votes = np.dot(weights, predict_probas) / np.sum(weights)
print(
    "Class with the highest weighted average of soft-predictions: "
    f"{np.argmax(manual_soft_votes)}"
)
# %%
# This is equivalent to the output of `VotingClassifier`'s `predict` method:
print(f"Predicted class of VotingClassifier: {eclf.predict(test_sample).ravel()}")
# %%
# Soft votes can be thresholded as for any other probabilistic classifier. This
# allows you to set a threshold probability at which the positive class will be
# predicted, instead of simply selecting the class with the highest predicted
# probability.
from sklearn.model_selection import FixedThresholdClassifier
# Wrap the ensemble so the positive class is predicted only when its averaged
# probability exceeds 0.7, instead of the default 0.5 argmax rule.
eclf_other_threshold = FixedThresholdClassifier(
    eclf, threshold=0.7, response_method="predict_proba"
)
eclf_other_threshold.fit(X, y)
print(
    "Predicted class of thresholded VotingClassifier: "
    f"{eclf_other_threshold.predict(test_sample)}"
)