| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 
 | """
==================
Pipeline ANOVA SVM
==================
This example shows how a feature selection can be easily integrated within
a machine learning pipeline.
We also show that you can easily inspect part of the pipeline.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# %%
# We will start by generating a binary classification dataset. Subsequently, we
# will divide the dataset into two subsets.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
X, y = make_classification(
    n_features=20,
    n_informative=3,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=2,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# %%
# A common mistake done with feature selection is to search a subset of
# discriminative features on the full dataset, instead of only using the
# training set. The usage of scikit-learn :func:`~sklearn.pipeline.Pipeline`
# prevents to make such mistake.
#
# Here, we will demonstrate how to build a pipeline where the first step will
# be the feature selection.
#
# When calling `fit` on the training data, a subset of feature will be selected
# and the index of these selected features will be stored. The feature selector
# will subsequently reduce the number of features, and pass this subset to the
# classifier which will be trained.
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
anova_filter = SelectKBest(f_classif, k=3)
clf = LinearSVC()
anova_svm = make_pipeline(anova_filter, clf)
anova_svm.fit(X_train, y_train)
# %%
# Once the training is complete, we can predict on new unseen samples. In this
# case, the feature selector will only select the most discriminative features
# based on the information stored during training. Then, the data will be
# passed to the classifier which will make the prediction.
#
# Here, we show the final metrics via a classification report.
from sklearn.metrics import classification_report
y_pred = anova_svm.predict(X_test)
print(classification_report(y_test, y_pred))
# %%
# Be aware that you can inspect a step in the pipeline. For instance, we might
# be interested about the parameters of the classifier. Since we selected
# three features, we expect to have three coefficients.
anova_svm[-1].coef_
# %%
# However, we do not know which features were selected from the original
# dataset. We could proceed by several manners. Here, we will invert the
# transformation of these coefficients to get information about the original
# space.
anova_svm[:-1].inverse_transform(anova_svm[-1].coef_)
# %%
# We can see that the features with non-zero coefficients are the selected
# features by the first step.
 |