1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
|
# Author: Lars Buitinck
# License: 3-clause BSD
from numbers import Real
import numpy as np
from ..base import BaseEstimator, _fit_context
from ..utils._param_validation import Interval
from ..utils.sparsefuncs import mean_variance_axis, min_max_axis
from ..utils.validation import check_is_fitted
from ._base import SelectorMixin
class VarianceThreshold(SelectorMixin, BaseEstimator):
"""Feature selector that removes all low-variance features.
This feature selection algorithm looks only at the features (X), not the
desired outputs (y), and can thus be used for unsupervised learning.
Read more in the :ref:`User Guide <variance_threshold>`.
Parameters
----------
threshold : float, default=0
Features with a training-set variance lower than this threshold will
be removed. The default is to keep all features with non-zero variance,
i.e. remove the features that have the same value in all samples.
Attributes
----------
variances_ : array, shape (n_features,)
Variances of individual features.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
SelectFromModel: Meta-transformer for selecting features based on
importance weights.
SelectPercentile : Select features according to a percentile of the highest
scores.
SequentialFeatureSelector : Transformer that performs Sequential Feature
Selection.
Notes
-----
Allows NaN in the input.
Raises ValueError if no feature in X meets the variance threshold.
Examples
--------
The following dataset has integer features, two of which are the same
in every sample. These are removed with the default setting for threshold::
>>> from sklearn.feature_selection import VarianceThreshold
>>> X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
>>> selector = VarianceThreshold()
>>> selector.fit_transform(X)
array([[2, 0],
[1, 4],
[1, 1]])
"""
_parameter_constraints: dict = {
"threshold": [Interval(Real, 0, None, closed="left")]
}
def __init__(self, threshold=0.0):
self.threshold = threshold
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
"""Learn empirical variances from X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Data from which to compute variances, where `n_samples` is
the number of samples and `n_features` is the number of features.
y : any, default=None
Ignored. This parameter exists only for compatibility with
sklearn.pipeline.Pipeline.
Returns
-------
self : object
Returns the instance itself.
"""
X = self._validate_data(
X,
accept_sparse=("csr", "csc"),
dtype=np.float64,
force_all_finite="allow-nan",
)
if hasattr(X, "toarray"): # sparse matrix
_, self.variances_ = mean_variance_axis(X, axis=0)
if self.threshold == 0:
mins, maxes = min_max_axis(X, axis=0)
peak_to_peaks = maxes - mins
else:
self.variances_ = np.nanvar(X, axis=0)
if self.threshold == 0:
peak_to_peaks = np.ptp(X, axis=0)
if self.threshold == 0:
# Use peak-to-peak to avoid numeric precision issues
# for constant features
compare_arr = np.array([self.variances_, peak_to_peaks])
self.variances_ = np.nanmin(compare_arr, axis=0)
if np.all(~np.isfinite(self.variances_) | (self.variances_ <= self.threshold)):
msg = "No feature in X meets the variance threshold {0:.5f}"
if X.shape[0] == 1:
msg += " (X contains only one sample)"
raise ValueError(msg.format(self.threshold))
return self
def _get_support_mask(self):
check_is_fitted(self)
return self.variances_ > self.threshold
def _more_tags(self):
return {"allow_nan": True}
|