1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
|
# Authors: Gilles Louppe, Mathieu Blondel, Maheshakya Wijewardena
# License: BSD 3 clause
import numpy as np
from .base import SelectorMixin
from ..base import TransformerMixin, BaseEstimator, clone
from ..externals import six
from ..utils import safe_mask, check_array, deprecated
from ..utils.validation import check_is_fitted
from ..exceptions import NotFittedError
def _get_feature_importances(estimator):
"""Retrieve or aggregate feature importances from estimator"""
if hasattr(estimator, "feature_importances_"):
importances = estimator.feature_importances_
elif hasattr(estimator, "coef_"):
if estimator.coef_.ndim == 1:
importances = np.abs(estimator.coef_)
else:
importances = np.sum(np.abs(estimator.coef_), axis=0)
else:
raise ValueError(
"The underlying estimator %s has no `coef_` or "
"`feature_importances_` attribute. Either pass a fitted estimator"
" to SelectFromModel or call fit before calling transform."
% estimator.__class__.__name__)
return importances
def _calculate_threshold(estimator, importances, threshold):
"""Interpret the threshold value"""
if threshold is None:
# determine default from estimator
est_name = estimator.__class__.__name__
if ((hasattr(estimator, "penalty") and estimator.penalty == "l1") or
"Lasso" in est_name):
# the natural default threshold is 0 when l1 penalty was used
threshold = 1e-5
else:
threshold = "mean"
if isinstance(threshold, six.string_types):
if "*" in threshold:
scale, reference = threshold.split("*")
scale = float(scale.strip())
reference = reference.strip()
if reference == "median":
reference = np.median(importances)
elif reference == "mean":
reference = np.mean(importances)
else:
raise ValueError("Unknown reference: " + reference)
threshold = scale * reference
elif threshold == "median":
threshold = np.median(importances)
elif threshold == "mean":
threshold = np.mean(importances)
else:
raise ValueError("Expected threshold='mean' or threshold='median' "
"got %s" % threshold)
else:
threshold = float(threshold)
return threshold
class _LearntSelectorMixin(TransformerMixin):
# Note because of the extra threshold parameter in transform, this does
# not naturally extend from SelectorMixin
"""Transformer mixin selecting features based on importance weights.
This implementation can be mixin on any estimator that exposes a
``feature_importances_`` or ``coef_`` attribute to evaluate the relative
importance of individual features for feature selection.
"""
@deprecated('Support to use estimators as feature selectors will be '
'removed in version 0.19. Use SelectFromModel instead.')
def transform(self, X, threshold=None):
"""Reduce X to its most important features.
Uses ``coef_`` or ``feature_importances_`` to determine the most
important features. For models with a ``coef_`` for each class, the
absolute sum over the classes is used.
Parameters
----------
X : array or scipy sparse matrix of shape [n_samples, n_features]
The input samples.
threshold : string, float or None, optional (default=None)
The threshold value to use for feature selection. Features whose
importance is greater or equal are kept while the others are
discarded. If "median" (resp. "mean"), then the threshold value is
the median (resp. the mean) of the feature importances. A scaling
factor (e.g., "1.25*mean") may also be used. If None and if
available, the object attribute ``threshold`` is used. Otherwise,
"mean" is used by default.
Returns
-------
X_r : array of shape [n_samples, n_selected_features]
The input samples with only the selected features.
"""
check_is_fitted(self, ('coef_', 'feature_importances_'),
all_or_any=any)
X = check_array(X, 'csc')
importances = _get_feature_importances(self)
if len(importances) != X.shape[1]:
raise ValueError("X has different number of features than"
" during model fitting.")
if threshold is None:
threshold = getattr(self, 'threshold', None)
threshold = _calculate_threshold(self, importances, threshold)
# Selection
try:
mask = importances >= threshold
except TypeError:
# Fails in Python 3.x when threshold is str;
# result is array of True
raise ValueError("Invalid threshold: all features are discarded.")
if np.any(mask):
mask = safe_mask(X, mask)
return X[:, mask]
else:
raise ValueError("Invalid threshold: all features are discarded.")
class SelectFromModel(BaseEstimator, SelectorMixin):
"""Meta-transformer for selecting features based on importance weights.
.. versionadded:: 0.17
Parameters
----------
estimator : object
The base estimator from which the transformer is built.
This can be both a fitted (if ``prefit`` is set to True)
or a non-fitted estimator.
threshold : string, float, optional default None
The threshold value to use for feature selection. Features whose
importance is greater or equal are kept while the others are
discarded. If "median" (resp. "mean"), then the ``threshold`` value is
the median (resp. the mean) of the feature importances. A scaling
factor (e.g., "1.25*mean") may also be used. If None and if the
estimator has a parameter penalty set to l1, either explicitly
or implicitly (e.g, Lasso), the threshold used is 1e-5.
Otherwise, "mean" is used by default.
prefit : bool, default False
Whether a prefit model is expected to be passed into the constructor
directly or not. If True, ``transform`` must be called directly
and SelectFromModel cannot be used with ``cross_val_score``,
``GridSearchCV`` and similar utilities that clone the estimator.
Otherwise train the model using ``fit`` and then ``transform`` to do
feature selection.
Attributes
----------
`estimator_`: an estimator
The base estimator from which the transformer is built.
This is stored only when a non-fitted estimator is passed to the
``SelectFromModel``, i.e when prefit is False.
`threshold_`: float
The threshold value used for feature selection.
"""
def __init__(self, estimator, threshold=None, prefit=False):
self.estimator = estimator
self.threshold = threshold
self.prefit = prefit
def _get_support_mask(self):
# SelectFromModel can directly call on transform.
if self.prefit:
estimator = self.estimator
elif hasattr(self, 'estimator_'):
estimator = self.estimator_
else:
raise ValueError(
'Either fit the model before transform or set "prefit=True"'
' while passing the fitted estimator to the constructor.')
scores = _get_feature_importances(estimator)
self.threshold_ = _calculate_threshold(estimator, scores,
self.threshold)
return scores >= self.threshold_
def fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.
y : array-like, shape (n_samples,)
The target values (integers that correspond to classes in
classification, real numbers in regression).
**fit_params : Other estimator specific parameters
Returns
-------
self : object
Returns self.
"""
if self.prefit:
raise NotFittedError(
"Since 'prefit=True', call transform directly")
if not hasattr(self, "estimator_"):
self.estimator_ = clone(self.estimator)
self.estimator_.fit(X, y, **fit_params)
return self
def partial_fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer only once.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.
y : array-like, shape (n_samples,)
The target values (integers that correspond to classes in
classification, real numbers in regression).
**fit_params : Other estimator specific parameters
Returns
-------
self : object
Returns self.
"""
if self.prefit:
raise NotFittedError(
"Since 'prefit=True', call transform directly")
if not hasattr(self, "estimator_"):
self.estimator_ = clone(self.estimator)
self.estimator_.partial_fit(X, y, **fit_params)
return self
|