File: univariate_selection.py
Package: scikit-learn 0.11.0

# -*- coding: utf-8 -*-
"""Univariate features selection."""

# Authors: V. Michel, B. Thirion, G. Varoquaux, A. Gramfort, E. Duchesnay.
#          L. Buitinck
# License: BSD 3 clause

from abc import ABCMeta, abstractmethod
from functools import reduce  # reduce is not a builtin on Python 3

import numpy as np
from scipy import stats
from scipy.sparse import issparse

from ..base import BaseEstimator, TransformerMixin
from ..preprocessing import LabelBinarizer
from ..utils import array2d, atleast2d_or_csr, deprecated, as_float_array
from ..utils.extmath import safe_sparse_dot

######################################################################
# Scoring functions


# The following function is a rewriting of scipy.stats.f_oneway.
# Unlike the scipy.stats.f_oneway implementation, it avoids copying
# the data while still leaving the inputs unchanged.
def f_oneway(*args):
    """Performs a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean.  The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Parameters
    ----------
    sample1, sample2, ... : array_like
        The sample measurements should be given as arguments.

    Returns
    -------
    F-value : float or array
        The computed F-value of the test (one value per feature column
        when the samples are 2-D).
    p-value : float or array
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still be
    possible to use the Kruskal-Wallis H-test (``scipy.stats.kruskal``),
    although with some loss of power.

    The algorithm is from Heiman[2], pp.394-7.

    See ``scipy.stats.f_oneway``, which should give the same results while
    being less efficient.

    References
    ----------

    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.

    """
    n_classes = len(args)
    n_samples_per_class = np.array([len(a) for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = reduce(lambda x, y: x + y,
            [np.sum(a ** 2, axis=0) for a in args])
    sums_args = [np.sum(a, axis=0) for a in args]
    square_of_sums_alldata = reduce(lambda x, y: x + y, sums_args) ** 2
    square_of_sums_args = [s ** 2 for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    prob = stats.f.sf(f, dfbn, dfwn)  # survival function of the F-distribution
    return f, prob

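# Illustrative usage sketch (added commentary with assumed toy data, not part
# of the original module): f_oneway on three 1-D samples. A large F with a
# small p-value indicates the group means differ more than chance alone
# would explain.
#
#     >>> import numpy as np
#     >>> g1 = np.array([1.0, 2.0, 3.0])
#     >>> g2 = np.array([2.1, 3.1, 4.1])
#     >>> g3 = np.array([5.0, 6.0, 7.0])
#     >>> F, p = f_oneway(g1, g2, g3)  # F is a scalar here (1-D samples)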

def f_classif(X, y):
    """Compute the Anova F-value for the provided sample

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        the set of regressors sthat will tested sequentially
    y : array of shape(n_samples)
        the data matrix

    Returns
    -------
    F : array of shape (m),
        the set of F values
    pval : array of shape(m),
        the set of p-values
    """
    X = array2d(X)
    y = np.asarray(y).ravel()
    args = [X[y == k] for k in np.unique(y)]
    return f_oneway(*args)

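# Illustrative usage sketch (assumed data, not from the original file):
# ranking the iris features by their ANOVA F-value.
#
#     >>> from sklearn.datasets import load_iris
#     >>> iris = load_iris()
#     >>> F, pval = f_classif(iris.data, iris.target)
#     >>> F.shape  # one score per feature
#     (4,)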

def chi2(X, y):
    """Compute χ² (chi-squared) statistic for each class/feature combination.

    This score can be used to select the n_features features with the
    highest values for the χ² (chi-square) statistic from either boolean or
    multinomially distributed data (e.g., term counts in document
    classification) relative to the classes.

    Recall that the χ² statistic measures dependence between stochastic
    variables, so a transformer based on this function "weeds out" the features
    that are the most likely to be independent of class and therefore
    irrelevant for classification.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features_in]
        Sample vectors.

    y : array-like, shape = n_samples
        Target vector (class labels).

    Returns
    -------
    chi2 : array of shape (n_features,)
        χ² statistic for each feature.
    pval : array of shape (n_features,)
        p-value for each feature.

    Notes
    -----
    Complexity of this algorithm is O(n_classes * n_features).
    """

    # XXX: we might want to do some of the following in logspace instead for
    # numerical stability.
    X = atleast2d_or_csr(X)
    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    observed = safe_sparse_dot(Y.T, X)          # n_classes * n_features

    feature_count = array2d(X.sum(axis=0))
    class_prob = array2d(Y.mean(axis=0))
    expected = safe_sparse_dot(class_prob.T, feature_count)

    return stats.chisquare(observed, expected)

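# Illustrative usage sketch (assumed data, not from the original file):
# chi2 on a tiny term-count matrix with two classes.
#
#     >>> import numpy as np
#     >>> X = np.array([[1, 0, 3],
#     ...               [0, 2, 1],
#     ...               [2, 0, 4]])
#     >>> y = np.array([0, 1, 0])
#     >>> chi2_scores, pvals = chi2(X, y)  # one (score, p-value) per column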

def f_regression(X, y, center=True):
    """Univariate linear regression tests

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 3 steps:

    1. The regressor of interest and the data are orthogonalized
       with respect to constant regressors.
    2. The cross-correlation between data and regressors is computed.
    3. It is converted to an F score, then to a p-value.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : array of shape (n_samples,)
        The data vector (target).

    center : bool, default=True
        If True, X and y are centered.

    Returns
    -------
    F : array of shape (n_features,)
        The set of F values.

    pval : array of shape (n_features,)
        The set of p-values.
    """
    y = as_float_array(y, copy=False).ravel()
    X = as_float_array(X, copy=False)  # copy only if center
    if center:
        y = y - np.mean(y)
        X = X.copy('F')  # Fortran-ordered copy: column-wise operations are faster
        X -= np.mean(X, axis=0)

    # compute the correlation
    corr = np.dot(y, X)
    corr /= np.sqrt(np.sum(X ** 2, 0))
    corr /= np.sqrt(np.sum(y ** 2))

    # convert to p-value
    dof = y.size - 2
    F = corr ** 2 / (1 - corr ** 2) * dof
    pv = stats.f.sf(F, 1, dof)
    return F, pv

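# Illustrative usage sketch (assumed data, not from the original file):
# scoring each column of a random design against a noisy linear target.
#
#     >>> import numpy as np
#     >>> rng = np.random.RandomState(0)
#     >>> X = rng.randn(50, 3)
#     >>> y = X[:, 0] + 0.1 * rng.randn(50)  # only feature 0 is informative
#     >>> F, pval = f_regression(X, y)
#     >>> int(np.argmax(F))  # feature 0 should get the largest F
#     0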

######################################################################
# General class for filter univariate selection

class _AbstractUnivariateFilter(BaseEstimator, TransformerMixin):
    __metaclass__ = ABCMeta

    def __init__(self, score_func):
        """ Initialize the univariate feature selection.

        Parameters
        ----------
        score_func : callable
            Function taking two arrays X and y, and returning a pair of
            arrays (scores, pvalues).
        """
        if not callable(score_func):
            raise TypeError(
                "The score function should be a callable, '%s' (type %s) "
                "was passed." % (score_func, type(score_func)))
        self.score_func = score_func

    def fit(self, X, y):
        """
        Run the score function on (X, y) and store the resulting scores
        and p-values.
        """
        scores = self.score_func(X, y)
        self.scores_ = scores[0]
        self.pvalues_ = scores[1]
        return self

    @property
    @deprecated('``_scores`` is deprecated and will be removed in '
                'version 0.12. Please use ``scores_`` instead.')
    def _scores(self):
        return self.scores_

    @property
    @deprecated('``_pvalues`` is deprecated and will be removed in '
                'version 0.12. Please use ``pvalues_`` instead.')
    def _pvalues(self):
        return self.pvalues_

    def get_support(self, indices=False):
        """
        Return a boolean mask, or the integer indices, of the selected
        features.
        """
        mask = self._get_support_mask()
        return mask if not indices else np.where(mask)[0]

    @abstractmethod
    def _get_support_mask(self):
        """
        Must return a boolean mask indicating which features are selected.
        """

    def transform(self, X):
        """
        Transform a new matrix using the selected features
        """
        return atleast2d_or_csr(X)[:, self.get_support(indices=issparse(X))]

    def inverse_transform(self, X):
        """
        Reverse the transformation: expand X back to the original feature
        space, inserting zero columns where features were removed.
        """
        support_ = self.get_support()
        if X.ndim == 1:
            X = X[None, :]
        Xt = np.zeros((X.shape[0], support_.size))
        Xt[:, support_] = X
        return Xt


######################################################################
# Specific filters
######################################################################

class SelectPercentile(_AbstractUnivariateFilter):
    """Filter: Select the best percentile of the p_values

    Parameters
    ===========
    score_func: callable
        function taking two arrays X and y, and returning 2 arrays:
        both scores and pvalues

    percentile: int, optional
        percent of features to keep

    """

    def __init__(self, score_func, percentile=10):
        self.percentile = percentile
        _AbstractUnivariateFilter.__init__(self, score_func)

    def _get_support_mask(self):
        percentile = self.percentile
        if not 0 <= percentile <= 100:
            raise ValueError("percentile should be between 0 and 100"
                             " (%f given)" % (percentile,))
        # Cater for NaNs: avoid scoreatpercentile on the degenerate cases.
        if percentile == 100:
            return np.ones(len(self.pvalues_), dtype=bool)
        elif percentile == 0:
            return np.zeros(len(self.pvalues_), dtype=bool)
        alpha = stats.scoreatpercentile(self.pvalues_, percentile)
        return (self.pvalues_ <= alpha)

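# Illustrative usage sketch (assumed data, not from the original file):
# keep roughly the best half of the iris features under the ANOVA F-test.
#
#     >>> from sklearn.datasets import load_iris
#     >>> iris = load_iris()
#     >>> selector = SelectPercentile(f_classif, percentile=50)
#     >>> X_half = selector.fit(iris.data, iris.target).transform(iris.data)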

class SelectKBest(_AbstractUnivariateFilter):
    """Filter: Select the k lowest p-values.

    Parameters
    ----------
    score_func: callable
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).

    k: int, optional
        Number of top features to select.

    Notes
    -----
    Ties between features with equal p-values will be broken in an unspecified
    way.

    """

    def __init__(self, score_func, k=10):
        self.k = k
        _AbstractUnivariateFilter.__init__(self, score_func)

    def _get_support_mask(self):
        k = self.k
        if k > len(self.pvalues_):
            raise ValueError("cannot select %d features among %d"
                             % (k, len(self.pvalues_)))

        # XXX This should be refactored; we're getting an array of indices
        # from argsort, which we transform to a mask, which we probably
        # transform back to indices later.
        mask = np.zeros(self.pvalues_.shape, dtype=bool)
        mask[np.argsort(self.pvalues_)[:k]] = True
        return mask

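# Illustrative usage sketch (assumed data, not from the original file):
# keep the two best iris features under the chi2 criterion.
#
#     >>> from sklearn.datasets import load_iris
#     >>> iris = load_iris()
#     >>> X_new = SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)
#     >>> X_new.shape
#     (150, 2)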

class SelectFpr(_AbstractUnivariateFilter):
    """Filter: Select the pvalues below alpha based on a FPR test.

    FPR test stands for False Positive Rate test. It controls the total
    amount of false detections.

    Parameters
    ----------
    score_func : callable
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).

    alpha : float, optional
        The highest p-value for features to be kept.
    """

    def __init__(self, score_func, alpha=5e-2):
        self.alpha = alpha
        _AbstractUnivariateFilter.__init__(self, score_func)

    def _get_support_mask(self):
        alpha = self.alpha
        return self.pvalues_ < alpha


class SelectFdr(_AbstractUnivariateFilter):
    """Filter: Select the p-values for an estimated false discovery rate

    This uses the Benjamini-Hochberg procedure. ``alpha`` is the target false
    discovery rate.

    Parameters
    ----------
    score_func : callable
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).

    alpha : float, optional
        The highest uncorrected p-value for features to keep.

    """

    def __init__(self, score_func, alpha=5e-2):
        self.alpha = alpha
        _AbstractUnivariateFilter.__init__(self, score_func)

    def _get_support_mask(self):
        alpha = self.alpha
        n_features = len(self.pvalues_)
        sv = np.sort(self.pvalues_)
        # Benjamini-Hochberg: find the largest rank k such that
        # p_(k) <= alpha * k / n_features, then keep every p-value up to p_(k).
        thresholds = alpha * np.arange(1, n_features + 1) / float(n_features)
        below = sv[sv <= thresholds]
        if below.size == 0:
            return np.zeros(n_features, dtype=bool)
        return self.pvalues_ <= below.max()

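# Worked Benjamini-Hochberg example (added commentary, assumed numbers):
# with alpha = 0.05 and sorted p-values [0.001, 0.02, 0.03, 0.5], the
# thresholds alpha * k / m for k = 1..4 are [0.0125, 0.025, 0.0375, 0.05].
# Ranks 1-3 all pass (0.03 <= 0.0375) while 0.5 > 0.05 fails, so the largest
# passing rank is k = 3 and every feature with a p-value <= 0.03 is kept.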

class SelectFwe(_AbstractUnivariateFilter):
    """Filter: Select the p-values corresponding to Family-wise error rate

    Parameters
    ===========
    score_func: callable
        function taking two arrays X and y, and returning 2 arrays:
        both scores and pvalues

    alpha: float, optional
        the highest uncorrected p-value for features to keep

    """

    def __init__(self, score_func, alpha=5e-2):
        self.alpha = alpha
        _AbstractUnivariateFilter.__init__(self, score_func)

    def _get_support_mask(self):
        alpha = self.alpha
        # Bonferroni correction: compare each p-value against
        # alpha / n_features to control the family-wise error rate.
        return (self.pvalues_ < alpha / len(self.pvalues_))

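# Worked Bonferroni example (added commentary, assumed numbers): with
# alpha = 0.05 and 100 features, the per-feature cut-off is
# 0.05 / 100 = 0.0005; only features with p-values below that are kept.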

######################################################################
# Generic filter
######################################################################

class GenericUnivariateSelect(_AbstractUnivariateFilter):
    """Univariate feature selector with configurable strategy

    Parameters
    ----------
    score_func : callable
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).

    mode : {'percentile', 'k_best', 'fpr', 'fdr', 'fwe'}
        Feature selection mode.

    param : float or int depending on the feature selection mode
        Parameter of the corresponding mode.
    """

    _selection_modes = {'percentile':   SelectPercentile,
                        'k_best':       SelectKBest,
                        'fpr':          SelectFpr,
                        'fdr':          SelectFdr,
                        'fwe':          SelectFwe,
                        }

    def __init__(self, score_func, mode='percentile', param=1e-5):
        if not callable(score_func):
            raise TypeError(
                "The score function should be a callable, '%s' (type %s) "
                "was passed." % (score_func, type(score_func)))
        if mode not in self._selection_modes:
            raise ValueError(
                "The mode passed should be one of %s; '%s' (type %s) "
                "was passed." % (
                        self._selection_modes.keys(),
                        mode, type(mode)))
        self.score_func = score_func
        self.mode = mode
        self.param = param

    def _get_support_mask(self):
        # The dummy score function is never called: the scores and p-values
        # are copied onto the fresh selector from this already-fitted one.
        selector = self._selection_modes[self.mode](lambda x: x)
        selector.pvalues_ = self.pvalues_
        selector.scores_ = self.scores_
        # Now perform some acrobatics to set the right named parameter in
        # the selector
        possible_params = selector._get_param_names()
        possible_params.remove('score_func')
        selector.set_params(**{possible_params[0]: self.param})
        return selector._get_support_mask()
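
# Illustrative usage sketch (assumed data, not from the original file):
# the generic selector with mode='k_best' and param=2 behaves like
# SelectKBest(k=2).
#
#     >>> from sklearn.datasets import load_iris
#     >>> iris = load_iris()
#     >>> selector = GenericUnivariateSelect(chi2, mode='k_best', param=2)
#     >>> X_new = selector.fit_transform(iris.data, iris.target)
#     >>> X_new.shape
#     (150, 2)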