File: fss.py

package info (click to toggle)
orange3 3.40.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,908 kB
  • sloc: python: 162,745; ansic: 622; makefile: 322; sh: 93; cpp: 77
file content (133 lines) | stat: -rw-r--r-- 4,793 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import random
from itertools import takewhile
from operator import itemgetter

import numpy as np


import Orange
from Orange.util import Reprable
from Orange.preprocess.score import ANOVA, GainRatio, \
    UnivariateLinearRegression

# Public names exported by ``from <module> import *``: the two feature
# selector classes defined below.
__all__ = ["SelectBestFeatures", "SelectRandomFeatures"]


class SelectBestFeatures(Reprable):
    """
    A feature selector that builds a new dataset consisting of either the top
    `k` features (if `k` is an `int`) or a proportion (if `k` is a `float`
    between 0.0 and 1.0), or all those that exceed a given `threshold`. Features
    are scored using the provided feature scoring `method`. By default it is
    assumed that feature importance decreases with decreasing scores.

    If both `k` and `threshold` are set, only features satisfying both
    conditions will be selected.

    If `method` is not set, it is automatically selected when presented with
    the dataset. Datasets with both continuous and discrete features are
    scored using a method suitable for the majority of features.

    Parameters
    ----------
    method : Orange.preprocess.score.ClassificationScorer, Orange.preprocess.score.SklScorer
        Univariate feature scoring method.

    k : int or float
        The number or proportion of top features to select. A float is
        multiplied by the number of attributes and rounded, but never to
        fewer than one feature. `None` and `0` both mean "no limit on the
        number of features".

    threshold : float
        A threshold that a feature should meet according to the provided method.
        `None` and `0` both mean "no threshold".

    decreasing : boolean
        The order of feature importance when sorted from the most to the least
        important feature.
    """

    def __init__(self, method=None, k=None, threshold=None, decreasing=True):
        self.method = method
        self.k = k
        self.threshold = threshold
        self.decreasing = decreasing

    def __call__(self, data):
        """
        Return `data` transformed to a domain that keeps only the selected
        attributes; class variables and metas are carried over unchanged.
        """
        features = data.domain.attributes
        n_attrs = len(features)
        if n_attrs == 0:
            # Nothing to score or rank. Returning early also avoids the
            # division by zero that choosing a default method would hit.
            return data.transform(Orange.data.Domain(
                [], data.domain.class_vars, data.domain.metas))

        if isinstance(self.k, float):
            # a proportion: round to a count, but keep at least one feature
            effective_k = np.round(self.k * n_attrs).astype(int) or 1
        else:
            effective_k = self.k

        method = self.method
        # select default method according to the provided data
        if method is None:
            method = self._default_method(data)

        try:
            scores = method(data)
        except ValueError:
            # the scorer could not handle the data as a whole; score only
            # the attributes of the type it supports
            scores = self.score_only_nice_features(data, method)
        best = sorted(zip(scores, features), key=itemgetter(0),
                      reverse=self.decreasing)
        # Both criteria are tested for truthiness, so 0 disables them just
        # like None does.
        if self.k:
            best = best[:effective_k]
        if self.threshold:
            pred = ((lambda x: x[0] >= self.threshold) if self.decreasing else
                    (lambda x: x[0] <= self.threshold))
            # `best` is sorted, so everything after the first failure fails too
            best = takewhile(pred, best)

        domain = Orange.data.Domain([f for _, f in best],
                                    data.domain.class_vars, data.domain.metas)
        return data.transform(domain)

    @staticmethod
    def _default_method(data):
        """
        Pick a scorer for `data`: `GainRatio` for a discrete class when at
        least half of the attributes are discrete, `ANOVA` for a discrete
        class otherwise, and `UnivariateLinearRegression` when the class is
        not discrete.

        The caller must ensure that `data` has at least one attribute.
        """
        if not data.domain.has_discrete_class:
            return UnivariateLinearRegression()
        attributes = data.domain.attributes
        discr_ratio = sum(a.is_discrete for a in attributes) / len(attributes)
        return GainRatio() if discr_ratio >= 0.5 else ANOVA()

    def score_only_nice_features(self, data, method):
        """
        Score, one by one, only the attributes that are instances of
        `method.feature_type`.

        Returns an array with one score per attribute of `data`; attributes
        of other types get -inf (or +inf when `decreasing` is false) so that
        they are ranked last.
        """
        attributes = data.domain.attributes
        # dtype must be defined because array can be empty
        mask = np.array([isinstance(a, method.feature_type)
                         for a in attributes], dtype=bool)
        scores = [method(data, f)
                  for f, supported in zip(attributes, mask) if supported]
        bad = float('-inf') if self.decreasing else float('inf')
        all_scores = np.full(len(attributes), bad)
        all_scores[mask] = scores
        return all_scores


class SelectRandomFeatures(Reprable):
    """
    A feature selector that draws `k` attributes at random from the input
    dataset and returns the dataset restricted to them. Class variables
    and metas are kept as they are.

    Parameters
    ----------

    k : int or float (default = 0.1)
        The number of features to retain if an integer, or the proportion
        of features to retain (from 0.0 to 1.0) if a float. A proportion is
        converted to a count by truncation.
    """

    def __init__(self, k=0.1):
        self.k = k

    def __call__(self, data):
        source = data.domain
        candidates = source.attributes
        total = len(candidates)

        # a float is a share of the available attributes, an int is a count
        wanted = int(total * self.k) if isinstance(self.k, float) else self.k

        # never ask for more attributes than there are
        chosen = random.sample(candidates, min(wanted, total))
        reduced = Orange.data.Domain(chosen, source.class_vars, source.metas)
        return data.transform(reduced)