1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
|
import random
from itertools import takewhile
from operator import itemgetter
import numpy as np
import Orange
from Orange.util import Reprable
from Orange.preprocess.score import ANOVA, GainRatio, \
UnivariateLinearRegression
__all__ = ["SelectBestFeatures", "SelectRandomFeatures"]
class SelectBestFeatures(Reprable):
    """
    A feature selector that builds a new dataset consisting of either the top
    `k` features (if `k` is an `int`) or a proportion (if `k` is a `float`
    between 0.0 and 1.0), or all those that exceed a given `threshold`. Features
    are scored using the provided feature scoring `method`. By default it is
    assumed that feature importance decreases with decreasing scores.

    If both `k` and `threshold` are set, only features satisfying both
    conditions will be selected.

    If `method` is not set, it is automatically selected when presented with
    the dataset. Datasets with both continuous and discrete features are
    scored using a method suitable for the majority of features.

    Parameters
    ----------
    method : Orange.preprocess.score.ClassificationScorer, Orange.preprocess.score.SklScorer
        Univariate feature scoring method.

    k : int or float
        The number or proportion of top features to select. A falsy value
        (`None` or 0) applies no limit on the number of features.

    threshold : float
        A threshold that a feature should meet according to the provided method.
        `None` disables thresholding; 0 is treated as a real threshold.

    decreasing : boolean
        The order of feature importance when sorted from the most to the least
        important feature.
    """

    def __init__(self, method=None, k=None, threshold=None, decreasing=True):
        self.method = method
        self.k = k
        self.threshold = threshold
        self.decreasing = decreasing

    def __call__(self, data):
        """
        Score the attributes of `data` and return `data` transformed to a
        domain that keeps only the selected attributes; class variables and
        metas are carried over unchanged.
        """
        features = data.domain.attributes
        n_attrs = len(features)
        if n_attrs == 0:
            # Nothing to score or select. Returning here also avoids the
            # division by zero in the automatic method selection below.
            return data.transform(Orange.data.Domain(
                [], data.domain.class_vars, data.domain.metas))

        if isinstance(self.k, float):
            # A proportion always keeps at least one feature.
            effective_k = np.round(self.k * n_attrs).astype(int) or 1
        else:
            effective_k = self.k

        method = self.method
        # select default method according to the provided data
        if method is None:
            if data.domain.has_discrete_class:
                discr_ratio = sum(a.is_discrete for a in features) / n_attrs
                method = GainRatio() if discr_ratio >= 0.5 else ANOVA()
            else:
                method = UnivariateLinearRegression()

        try:
            scores = method(data)
        except ValueError:
            # The scorer rejected the data as a whole (presumably because
            # some attributes are of a type it does not support); fall back
            # to scoring only the attributes it declares support for.
            scores = self.score_only_nice_features(data, method)

        best = sorted(zip(scores, features), key=itemgetter(0),
                      reverse=self.decreasing)
        if self.k:
            best = best[:effective_k]
        # Compare against None so that a threshold of 0 is still applied.
        if self.threshold is not None:
            if self.decreasing:
                def meets_threshold(scored):
                    return scored[0] >= self.threshold
            else:
                def meets_threshold(scored):
                    return scored[0] <= self.threshold
            # `best` is sorted from most to least important, so everything
            # after the first failing feature fails as well.
            best = takewhile(meets_threshold, best)

        domain = Orange.data.Domain([f for _, f in best],
                                    data.domain.class_vars, data.domain.metas)
        return data.transform(domain)

    def score_only_nice_features(self, data, method):
        """
        Score only the attributes that are instances of `method.feature_type`.

        Returns an array with one score per attribute of `data`. Attributes
        of any other type receive the worst possible score for the current
        ordering (-inf when `decreasing`, +inf otherwise).
        """
        attributes = data.domain.attributes
        # dtype must be defined because array can be empty
        mask = np.array([isinstance(a, method.feature_type)
                         for a in attributes], dtype=bool)
        features = [f for f, supported in zip(attributes, mask) if supported]
        scores = [method(data, f) for f in features]
        bad = float('-inf') if self.decreasing else float('inf')
        all_scores = np.full(len(attributes), bad)
        all_scores[mask] = scores
        return all_scores
class SelectRandomFeatures(Reprable):
    """
    A feature selector that picks `k` attributes at random from the input
    dataset and returns the dataset restricted to those attributes.
    `k` may be an integer (the number of features) or a float (from 0.0
    to 1.0, the proportion of features to retain).

    Parameters
    ----------
    k : int or float (default = 0.1)
        The number or proportion of features to retain.
    """

    def __init__(self, k=0.1):
        self.k = k

    def __call__(self, data):
        """
        Return `data` transformed to a domain with a random subset of its
        attributes; class variables and metas are carried over unchanged.
        """
        source_domain = data.domain
        n_available = len(source_domain.attributes)

        # A float is a proportion of the available attributes (truncated
        # towards zero); anything else is used as the count directly.
        n_wanted = self.k
        if isinstance(n_wanted, float):
            n_wanted = int(n_available * n_wanted)

        # Never ask for more attributes than exist.
        chosen = random.sample(source_domain.attributes,
                               min(n_wanted, n_available))
        reduced_domain = Orange.data.Domain(
            chosen, source_domain.class_vars, source_domain.metas)
        return data.transform(reduced_domain)
|