File: test_fss.py

package info (click to toggle)
orange3 3.40.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,908 kB
  • sloc: python: 162,745; ansic: 622; makefile: 322; sh: 93; cpp: 77
file content (114 lines) | stat: -rw-r--r-- 4,589 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Test methods with long descriptive names can omit docstrings
# pylint: disable=missing-docstring

import unittest

import numpy as np

from Orange.data import Table
from Orange.preprocess.score import ANOVA, Gini, UnivariateLinearRegression, \
    Chi2
from Orange.preprocess import SelectBestFeatures, Impute, SelectRandomFeatures
from Orange.tests import test_filename


class TestFSS(unittest.TestCase):
    """Tests for ``SelectBestFeatures`` with explicit and default scorers."""

    @classmethod
    def setUpClass(cls):
        # Load each dataset once for the whole class; the tests only read them.
        cls.titanic = Table('titanic')
        cls.heart_disease = Table('heart_disease')
        cls.iris = Table('iris')
        cls.imports = Table(test_filename('datasets/imports-85.tab'))

    @staticmethod
    def _best_attribute(scorer, data):
        # Return the attribute of `data` with the highest `scorer` value.
        # Ranking through `key=` compares scores only; ranking (score, attr)
        # tuples would fall back to comparing the attributes on a tied score.
        return max(data.domain.attributes,
                   key=lambda attr: scorer(data, attr))

    def test_select_1(self):
        gini = Gini()
        # With an integer k=1 the first selected attribute must be the
        # best-scoring one.
        s = SelectBestFeatures(method=gini, k=1)
        data2 = s(self.titanic)
        best = self._best_attribute(gini, self.titanic)
        self.assertEqual(data2.domain.attributes[0], best)

    def test_select_2(self):
        gini = Gini()
        # A float k is a proportion of the input attributes (see the
        # heart_disease check at the end of this test); whatever the count,
        # the best-scoring attribute must come first.
        sel1 = SelectBestFeatures(method=gini, k=1.0)
        data2 = sel1(self.titanic)
        best = self._best_attribute(gini, self.titanic)
        self.assertEqual(data2.domain.attributes[0], best)

        # no k and no threshold, select all attributes
        sel2 = SelectBestFeatures(method=gini, k=0)
        data2 = sel2(self.titanic)
        self.assertEqual(len(data2.domain.attributes),
                         len(self.titanic.domain.attributes))

        # 31% = selection of the top 1 (out of 3) attributes
        sel3 = SelectBestFeatures(method=gini, k=0.31)
        data2 = sel3(self.titanic)
        self.assertEqual(len(data2.domain.attributes), 1)

        # 35% = still the top 1 (out of 3) attributes
        sel3 = SelectBestFeatures(method=gini, k=0.35)
        data2 = sel3(self.titanic)
        self.assertEqual(len(data2.domain.attributes), 1)

        # 1% = one attribute (out of 3) is selected even for a tiny proportion
        sel3 = SelectBestFeatures(method=gini, k=0.01)
        data2 = sel3(self.titanic)
        self.assertEqual(len(data2.domain.attributes), 1)

        # number of selected attrs should be relative to number of current
        # input attrs: 100% of heart_disease is expected to give 13 attributes
        sel3 = SelectBestFeatures(method=gini, k=1.0)
        data2 = sel3(self.heart_disease)
        self.assertEqual(len(data2.domain.attributes), 13)

    def test_select_threshold(self):
        anova = ANOVA()
        t = 30
        data2 = SelectBestFeatures(method=anova, threshold=t)(self.heart_disease)
        # Check attribute by attribute so that a failure reports the
        # offending score instead of a bare "False is not true".
        for f in data2.domain.attributes:
            self.assertGreaterEqual(anova(self.heart_disease, f), t)

    def test_error_when_using_regression_score_on_classification_data(self):
        s = SelectBestFeatures(method=UnivariateLinearRegression(), k=3)
        with self.assertRaises(ValueError):
            s(self.heart_disease)

    def test_discrete_scores_on_continuous_features(self):
        c = self.iris.columns
        # Expected attribute order; it does not depend on the scorer.
        expected = \
            (c.petal_length, c.petal_width, c.sepal_length, c.sepal_width)
        for method in (Gini(), Chi2()):
            # subTest names the failing scorer and still runs the other one.
            with self.subTest(method=type(method).__name__):
                d1 = SelectBestFeatures(method=method)(self.iris)
                self.assertSequenceEqual(d1.domain.attributes, expected)

                # Scoring a whole table gives one score per attribute.
                scores = method(d1)
                self.assertEqual(len(scores), 4)

                score = method(d1, c.petal_length)
                self.assertEqual(score.ndim, 0)  # a scalar
                self.assertTrue(np.issubdtype(score.dtype, float))

    def test_continuous_scores_on_discrete_features(self):
        data = Impute()(self.imports)
        # Applying the scorer directly to the table is expected to fail ...
        with self.assertRaises(ValueError):
            UnivariateLinearRegression()(data)

        # ... while the selector with the same scorer must succeed and, with
        # neither k nor threshold given, keep the number of variables.
        d1 = SelectBestFeatures(method=UnivariateLinearRegression())(data)
        self.assertEqual(len(d1.domain.variables), len(data.domain.variables))

    def test_defaults(self):
        # No scoring method is passed, so the selector has to work with its
        # default; only the type of the selected attributes is checked.
        fs = SelectBestFeatures(k=3)
        data2 = fs(Impute()(self.imports))
        self.assertTrue(all(a.is_continuous for a in data2.domain.attributes))
        data2 = fs(self.iris)
        self.assertTrue(all(a.is_continuous for a in data2.domain.attributes))
        data2 = fs(self.titanic)
        self.assertTrue(all(a.is_discrete for a in data2.domain.attributes))


class TestSelectRandomFeatures(unittest.TestCase):
    """Tests for the number of attributes kept by ``SelectRandomFeatures``."""

    def test_select_random_features(self):
        data = Table("heart_disease")
        # Pairs of (k, expected number of attributes in the result): one
        # integer k and one float k.
        cases = ((3, 3), (0.35, 4))
        for k_features, n_attributes in cases:
            # subTest labels a failure with its k and lets the other case run.
            with self.subTest(k=k_features):
                srf = SelectRandomFeatures(k=k_features)
                new_data = srf(data)
                self.assertEqual(len(new_data.domain.attributes), n_attributes)