File: continuize.py

package info (click to toggle)
orange3 3.40.0-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,912 kB
  • sloc: python: 162,745; ansic: 622; makefile: 322; sh: 93; cpp: 77
file content (100 lines) | stat: -rw-r--r-- 4,092 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from Orange.data import ContinuousVariable, Domain
from Orange.statistics import distribution
from Orange.util import Reprable
from Orange.preprocess.transformation import \
    Identity, Indicator, Indicator1, Normalizer
from Orange.preprocess.preprocess import Continuize

# Names exported by ``from <this module> import *``.
__all__ = ["DomainContinuizer"]


class DomainContinuizer(Reprable):
    """Construct a domain in which discrete variables are made continuous.

    Calling an instance with a data table (or a bare ``Domain``) returns a
    new ``Domain``. Non-discrete variables are kept unchanged; every discrete
    variable is replaced by zero or more ``ContinuousVariable`` objects whose
    ``compute_value`` refers back to the original variable. Metas are copied
    over untouched.

    Parameters
    ----------
    zero_based : bool
        Selects the encoding. When true, indicator columns are built with
        ``Indicator`` and normalized ordinals with ``Normalizer(var, 0,
        1 / (n - 1))``; otherwise ``Indicator1`` and ``Normalizer(var,
        (n - 1) / 2, 2 / (n - 1))`` are used (``n`` is the number of values).
    multinomial_treatment : Continuize treatment constant
        How discrete variables are handled:

        - ``Indicators``: one indicator column per value.
        - ``FirstAsBase``: one indicator per value except the first.
        - ``FrequentAsBase``: one indicator per value except the one
          returned by the variable's distribution ``modus()``; requires
          actual data, not just a domain.
        - ``Remove``: discrete variables are dropped.
        - ``RemoveMultinomial``: variables with more than two values are
          dropped; binary ones are handled as in ``FirstAsBase``.
        - ``ReportError``: ``ValueError`` is raised if any discrete variable
          has more than two values; binary ones are handled as in
          ``FirstAsBase``.
        - ``AsOrdinal``: a single column with ``Identity`` as transformation.
        - ``AsNormalizedOrdinal``: a single column with a ``Normalizer``
          transformation (see ``zero_based``).

        Regardless of the treatment, discrete variables with fewer than two
        values are dropped.
    transform_class : bool
        If true, class variables are transformed in the same way as
        attributes; otherwise they are passed through unchanged.
    """

    def __init__(self, zero_based=True,
                 multinomial_treatment=Continuize.Indicators,
                 transform_class=False):
        self.zero_based = zero_based
        self.multinomial_treatment = multinomial_treatment
        self.transform_class = transform_class

    def __call__(self, data):
        """Return the continuized domain for `data`.

        Parameters
        ----------
        data : Domain or an object with a ``domain`` attribute
            The source of the variables. Actual data is required for
            ``Continuize.FrequentAsBase`` when there is a discrete variable
            to transform, because value distributions must be computed.

        Returns
        -------
        Domain
            Domain built from the transformed attributes, the (optionally
            transformed) class variables and the original metas.

        Raises
        ------
        ValueError
            If the treatment is ``Continuize.ReportError`` and some variable
            in ``domain.variables`` is discrete with more than two values, or
            if a discrete variable must be encoded with indicators under a
            treatment this class does not recognize.
        TypeError
            If distributions are needed but only a ``Domain`` was given.
        """
        treat = self.multinomial_treatment
        transform_class = self.transform_class
        domain = data if isinstance(data, Domain) else data.domain

        # NOTE: the check covers domain.variables, i.e. class variables are
        # inspected even when transform_class is False.
        if (treat == Continuize.ReportError and
                any(var.is_discrete and len(var.values) > 2
                    for var in domain.variables)):
            raise ValueError("data has multinomial attributes")

        # Distributions are only required to find the most frequent value.
        needs_discrete = (treat == Continuize.FrequentAsBase and
                          domain.has_discrete_attributes(transform_class))
        dists = None
        if needs_discrete:
            if isinstance(data, Domain):
                raise TypeError("continuizer requires data")
            # Same positional flags as before (False, True); presumably
            # "skip discrete" / "skip continuous", so that `dists` holds one
            # entry per discrete variable -- which is how `var_ptr` indexes it.
            dists = distribution.get_distributions(data, False, True)

        # Position of the current discrete variable within `dists`.
        var_ptr = 0

        def transform_discrete(var):
            """Return the list of continuous replacements for `var`."""
            n_values = len(var.values)
            if (n_values < 2 or
                    treat == Continuize.Remove or
                    (treat == Continuize.RemoveMultinomial and n_values > 2)):
                return []

            if treat == Continuize.AsOrdinal:
                return [ContinuousVariable(
                    var.name, compute_value=Identity(var), sparse=var.sparse)]

            if treat == Continuize.AsNormalizedOrdinal:
                # n_values >= 2 is guaranteed by the guard above, so the
                # divisions below cannot hit zero.
                if self.zero_based:
                    normalizer = Normalizer(var, 0, 1 / (n_values - 1))
                else:
                    normalizer = Normalizer(var, (n_values - 1) / 2,
                                            2 / (n_values - 1))
                return [ContinuousVariable(
                    var.name, compute_value=normalizer, sparse=var.sparse)]

            # Indicator encodings: `base` is the index of the value that gets
            # no indicator column (-1 matches no index, so none is skipped).
            if treat == Continuize.Indicators:
                base = -1
            elif treat in (Continuize.FirstAsBase,
                           Continuize.RemoveMultinomial,
                           Continuize.ReportError):
                # Only binary variables get here for RemoveMultinomial and
                # ReportError. ReportError used to fall through to the
                # distribution lookup below, where `dists` was never
                # assigned, and failed with NameError.
                base = 0
            elif treat == Continuize.FrequentAsBase:
                base = dists[var_ptr].modus()
            else:
                raise ValueError(
                    "unsupported multinomial treatment: {!r}".format(treat))

            ind_class = Indicator if self.zero_based else Indicator1
            return [
                ContinuousVariable(
                    "{}={}".format(var.name, val),
                    compute_value=ind_class(var, i),
                    sparse=var.sparse)
                for i, val in enumerate(var.values)
                if i != base
            ]

        def transform_list(variables):
            """Transform discrete variables in `variables`; keep the rest."""
            nonlocal var_ptr
            new_vars = []
            for var in variables:
                if var.is_discrete:
                    new_vars += transform_discrete(var)
                    # Advance even when the variable was dropped, to stay
                    # aligned with `dists`.
                    if needs_discrete:
                        var_ptr += 1
                else:
                    new_vars.append(var)
            return new_vars

        new_attrs = transform_list(domain.attributes)
        if transform_class:
            new_classes = transform_list(domain.class_vars)
        else:
            new_classes = domain.class_vars
        return Domain(new_attrs, new_classes, domain.metas)