File: majority.py

package info (click to toggle)
orange3 3.40.0-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,912 kB
  • sloc: python: 162,745; ansic: 622; makefile: 322; sh: 93; cpp: 77
file content (84 lines) | stat: -rw-r--r-- 2,970 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from hashlib import sha1

import numpy as np

from Orange.classification import Learner, Model
from Orange.statistics import distribution

# Public API of this module: only the learner is exported by a star-import.
# ConstantModel (defined below) is intentionally not listed here.
__all__ = ["MajorityLearner"]


class MajorityLearner(Learner):
    """
    Baseline learner that ignores the attributes and models only the class.

    The fitted model carries the normalised distribution of class values
    found in the training data; that distribution is what is reported when
    class probabilities are requested. Useful as a reference point when
    comparing real classifiers.

    When two or more class values share the highest frequency (a uniform
    distribution being the extreme case), one of them is picked
    pseudo-randomly. The pick is derived from a SHA-1 hash of the class
    vector, so the same data always yields the same choice.
    """
    def fit_storage(self, dat):
        """
        Build a `ConstantModel` from the class distribution of `dat`.

        :param dat: training data; its `domain` and `Y` are read
        :return: model holding the normalised class distribution and, if
            several classes tie for the maximum, the index of the class
            selected among them
        :raises ValueError: if the domain does not have a discrete class
        """
        domain = dat.domain
        if not domain.has_discrete_class:
            raise ValueError("classification.MajorityLearner expects a domain "
                             "with a (single) categorical variable")

        class_dist = distribution.get_distribution(dat, domain.class_var)
        total = class_dist.sum()
        if total > 0:
            # Normalise in place so the distribution object keeps its type.
            class_dist /= total
        else:
            # Nothing was counted: fall back to equal probabilities.
            class_dist.fill(1 / len(class_dist))

        freqs = np.array(class_dist)
        candidates = np.flatnonzero(freqs == freqs.max())
        n_candidates = len(candidates)
        if n_candidates <= 1:
            # A unique maximum needs no explicit tie-break.
            return ConstantModel(dist=class_dist, unif_maj=None)

        # Deterministic tie-break: hash the raw bytes of the class vector
        # and use the digest, read as an integer, to index the candidates.
        digest = sha1(np.ascontiguousarray(dat.Y).data).hexdigest()
        chosen = candidates[int(digest, 16) % n_candidates]
        return ConstantModel(dist=class_dist, unif_maj=chosen)


class ConstantModel(Model):
    """
    A classification model that gives the same answer for every instance.
    """
    def __init__(self, dist, unif_maj=None):
        """
        Store the class distribution that the model will report.

        :param dist: class distribution; it is copied into a numpy array
            and kept as `self.dist`
        :param unif_maj: index of the class value to predict when several
            classes tie for the highest probability; `None` when no
            explicit choice was made
        """
        self.dist = np.array(dist)
        self.unif_maj = unif_maj

    def predict(self, X):
        """
        Return the stored distribution once for each row of X.

        Only the number of rows, `X.shape[0]`, is read from X.

        :param X: object with a `shape` attribute
        :return: if `unif_maj` is set, a tuple `(values, probs)` where
            `values` repeats `unif_maj` once per row; otherwise only
            `probs`. `probs` stacks `self.dist` once per row.
        """
        # NOTE(review): when only probabilities are returned, the caller
        # presumably derives the class from them -- confirm against the
        # Model base class.
        n_rows = X.shape[0]
        probs = np.tile(self.dist, (n_rows, 1))
        if self.unif_maj is None:
            return probs
        return np.tile(self.unif_maj, (n_rows, )), probs

    def __str__(self):
        return f'ConstantModel {self.dist}'

# Record on the learner class which model class it produces. The assignment
# is made here, after both classes exist, because ConstantModel is defined
# below MajorityLearner in this file.
# NOTE(review): presumably read by the Learner base class or tooling to
# advertise the returned model type -- confirm.
MajorityLearner.__returns__ = ConstantModel