File: storage.py

package info (click to toggle)
orange3 3.40.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,908 kB
  • sloc: python: 162,745; ansic: 622; makefile: 322; sh: 93; cpp: 77
file content (88 lines) | stat: -rw-r--r-- 3,309 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class Storage:
    """Base interface for data storage objects.

    The class supplies default density reports and declares a set of
    private hooks (filters and statistics) that all raise
    ``NotImplementedError`` here; concrete storages are expected to
    override them.
    """

    # Defaults shared by all storages unless a subclass/instance overrides.
    domain = None

    name = ""

    # Density codes returned by the ``*_density`` methods.
    MISSING, DENSE, SPARSE, SPARSE_BOOL = 0, 1, 2, 3

    def approx_len(self):
        """Return the length of the storage.

        The base implementation simply delegates to ``len(self)``, so the
        object must support ``len()``.
        """
        size = len(self)
        return size

    def X_density(self):
        """Return the density code of X; ``Storage.DENSE`` by default."""
        return Storage.DENSE

    def Y_density(self):
        """Return the density code of Y; ``Storage.DENSE`` by default."""
        return Storage.DENSE

    def metas_density(self):
        """Return the density code of metas; ``Storage.DENSE`` by default."""
        return Storage.DENSE

    def _filter_is_defined(self, columns=None, negate=False):
        """Filter hook; not implemented in the base class.

        NOTE(review): presumably selects rows by whether the given columns
        have defined values -- confirm against an implementing subclass.
        """
        raise NotImplementedError()

    def _filter_has_class(self, negate=False):
        """Filter hook; not implemented in the base class.

        NOTE(review): presumably selects rows by whether the class value
        is present -- confirm against an implementing subclass.
        """
        raise NotImplementedError()

    def _filter_random(self, prob, negate=False):
        """Filter hook; not implemented in the base class.

        NOTE(review): presumably a random selection governed by ``prob``
        -- confirm against an implementing subclass.
        """
        raise NotImplementedError()

    def _filter_same_value(self, column, value, negate=False):
        """Filter hook; not implemented in the base class.

        NOTE(review): presumably selects rows where ``column`` equals
        ``value`` -- confirm against an implementing subclass.
        """
        raise NotImplementedError()

    def _filter_values(self, filter):
        """Filter hook; not implemented in the base class.

        NOTE(review): the expected type of ``filter`` is not visible
        here -- confirm against an implementing subclass.
        """
        raise NotImplementedError()

    def _compute_basic_stats(self, columns=None):
        """Compute basic statistics for each of the requested columns.

        Not implemented in the base class.

        :param columns: columns to compute statistics for; ``None`` means
            all of them
        :return: tuple(min, max, mean, 0, #nans, #non-nans)
        """
        raise NotImplementedError()

    def _compute_distributions(self, columns=None):
        """Compute the distribution of values for the given columns.

        Not implemented in the base class.

        :param columns: columns to compute distributions for
        :return: a list of distributions; the type of each distribution
                 depends on the type of the column:
                   - for discrete columns, a 1d np.array holding the
                     occurrence count of each value;
                   - for continuous columns, a 2d np.array with the
                     distinct (ordered) values of the variable in the
                     first row and their counts in the second.
        """
        raise NotImplementedError()

    def _compute_contingency(self, col_vars=None, row_var=None):
        """
        Compute contingency matrices for one or more discrete or
        continuous variables against the specified discrete variable.

        Not implemented in the base class.

        The resulting list contains a pair for each column variable.
        The first element of the pair holds the contingencies; the second
        element gives the distribution of the row variable over instances
        in which the value of the column variable is missing.

        The format of the contingencies depends on the variable type:

        - for discrete variables, it is a numpy array in which
          element (i, j) is the count of rows with the i-th value of the
          row variable and the j-th value of the column variable.

        - for continuous variables, it is a list of two arrays: the first
          contains the ordered distinct values of the column variable,
          and element (i, j) of the second is the count of rows with the
          i-th value of the row variable and the j-th value of the
          ordered column variable.

        :param col_vars: variables whose values will correspond to columns of
            contingency matrices
        :type col_vars: list of ints, variable names or descriptors of type
            :obj:`Orange.data.Variable`
        :param row_var: a discrete variable whose values will correspond to the
            rows of contingency matrices
        :type row_var: int, variable name or :obj:`Orange.data.DiscreteVariable`
        """
        raise NotImplementedError()