File: base.py

package info (click to toggle)
orange3 3.40.0-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,912 kB
  • sloc: python: 162,745; ansic: 622; makefile: 322; sh: 93; cpp: 77
file content (565 lines) | stat: -rw-r--r-- 21,997 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
# This module defines abstract base classes; derived classes are abstract, too
# pylint: disable=abstract-method

import numpy as np
import sklearn.metrics as skl_metrics

from Orange.data import Table, Domain, Instance, RowInstance
from Orange.misc import DistMatrix
from Orange.preprocess import SklImpute
from Orange.statistics import util


# TODO this *private* function is called from several widgets to prepare
# data for calling the below classes. After we (mostly) stopped relying
# on sklearn.metrics, this is (mostly) unnecessary
# Afterwards, also remove the following line:
# pylint: disable=redefined-outer-name
def _preprocess(table, impute=True):
    """Remove categorical attributes and impute missing values."""
    if not len(table):  # this can be an array, pylint: disable=len-as-condition
        return table
    new_domain = Domain(
        [a for a in table.domain.attributes if a.is_continuous],
        table.domain.class_vars,
        table.domain.metas)
    new_data = table.transform(new_domain)
    if impute:
        new_data = SklImpute()(new_data)
    return new_data


# TODO I have put this function here as a substitute the above `_preprocess`.
# None of them really belongs here; (re?)move them, eventually.
def remove_discrete_features(data, to_metas=False):
    """Remove discrete columns from the data."""
    new_domain = Domain(
        [a for a in data.domain.attributes if a.is_continuous],
        data.domain.class_vars,
        data.domain.metas
        + (() if not to_metas
           else tuple(a for a in data.domain.attributes if not a.is_continuous))
    )
    return data.transform(new_domain)


def remove_nonbinary_features(data, to_metas=False):
    """Remove non-binary columns from the data."""
    new_domain = Domain(
        [a for a in data.domain.attributes
         if a.is_discrete and len(a.values) == 2],
        data.domain.class_vars,
        data.domain.metas +
        (() if not to_metas
         else tuple(a for a in data.domain.attributes
               if not (a.is_discrete and len(a.values) == 2))
         if to_metas else ()))
    return data.transform(new_domain)

def impute(data):
    """Impute missing values."""
    return SklImpute()(data)


def _orange_to_numpy(x):
    """
    Return :class:`numpy.ndarray` (dense or sparse) with attribute data
    from the given instance of :class:`Orange.data.Table`,
    :class:`Orange.data.RowInstance` or :class:`Orange.data.Instance`.
    """
    if isinstance(x, Table):
        return x.X
    elif isinstance(x, Instance):
        return np.atleast_2d(x.x)
    elif isinstance(x, np.ndarray):
        return np.atleast_2d(x)
    else:
        return x  # e.g. None


class Distance:
    """
    Base class for construction of distances models (:obj:`DistanceModel`).

    Distances can be computed between all pairs of rows in one table, or
    between pairs where one row is from one table and one from another.

    If `axis` is set to `0`, the class computes distances between all pairs
    of columns in a table. Distances between columns from separate tables are
    probably meaningless, thus unsupported.

    The class can be used as follows:

    - Constructor is called only with keyword argument `axis` that
      specifies the axis over which the distances are computed, and with other
      subclass-specific keyword arguments.
    - Next, we call the method `fit(data)` to produce an instance of
      :obj:`DistanceModel`; the instance stores any parameters needed for
      computation of distances, such as statistics for normalization and
      handling of missing data.
    - We can then call the :obj:`DistanceModel` with data to compute the
      distance between its rows or columns, or with two data tables to
      compute distances between all pairs of rows.

    The second, shorter way to use this class is to call the constructor with
    one or two data tables and any additional keyword arguments. Constructor
    will execute the above steps and return :obj:`~Orange.misc.DistMatrix`.
    Such usage is here for backward compatibility, practicality and efficiency.

    Args:
        e1 (:obj:`~Orange.data.Table` or :obj:`~Orange.data.Instance` or \
                :obj:`np.ndarray` or `None`):
            data on which to train the model and compute the distances
        e2 (:obj:`~Orange.data.Table` or :obj:`~Orange.data.Instance` or \
                :obj:`np.ndarray` or `None`):
            if present, the class computes distances with pairs coming from
            the two tables
        axis (int):
            axis over which the distances are computed, 1 (default) for
            rows, 0 for columns
        impute (bool):
            if `True` (default is `False`), nans in the computed distances
            are replaced with zeros, and infs with very large numbers.
        callback (callable or None):
            callback function
        similarity (bool):
            if `True` (default is `False`) the class will compute similarities
            instead of distances

    Attributes:
        axis (int):
            axis over which the distances are computed, 1 (default) for
            rows, 0 for columns
        impute (bool):
            if `True` (default is `False`), nans in the computed distances
            are replaced with zeros, and infs with very large numbers.
        normalize (bool):
            if `True`, columns are normalized before computation. This attribute
            applies only if the distance supports normalization.

    The capabilities of the metrics are described with class attributes.

    If class attribute `supports_discrete` is `True`, the distance
    also uses discrete attributes to compute row distances. The use of discrete
    attributes depends upon the type of distance; e.g. Jaccard distance observes
    whether the value is zero or non-zero, while Euclidean and Manhattan
    distance observes whether a pair of values is same or different.

    Class attribute `supports_missing` indicates that the distance can cope
    with missing data. In such cases, letting the distance handle it should
    be preferred over pre-imputation of missing values.

    Class attribute `supports_normalization` indicates that the constructor
    accepts an argument `normalize`. If set to `True`, the metric will attempt
    to normalize the values in a sense that each attribute will have equal
    influence. For instance, the Euclidean distance subtract the mean and
    divides the result by the deviation, while Manhattan distance uses the
    median and MAD.

    If class attribute `supports_sparse` is `True`, the class will handle
    sparse data. Currently, all classes that do handle it rely on fallbacks to
    SKL metrics. These, however, do not support discrete data and missing
    values, and will fail silently.

    Class attribute `supports_similarity` indicates whether the class can also
    compute similarities.
    """
    supports_sparse = False
    supports_discrete = False
    supports_normalization = False
    supports_missing = True
    supports_similarity = False

    # Predefined here to silence pylint, which doesn't look into __new__
    normalize = False
    axis = 1
    impute = False

    def __new__(cls, e1=None, e2=None, axis=1, impute=False,
                callback=None, *, similarity=False, **kwargs):

        if similarity and not cls.supports_similarity:
            raise ValueError(f"{cls.__name__} does not compute similarity")

        self = super().__new__(cls)
        self.axis = axis
        self.impute = impute
        self.similarity = similarity
        self.callback = callback
        # Ugly, but needed to allow allow setting subclass-specific parameters
        # (such as normalize) when `e1` is not `None` and the `__new__` in the
        # subclass is skipped
        self.__dict__.update(**kwargs)
        if e1 is None:
            return self

        # Fallbacks for sparse data and numpy tables. Remove when subclasses
        # no longer use fallbacks for sparse data, and handling numpy tables
        # becomes obsolete (or handled elsewhere)
        if (not hasattr(e1, "domain")
                or hasattr(e1, "is_sparse") and e1.is_sparse()):
            fallback = getattr(self, "fallback", None)
            if fallback is not None:
                # pylint: disable=not-callable
                dist = fallback(e1, e2, axis, impute)
                if self.similarity:
                    assert fallback.metric == "cosine"
                    return 1 - dist
                return dist

        # Magic constructor
        model = self.fit(e1)
        return model(e1, e2)

    def fit(self, data):
        """
        Abstract method returning :obj:`DistanceModel` fit to the data

        Args:
            e1 (Orange.data.Table, Orange.data.Instance, np.ndarray):
                data for fitting the distance model

        Returns:
            model (DistanceModel)
        """
        raise NotImplementedError

    @staticmethod
    def check_no_discrete(n_vals):
        """
        Raise an exception if there are any discrete attributes.

        Args:
            n_vals (list of int): number of attributes values, 0 for continuous
        """
        if any(n_vals):
            raise ValueError("columns with discrete values are incommensurable")


class DistanceModel:
    """
    Base class for classes that compute distances between data rows or columns.
    Instances of these classes are not constructed directly but returned by
    the corresponding instances of :obj:`Distance`.

    Attributes:
        axis (int, readonly):
            axis over which the distances are computed, 1 (default) for
            rows, 0 for columns
        impute (bool):
            if `True` (default is `False`), nans in the computed distances
            are replaced with zeros, and infs with very large numbers
        callback (callable or None):
            callback function

    """
    def __init__(self, axis, impute=False, callback=None, *, similarity=False):
        self._axis = axis
        self.impute = impute
        self.callback = callback
        self.similarity = similarity

    @property
    def axis(self):
        return self._axis

    def __call__(self, e1, e2=None):
        """
        If e2 is omitted, calculate distances between all rows (axis=1) or
        columns (axis=2) of e1. If e2 is present, calculate distances between
        all pairs if rows from e1 and e2.

        This method converts the data into numpy arrays, calls the method
        `compute_data` and packs the result into `DistMatrix`. Subclasses are
        expected to define the `compute_data` and not the `__call__` method.

        Args:
            e1 (Orange.data.Table or Orange.data.Instance or numpy.ndarray):
                input data
            e2 (Orange.data.Table or Orange.data.Instance or numpy.ndarray):
                secondary data

        Returns:
            A distance matrix (Orange.misc.distmatrix.DistMatrix)
        """
        if self.axis == 0 and e2 is not None:
            # Backward compatibility fix
            if e2 is e1:
                e2 = None
            else:
                raise ValueError("Two tables cannot be compared by columns")

        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)
        with np.errstate(invalid="ignore"):  # nans are handled below
            dist = self.compute_distances(x1, x2)
            if self.impute and np.isnan(dist).any():
                dist = np.nan_to_num(dist)
            if isinstance(e1, (Table, RowInstance)):
                dist = DistMatrix(dist, e1, e2, self.axis)
            else:
                dist = DistMatrix(dist)
            return dist

    def compute_distances(self, x1, x2):
        """
        Abstract method for computation of distances between rows or columns of
        `x1`, or between rows of `x1` and `x2`. Do not call directly."""
        raise NotImplementedError


class FittedDistanceModel(DistanceModel):
    """
    Base class for models that store attribute-related data for normalization
    and imputation, and that treat discrete and continuous columns separately.

    Attributes:
        attributes (list of `Variable`): attributes on which the model was fit
        discrete (np.ndarray): bool array indicating discrete attributes
        continuous (np.ndarray): bool array indicating continuous attributes
        normalize (bool):
            if `True` (default is `False`) continuous columns are normalized
        callback (callable or None): callback function
    """
    def __init__(self, attributes, axis=1, impute=False, callback=None, **kwargs):
        super().__init__(axis, impute, callback, **kwargs)
        self.attributes = attributes
        self.discrete = None
        self.continuous = None
        self.normalize = False

    def __call__(self, e1, e2=None):
        if self.attributes is not None and (
                e1.domain.attributes != self.attributes
                or e2 is not None and e2.domain.attributes != self.attributes):
            raise ValueError("mismatching domains")
        return super().__call__(e1, e2)

    def continuous_columns(self, x1, x2, offset, scale):
        """
        Extract and scale continuous columns from data tables.
        If the second table is None, it defaults to the first table.

        Values are scaled if `self.normalize` is `True`.

        Args:
            x1 (np.ndarray): first table
            x2 (np.ndarray or None): second table
            offset (float): a constant (e.g. mean, median) subtracted from data
            scale: (float): divider (e.g. deviation)

        Returns:
            data1 (np.ndarray): scaled continuous columns from `x1`
            data2 (np.ndarray): scaled continuous columns from `x2` or `x1`
        """
        if self.continuous.all() and not self.normalize:
            data1, data2 = x1, x2
        else:
            data1 = x1[:, self.continuous]
            if x2 is not None:
                data2 = x2[:, self.continuous]
            if self.normalize:
                data1 = x1[:, self.continuous]
                data1 -= offset
                data1 /= scale
                if x2 is not None:
                    data2 = x2[:, self.continuous]
                    data2 -= offset
                    data2 /= scale
        if x2 is None:
            data2 = data1
        return data1, data2

    def discrete_columns(self, x1, x2):
        """
        Return discrete columns from the given tables.
        If the second table is None, it defaults to the first table.
        """
        if self.discrete.all():
            data1, data2 = x1, x1 if x2 is None else x2
        else:
            data1 = x1[:, self.discrete]
            data2 = data1 if x2 is None else x2[:, self.discrete]
        return data1, data2


class FittedDistance(Distance):
    """
    Base class for fitting models that store attribute-related data for
    normalization and imputation, and that treat discrete and continuous
    columns separately.

    The class implements a method `fit` that calls either `fit_columns`
    or `fit_rows` with the data and the number of values for discrete
    attributes. The provided method `fit_rows` calls methods
    `get_discrete_stats` and `get_continuous_stats` that can be implemented
    in derived classes.

    Class attribute `rows_model_type` contains the type of the model returned by
    `fit_rows`.
    """
    rows_model_type = None  #: Option[FittedDistanceModel]

    def fit(self, data):
        """
        Prepare the data on attributes, call `fit_cols` or `fit_rows` and
        return the resulting model.
        """
        x = _orange_to_numpy(data)
        if hasattr(data, "domain"):
            attributes = data.domain.attributes
            n_vals = np.fromiter(
                (len(attr.values) if attr.is_discrete else 0
                 for attr in attributes),
                dtype=np.int32, count=len(attributes))
        else:
            assert isinstance(x, np.ndarray)
            attributes = None
            n_vals = np.zeros(x.shape[1], dtype=np.int32)
        return [self.fit_cols, self.fit_rows][self.axis](attributes, x, n_vals)

    def fit_cols(self, attributes, x, n_vals):
        """
        Return DistanceModel for computation of distances between columns.
        Derived classes must define this method.

        Args:
            attributes (list of Orange.data.Variable): list of attributes
            x (np.ndarray): data
            n_vals (np.ndarray): number of attribute values, 0 for continuous
        """
        raise NotImplementedError

    def fit_rows(self, attributes, x, n_vals):
        """
        Return a DistanceModel for computation of distances between rows.

        The model type is `self.row_distance_model`. It stores the data for
        imputation of discrete and continuous values, and for normalization
        of continuous values. Typical examples are the Euclidean and Manhattan
        distance, for which the following data is stored:

        For continuous columns:

        - offsets[col] is the number subtracted from values in column `col`
        - scales[col] is the divisor for values in columns `col`
        - dist_missing2_cont[col]: the value used for distance between two
            missing values in column `col`

        For discrete values:

        - dist_missing_disc[col, value] is the distance added for the given
            `value` in the column `col` if the value for the other row is
            missing
        - dist_missing2_disc[col]: the distance between two missing values in
            column `col`
        """
        n_cols = len(n_vals)

        discrete = n_vals > 0
        n_bins = max(n_vals, default=0)
        n_discrete = sum(discrete)
        dist_missing_disc = np.zeros((n_discrete, n_bins), dtype=float)
        dist_missing2_disc = np.zeros(n_discrete, dtype=float)

        continuous = ~discrete
        n_continuous = sum(continuous)
        offsets = np.zeros(n_continuous, dtype=float)
        scales = np.empty(n_continuous, dtype=float)
        dist_missing2_cont = np.zeros(n_continuous, dtype=float)

        curr_disc = curr_cont = 0
        for col in range(n_cols):
            column = x[:, col]
            if np.isnan(column).all():
                continuous[col] = discrete[col] = False
            elif discrete[col]:
                discrete_stats = self.get_discrete_stats(column, n_bins)
                if discrete_stats is not None:
                    dist_missing_disc[curr_disc], \
                    dist_missing2_disc[curr_disc] = discrete_stats
                    curr_disc += 1
            else:
                continuous_stats = self.get_continuous_stats(column)
                if continuous_stats is not None:
                    offsets[curr_cont], scales[curr_cont],\
                    dist_missing2_cont[curr_cont] = continuous_stats
                    curr_cont += 1
                else:
                    continuous[col] = False
        # pylint: disable=not-callable
        return self.rows_model_type(
            attributes, impute, getattr(self, "normalize", False),
            continuous, discrete,
            offsets[:curr_cont], scales[:curr_cont],
            dist_missing2_cont[:curr_cont],
            dist_missing_disc, dist_missing2_disc,
            self.callback)

    @staticmethod
    def get_discrete_stats(column, n_bins):
        """
        Return tables used computing distance between missing discrete values.

        Args:
            column (np.ndarray): column data
            n_bins (int): maximal number of bins in the dataset

        Returns:
            dist_missing_disc (np.ndarray): `dist_missing_disc[value]` is
                1 - probability of `value`, which is used as the distance added
                for the given `value` in the column `col` if the value for the
                other row is missing
            dist_missing2_disc (float): the distance between two missing
                values in this columns
        """
        dist = util.bincount(column, minlength=n_bins)[0]
        dist /= max(1, sum(dist))
        return 1 - dist, 1 - np.sum(dist ** 2)

    def get_continuous_stats(self, column):
        """
        Compute statistics for imputation and normalization of continuous data.
        Derived classes must define this method.

        Args:
            column (np.ndarray): column data

        Returns:
            offset (float): the number subtracted from values in column
            scales (float): the divisor for values in column
            dist_missing2_cont (float): the value used for distance between two
                missing values in column
        """
        raise NotImplementedError


# Fallbacks for distances in sparse data
# To be removed as the corresponding functionality is implemented properly

class SklDistance:
    """
    Wrapper for functions sklearn's metrics. Used only as temporary fallbacks
    when `Euclidean`, `Manhattan`, `Cosine` or `Jaccard` are given sparse data
    or raw numpy arrays. These classes can't handle discrete or missing data
    and normalization. Do not use for wrapping new classes.
    """
    def __init__(self, metric):
        self.metric = metric

    def __call__(self, e1, e2=None, axis=1, impute=False):
        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)
        if axis == 0:
            x1 = x1.T
            if x2 is not None:
                x2 = x2.T
        dist = skl_metrics.pairwise.pairwise_distances(
            x1, x2, metric=self.metric)
        if impute and np.isnan(dist).any():
            dist = np.nan_to_num(dist)
        if isinstance(e1, (Table, RowInstance)):
            dist_matrix = DistMatrix(dist, e1, e2, axis)
        else:
            dist_matrix = DistMatrix(dist)
        return dist_matrix