File: _tree_scorers.pyx

package info (click to toggle)
orange3 3.40.0-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,912 kB
  • sloc: python: 162,745; ansic: 622; makefile: 322; sh: 93; cpp: 77
file content (521 lines) | stat: -rw-r--r-- 18,247 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
#cython: embedsignature=True
#cython: boundscheck=False
#cython: wraparound=False
#cython: infer_types=True
#cython: cdivision=True
#cython: language_level=3


import numpy as np
cimport numpy as np

from libc.math cimport log

cdef extern from "numpy/npy_math.h":
    bint npy_isnan(double x) nogil

cpdef enum:
    NULL_BRANCH = -1

def contingency(const double[:] x, int nx, const double[:] y, int ny):
    cdef:
        np.ndarray[np.uint32_t, ndim=2] cont = np.zeros((ny, nx), dtype=np.uint32)
        int n = len(x), yi, xi

    for i in range(n):
        if not npy_isnan(x[i]) and not npy_isnan(y[i]):
            yi, xi = int(y[i]), int(x[i])
            cont[yi, xi] += 1
    return cont

def find_threshold_entropy(const double[:] x, const double[:] y,
                           const np.intp_t[:] idx,
                           int n_classes, int min_leaf):
    """
    Find the threshold for continuous attribute values that maximizes
    information gain.

    Argument min_leaf sets the minimal number of data instances on each side
    of the threshold. If there is no threshold within that limits with positive
    information gain, the function returns (0, 0).

    Args:
        x: attribute values
        y: class values
        idx: arg-sorted indices of x (and y)
        n_classes: the number of classes
        min_leaf: the minimal number of instances on each side of the threshold

    Returns:
        (highest information gain, the corresponding optimal threshold)
    """
    cdef:
        unsigned int[:] distr = np.zeros(2 * n_classes, dtype=np.uint32)
        Py_ssize_t i, j
        double entro, class_entro, best_entro
        unsigned int p, curr_y
        unsigned int best_idx = 0
        unsigned int N = idx.shape[0]

    # Initial split (min_leaf on the left)
    if N <= min_leaf:
        return 0, 0
    with nogil:
        for i in range(min_leaf - 1):  # one will be added in the loop
            distr[n_classes + <int>y[idx[i]]] += 1
        for i in range(min_leaf - 1, N):
            distr[<int>y[idx[i]]] += 1

        # Compute class entropy
        class_entro = N * log(N)
        for j in range(n_classes):
            p = distr[j] + distr[j + n_classes]
            if p:
                class_entro -= p * log(p)
        best_entro = class_entro

        # Loop through
        for i in range(min_leaf - 1, N - min_leaf):
            curr_y = <int>y[idx[i]]
            distr[curr_y] -= 1
            distr[n_classes + curr_y] += 1
            if x[idx[i]] != x[idx[i + 1]]:
                entro = (i + 1) * log(i + 1) + (N - i - 1) * log(N - i - 1)
                for j in range(2 * n_classes):
                    if distr[j]:
                        entro -= distr[j] * log(distr[j])
                if entro < best_entro:
                    best_entro = entro
                    best_idx = i
    return (class_entro - best_entro) / N / log(2), x[idx[best_idx]]


def find_binarization_entropy(const double[:, :] cont,
                              const double[:] class_distr,
                              const double[:] val_distr, int min_leaf):
    """
    Find the split of discrete values into two groups that optimizes information
    gain.

    The split is returned as an int in which the lower bits give the group
    membership of the corresponding values; the first value of the attribute
    corresponding to bit zero and so forth.

    Argument min_leaf sets the minimal number of data instances in each group.
    If there is no split (within that limits) with positive information gain,
    the function returns (0, 0).

    The function works by traversing over the 2 ** (n - 1) possible states in
    the order of Gray encoding. With this, the function does not recompute the
    sum of distributions but just moves one distribution at a time to the left
    or to the right.

    Args:
        cont: contingency matrix
        class_distr: marginal class distribution (sum of cont over axis 1)
        val_distr: marginal attribute value distribution (sum of cont over axis 0)
        min_leaf: the minimal number of instances on each side of the threshold

    Returns:
        (highest information gain, the corresponding optimal mapping)
    """
    cdef:
        unsigned int n_classes = cont.shape[0]
        unsigned int n_values = cont.shape[1]
        double[:] distr = np.zeros(2 * n_classes)
        double[:] mfrom
        double[:] mto
        double left, right
        unsigned int i, change, to_right, allowed, m
        unsigned int best_mapping = 0, move = 0, mapping, previous
        double entro, class_entro, best_entro
        double N = 0

    with nogil:
        class_entro = 0
        for i in range(n_classes):
            distr[i + n_classes] = 0
            distr[i] = class_distr[i]
            if class_distr[i] > 0:
                N += class_distr[i]
                class_entro -= class_distr[i] * log(class_distr[i])
        class_entro += N * log(N)
        best_entro = class_entro
        left = N
        right = 0

        previous = 0
        # Gray code
        for m in range(1, 1 << (n_values - 1)):
            # What moves where
            mapping = m ^ (m >> 1)
            change = mapping ^ previous
            to_right = change & mapping
            for move in range(n_values):
                if change & 1:
                    break
                change = change >> 1
            previous = mapping

            if to_right:
                left -= val_distr[move]
                right += val_distr[move]
                mfrom = distr
                mto = distr[n_classes:]
            else:
                left += val_distr[move]
                right -= val_distr[move]
                mfrom = distr[n_classes:]
                mto = distr

            allowed = left >= min_leaf and right >= min_leaf
            # Move distribution to the other side and
            # compute entropy by the way, if the split is allowed
            entro = 0
            for i in range(n_classes):
                mfrom[i] -= cont[i, move]
                mto[i] += cont[i, move]
                if allowed:
                    if mfrom[i]:
                        entro -= mfrom[i] * log(mfrom[i])
                    if mto[i]:
                        entro -= mto[i] * log(mto[i])

            if allowed:
                entro += left * log(left) + right * log(right)
                if entro < best_entro:
                    best_entro = entro
                    best_mapping = mapping
    return (class_entro - best_entro) / N / log(2), best_mapping


def find_threshold_MSE(const double[:] x,
                       const double[:] y,
                       const np.intp_t[:] idx, int min_leaf):
    """
    Find the threshold for continuous attribute values that minimizes MSE.

    Argument min_leaf sets the minimal number of data instances on each side
    of the threshold. If there is no threshold within that limits that decreases
    the MSE with respect to the prior MSE, the function returns (0, 0).

    Args:
        x: attribute values
        y: target values
        idx: arg sorted indices of x (and y)
        min_leaf: the minimal number of instances on each side of the threshold

    Returns:
        (largest MSE decrease, the corresponding optimal threshold)
    """
    cdef:
        double sleft = 0, sum, inter, best_inter
        unsigned int i, best_idx = 0
        unsigned int N = idx.shape[0]

    # Initial split (min_leaf on the left)
    if N <= min_leaf:
        return 0, 0
    with nogil:
        sum = 0
        for i in range(min_leaf - 1):  # one will be added in the loop
            sum += y[idx[i]]
        sleft = sum
        for i in range(min_leaf - 1, N):
            sum += y[idx[i]]

        best_inter = (sum * sum) / N
        for i in range(min_leaf - 1, N - min_leaf):
            sleft += y[idx[i]]
            if x[idx[i]] == x[idx[i + 1]]:
                continue
            inter = sleft * sleft / (i + 1) + (sum - sleft) * (sum - sleft) / (N - i - 1)
            if inter > best_inter:
                best_inter = inter
                best_idx = i
    return (best_inter - (sum * sum) / N) / N, x[idx[best_idx]]


def find_binarization_MSE(const double[:] x,
                          const double[:] y, int n_values, int min_leaf):
    """
    Find the split of discrete values into two groups that minimizes the MSE.

    The split is returned as an int in which the lower bits give the group
    membership of the corresponding values; the first value of the attribute
    corresponding to bit zero and so forth.

    The score is decreased in proportion with the number of missing values in x.

    Argument min_leaf sets the minimal number of data instances in each group.
    If there is no split (within that limits) that decreases the average
    MSE with respect to the prior MSE, the function returns (0, 0).

    The function works by traversing over the 2 ** (n - 1) possible states in
    the order of Gray encoding. With this, the function does not recompute the
    sums but just moves one value at a time to the left or to the right.

    Args:
        x: attribute values
        y: target values
        n_values: the number of attribute values
        min_leaf: the minimal number of instances on each side of the threshold

    Returns:
        (largest MSE decrease, the corresponding optimal mapping)
    """
    cdef:
        double sleft, sum = 0, val
        unsigned int left
        unsigned int i, change, to_right, m
        unsigned int best_mapping = 0, move = 0, mapping, previous
        double inter, best_inter, start_inter
        unsigned int N

        np.int32_t[:] group_sizes = np.zeros(n_values, dtype=np.int32)
        double[:] group_sums = np.zeros(n_values)

    N = 0
    for i in range(x.shape[0]):
        val = x[i]
        if not npy_isnan(val):
            group_sizes[<int>val] += 1
            group_sums[<int>val] += y[i]
            sum += y[i]
            N += 1
    if N == 0:
        return 0, 0
    with nogil:
        left = N
        sleft = sum
        best_inter = start_inter = (sum * sum) / N

        previous = 0
        # Gray code
        for m in range(1, 1 << (n_values - 1)):
            # What moves where
            mapping = m ^ (m >> 1)
            change = mapping ^ previous
            to_right = change & mapping
            for move in range(n_values):
                if change & 1:
                    break
                change = change >> 1
            previous = mapping

            if to_right:
                left -= group_sizes[move]
                sleft -= group_sums[move]
            else:
                left += group_sizes[move]
                sleft += group_sums[move]

            if left >= min_leaf and (N - left) >= min_leaf:
                inter = sleft * sleft / left + (sum - sleft) * (sum - sleft) / (N - left)
                if inter > best_inter:
                    best_inter = inter
                    best_mapping = mapping
        # factor N / x.shape[0] is the punishment for missing values
        # return (best_inter - start_inter) / N * (N / x.shape[0]), best_mapping
    return (best_inter - start_inter) / x.shape[0], best_mapping


def compute_grouped_MSE(const double[:] x,
                        const double[:] y,
                        int n_values, int min_leaf):
    """
    Compute the MSE decrease of the given split into groups.

    Argument min_leaf sets the minimal number of data instances in each group.
    If there are less than two groups with such number of instances, the
    function returns (0, 0).

    The score is decreased in proportion with the number of missing values in x.

    Args:
        x: attribute values
        y: target values
        n_values: the number of attribute values
        min_leaf: the minimal number of instances on each side of the threshold

    Returns:
        MSE decrease
    """

    cdef:
        int i, n
        #: number of valid nodes (having at least `min_leaf` instances)
        int nvalid = 0
        double sum = 0, inter, tx

        np.int32_t[:] group_sizes = np.zeros(n_values, dtype=np.int32)
        double[:] group_sums = np.zeros(n_values)

    with nogil:
        for i in range(x.shape[0]):
            tx = x[i]
            if not npy_isnan(tx):
                group_sizes[<int>tx] += 1
                group_sums[<int>tx] += y[i]
        inter = 0
        n = 0
        for i in range(n_values):
            if group_sizes[i] < min_leaf:
                # We don't construct nodes with less than min_leaf instances
                # If there is only one non-null node, the split will yield a
                # score of 0
                continue
            inter += group_sums[i] * group_sums[i] / group_sizes[i]
            sum += group_sums[i]
            n += group_sizes[i]
            nvalid += 1
    if nvalid < 2:
        # NOTE: the `inter - sum * sum / n` below does not necessarily
        # cancel out
        return 0
    # factor n / x.shape[0] is the punishment for missing values
    #return (inter - sum * sum / n) / n * n / x.shape[0]
    return (inter - sum * sum / n) / x.shape[0]


def compute_predictions(const double[:, :] X,
                        const int[:] code,
                        const double[:, :] values,
                        const double[:] thresholds):
    """
    Return the values (distributions, means and variances) stored in the nodes
    to which the tree classify the rows in X.

    The tree is encoded by :obj:`Orange.tree.OrangeTreeMode._compile`.

    The result is a matrix of shape (X.shape[0], values.shape[1])

    Args:
        X: data for which the predictions are made
        code: encoded tree
        values: values corresponding to tree nodes
        thresholds: thresholds for numeric nodes

    Returns:
        a matrix of values
    """
    cdef:
        unsigned int node_ptr, i, j, val_idx
        signed int next_node_ptr, node_idx
        np.float64_t val
        double[: ,:] predictions = np.empty(
            (X.shape[0], values.shape[1]), dtype=np.float64)

    with nogil:
        for i in range(X.shape[0]):
            node_ptr = 0
            while code[node_ptr]:
                val = X[i, code[node_ptr + 2]]
                if npy_isnan(val):
                    break
                if code[node_ptr] == 3:
                    node_idx = code[node_ptr + 1]
                    val_idx = int(val > thresholds[node_idx])
                else:
                    val_idx = int(val)
                next_node_ptr = code[node_ptr + 3 + val_idx]
                if next_node_ptr == NULL_BRANCH:
                    break
                node_ptr = next_node_ptr
            node_idx = code[node_ptr + 1]
            for j in range(values.shape[1]):
                predictions[i, j] = values[node_idx, j]
    return np.asarray(predictions)


def compute_predictions_csr(X,
                            const int[:] code,
                            const double[:, :] values,
                            const double[:] thresholds):
    """
    Same as compute_predictions except for sparse data
    """
    cdef:
        unsigned int node_ptr, i, j, val_idx
        signed int next_node_ptr, node_idx
        np.float64_t val
        double[: ,:] predictions = np.empty(
            (X.shape[0], values.shape[1]), dtype=np.float64)

        const double[:] data = X.data
        const np.int32_t[:] indptr = X.indptr
        const np.int32_t[:] indices = X.indices
        int ind, attr, n_rows

    n_rows = X.shape[0]

    with nogil:
        for i in range(n_rows):
            node_ptr = 0
            while code[node_ptr]:
                attr = code[node_ptr + 2]
                ind = indptr[i]
                while ind < indptr[i + 1] and indices[ind] != attr:
                    ind += 1
                val = data[ind] if ind < indptr[i + 1] else 0
                if npy_isnan(val):
                    break
                if code[node_ptr] == 3:
                    node_idx = code[node_ptr + 1]
                    val_idx = int(val > thresholds[node_idx])
                else:
                    val_idx = int(val)
                next_node_ptr = code[node_ptr + 3 + val_idx]
                if next_node_ptr == NULL_BRANCH:
                    break
                node_ptr = next_node_ptr
            node_idx = code[node_ptr + 1]
            for j in range(values.shape[1]):
                predictions[i, j] = values[node_idx, j]
    return np.asarray(predictions)

def compute_predictions_csc(X,
                            const int[:] code,
                            const double[:, :] values,
                            const double[:] thresholds):
    """
    Same as compute_predictions except for sparse data
    """
    cdef:
        unsigned int node_ptr, i, j, val_idx
        signed int next_node_ptr, node_idx
        np.float64_t val
        double[: ,:] predictions = np.empty(
            (X.shape[0], values.shape[1]), dtype=np.float64)

        const double[:] data = X.data
        const np.int32_t[:] indptr = X.indptr
        const np.int32_t[:] indices = X.indices
        int ind, attr, n_rows

    n_rows = X.shape[0]

    with nogil:
        for i in range(n_rows):
            node_ptr = 0
            while code[node_ptr]:
                attr = code[node_ptr + 2]
                ind = indptr[attr]
                while ind < indptr[attr + 1] and indices[ind] != i:
                    ind += 1
                val = data[ind] if ind < indptr[attr + 1] else 0
                if npy_isnan(val):
                    break
                if code[node_ptr] == 3:
                    node_idx = code[node_ptr + 1]
                    val_idx = int(val > thresholds[node_idx])
                else:
                    val_idx = int(val)
                next_node_ptr = code[node_ptr + 3 + val_idx]
                if next_node_ptr == NULL_BRANCH:
                    break
                node_ptr = next_node_ptr
            node_idx = code[node_ptr + 1]
            for j in range(values.shape[1]):
                predictions[i, j] = values[node_idx, j]
    return np.asarray(predictions)