File: _loss.pyx

# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False
# cython: language_level=3

# Author: Nicolas Hug

cimport cython
from cython.parallel import prange
import numpy as np
cimport numpy as np

from libc.math cimport exp, log

from .common cimport Y_DTYPE_C
from .common cimport G_H_DTYPE_C

np.import_array()


def _update_gradients_least_squares(
        G_H_DTYPE_C [::1] gradients,  # OUT
        const Y_DTYPE_C [::1] y_true,  # IN
        const Y_DTYPE_C [::1] raw_predictions):  # IN

    cdef:
        int n_samples
        int i

    n_samples = raw_predictions.shape[0]
    for i in prange(n_samples, schedule='static', nogil=True):
        # Note: a more correct expression is 2 * (raw_predictions - y_true)
        # but since we use 1 for the constant hessian value (and not 2), this
        # is strictly equivalent for the leaf values.
        gradients[i] = raw_predictions[i] - y_true[i]
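
# For intuition: leaf values are (up to regularization) the Newton step
# -sum(gradients) / sum(hessians), so halving both the gradient and the
# constant hessian leaves the step unchanged. A NumPy sketch:
#
#   import numpy as np
#   raw = np.array([0.5, 2.0]); y = np.array([1.0, 1.0])
#   -np.sum(raw - y) / 2.0        # -0.25, with hessian = 1 per sample
#   -np.sum(2 * (raw - y)) / 4.0  # -0.25, with hessian = 2 per sample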


def _update_gradients_hessians_least_squares(
        G_H_DTYPE_C [::1] gradients,  # OUT
        G_H_DTYPE_C [::1] hessians,  # OUT
        const Y_DTYPE_C [::1] y_true,  # IN
        const Y_DTYPE_C [::1] raw_predictions,  # IN
        const Y_DTYPE_C [::1] sample_weight):  # IN

    cdef:
        int n_samples
        int i

    n_samples = raw_predictions.shape[0]
    for i in prange(n_samples, schedule='static', nogil=True):
        # Note: a more correct expression is
        # 2 * (raw_predictions - y_true) * sample_weight, but since we use 1
        # for the constant hessian value (and not 2), this is strictly
        # equivalent for the leaf values.
        gradients[i] = (raw_predictions[i] - y_true[i]) * sample_weight[i]
        hessians[i] = sample_weight[i]


def _update_gradients_hessians_least_absolute_deviation(
        G_H_DTYPE_C [::1] gradients,  # OUT
        G_H_DTYPE_C [::1] hessians,  # OUT
        const Y_DTYPE_C [::1] y_true,  # IN
        const Y_DTYPE_C [::1] raw_predictions,  # IN
        const Y_DTYPE_C [::1] sample_weight):  # IN

    cdef:
        int n_samples
        int i

    n_samples = raw_predictions.shape[0]
    for i in prange(n_samples, schedule='static', nogil=True):
        # gradient = sign(raw_prediction - y_true) * sample_weight
        gradients[i] = sample_weight[i] * (2 *
                        (y_true[i] - raw_predictions[i] < 0) - 1)
        hessians[i] = sample_weight[i]


def _update_gradients_least_absolute_deviation(
        G_H_DTYPE_C [::1] gradients,  # OUT
        const Y_DTYPE_C [::1] y_true,  # IN
        const Y_DTYPE_C [::1] raw_predictions):  # IN

    cdef:
        int n_samples
        int i

    n_samples = raw_predictions.shape[0]
    for i in prange(n_samples, schedule='static', nogil=True):
        # gradient = sign(raw_prediction - y_true)
        gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1
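
# For intuition: 2 * (y_true - raw < 0) - 1 is a branchless sign computation:
# the boolean comparison is mapped to {+1, -1}, matching
# sign(raw_prediction - y_true), with ties (raw == y_true) yielding -1.
# A NumPy sketch:
#
#   import numpy as np
#   raw = np.array([2.0, 0.0, 1.0]); y = np.array([1.0, 1.0, 1.0])
#   2 * (y - raw < 0) - 1  # array([ 1, -1, -1])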


def _update_gradients_hessians_poisson(
        G_H_DTYPE_C [::1] gradients,  # OUT
        G_H_DTYPE_C [::1] hessians,  # OUT
        const Y_DTYPE_C [::1] y_true,  # IN
        const Y_DTYPE_C [::1] raw_predictions,  # IN
        const Y_DTYPE_C [::1] sample_weight):  # IN

    cdef:
        int n_samples
        int i
        Y_DTYPE_C y_pred

    n_samples = raw_predictions.shape[0]
    if sample_weight is None:
        for i in prange(n_samples, schedule='static', nogil=True):
            # Note: We use only half of the deviance loss. Therefore, there is
            # no factor of 2.
            y_pred = exp(raw_predictions[i])
            gradients[i] = (y_pred - y_true[i])
            hessians[i] = y_pred
    else:
        for i in prange(n_samples, schedule='static', nogil=True):
            # Note: We use only half of the deviance loss. Therefore, there is
            # no factor of 2.
            y_pred = exp(raw_predictions[i])
            gradients[i] = (y_pred - y_true[i]) * sample_weight[i]
            hessians[i] = y_pred * sample_weight[i]
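
# For intuition: with the log link, the half Poisson deviance of one sample
# is exp(raw) - y_true * raw plus terms that do not depend on raw, so its
# first derivative is exp(raw) - y_true and its second is exp(raw).
# A finite-difference check in NumPy:
#
#   import numpy as np
#   raw, y, eps = 0.3, 2.0, 1e-6
#   loss = lambda r: np.exp(r) - y * r
#   (loss(raw + eps) - loss(raw - eps)) / (2 * eps)  # ~ exp(0.3) - 2.0
#   np.exp(raw) - y                                  # same value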


def _update_gradients_hessians_binary_crossentropy(
        G_H_DTYPE_C [::1] gradients,  # OUT
        G_H_DTYPE_C [::1] hessians,  # OUT
        const Y_DTYPE_C [::1] y_true,  # IN
        const Y_DTYPE_C [::1] raw_predictions,  # IN
        const Y_DTYPE_C [::1] sample_weight):  # IN
    cdef:
        int n_samples
        Y_DTYPE_C p_i  # proba that ith sample belongs to positive class
        int i

    n_samples = raw_predictions.shape[0]
    if sample_weight is None:
        for i in prange(n_samples, schedule='static', nogil=True):
            p_i = _cexpit(raw_predictions[i])
            gradients[i] = p_i - y_true[i]
            hessians[i] = p_i * (1. - p_i)
    else:
        for i in prange(n_samples, schedule='static', nogil=True):
            p_i = _cexpit(raw_predictions[i])
            gradients[i] = (p_i - y_true[i]) * sample_weight[i]
            hessians[i] = p_i * (1. - p_i) * sample_weight[i]
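
# For intuition: with p = expit(raw), the per-sample binary cross-entropy is
# log(1 + exp(raw)) - y_true * raw, whose first derivative is p - y_true and
# whose second derivative is p * (1 - p). A finite-difference check in NumPy:
#
#   import numpy as np
#   raw, y, eps = 0.7, 1.0, 1e-6
#   loss = lambda r: np.log(1 + np.exp(r)) - y * r
#   (loss(raw + eps) - loss(raw - eps)) / (2 * eps)  # ~ -0.3318
#   1.0 / (1.0 + np.exp(-raw)) - y                   # same value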


def _update_gradients_hessians_categorical_crossentropy(
        G_H_DTYPE_C [:, ::1] gradients,  # OUT
        G_H_DTYPE_C [:, ::1] hessians,  # OUT
        const Y_DTYPE_C [::1] y_true,  # IN
        const Y_DTYPE_C [:, ::1] raw_predictions,  # IN
        const Y_DTYPE_C [::1] sample_weight):  # IN
    cdef:
        int prediction_dim = raw_predictions.shape[0]
        int n_samples = raw_predictions.shape[1]
        int k  # class index
        int i  # sample index
        Y_DTYPE_C sw
        # p[i, k] is the probability that class(ith sample) == k.
        # It's the softmax of the raw predictions
        Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim))
        Y_DTYPE_C p_i_k

    if sample_weight is None:
        for i in prange(n_samples, schedule='static', nogil=True):
            # first compute softmaxes of sample i for each class
            for k in range(prediction_dim):
                p[i, k] = raw_predictions[k, i]  # prepare softmax
            _compute_softmax(p, i)
            # then update gradients and hessians
            for k in range(prediction_dim):
                p_i_k = p[i, k]
                gradients[k, i] = p_i_k - (y_true[i] == k)
                hessians[k, i] = p_i_k * (1. - p_i_k)
    else:
        for i in prange(n_samples, schedule='static', nogil=True):
            # first compute softmaxes of sample i for each class
            for k in range(prediction_dim):
                p[i, k] = raw_predictions[k, i]  # prepare softmax
            _compute_softmax(p, i)
            # then update gradients and hessians
            sw = sample_weight[i]
            for k in range(prediction_dim):
                p_i_k = p[i, k]
                gradients[k, i] = (p_i_k - (y_true[i] == k)) * sw
                hessians[k, i] = (p_i_k * (1. - p_i_k)) * sw
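
# For intuition: for the cross-entropy -log(softmax(raw)[y_true]), the
# gradient with respect to raw[k] is p[k] - (y_true == k), and the diagonal
# hessian term used above is p[k] * (1 - p[k]). A NumPy sketch for one
# sample with 3 classes:
#
#   import numpy as np
#   raw = np.array([1.0, 0.0, -1.0]); y = 0
#   p = np.exp(raw - raw.max()); p = p / p.sum()
#   p - (np.arange(3) == y)  # gradients p[k] - 1{y == k}
#   p * (1 - p)              # diagonal hessian terms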


cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil:
    """Compute softmaxes of values in p[i, :]."""
    # i needs to be passed (and stays constant) because otherwise Cython does
    # not generate optimal code

    cdef:
        Y_DTYPE_C max_value = p[i, 0]
        Y_DTYPE_C sum_exps = 0.
        unsigned int k
        unsigned int prediction_dim = p.shape[1]

    # Compute max value of array for numerical stability
    for k in range(1, prediction_dim):
        if max_value < p[i, k]:
            max_value = p[i, k]

    for k in range(prediction_dim):
        p[i, k] = exp(p[i, k] - max_value)
        sum_exps += p[i, k]

    for k in range(prediction_dim):
        p[i, k] /= sum_exps

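# For intuition: subtracting the row maximum before exponentiating does not
# change the softmax (the common factor exp(-max) cancels in the ratio) but
# prevents exp() from overflowing. A NumPy sketch:
#
#   import numpy as np
#   x = np.array([1000.0, 1001.0])
#   np.exp(x) / np.exp(x).sum()           # [nan, nan]: exp overflows to inf
#   e = np.exp(x - x.max()); e / e.sum()  # [0.2689..., 0.7310...]: stable
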

cdef inline Y_DTYPE_C _cexpit(const Y_DTYPE_C x) nogil:
    """Custom expit (logistic sigmoid function)"""
    return 1. / (1. + exp(-x))
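
# For reference: this is the standard logistic sigmoid, as in
# scipy.special.expit. A plain cdef nogil function like _cexpit can be
# called inside the prange loops above, which a Python-level function
# could not. A sketch:
#
#   from scipy.special import expit
#   expit(0.0)  # 0.5, same as 1. / (1. + exp(-0.))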