File: _target_encoder_fast.pyx

Package: scikit-learn 1.4.2+dfsg-8 (Debian)
from libc.math cimport isnan
from libcpp.vector cimport vector

cimport numpy as cnp
import numpy as np

# Initialize the NumPy C API; required before using cnp arrays below.
cnp.import_array()

ctypedef fused INT_DTYPE:
    cnp.int64_t
    cnp.int32_t

ctypedef fused Y_DTYPE:
    cnp.int64_t
    cnp.int32_t
    cnp.float64_t
    cnp.float32_t


def _fit_encoding_fast(
    INT_DTYPE[:, ::1] X_int,
    const Y_DTYPE[:] y,
    cnp.int64_t[::1] n_categories,
    double smooth,
    double y_mean,
):
    """Fit a target encoding on X_int and y.

    This implementation uses Eq 7 from [1] to compute the encoding.
    As stated in the paper, Eq 7 is the same as Eq 3.

    [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
         categorical attributes in classification and prediction problems"
    """
    cdef:
        cnp.int64_t sample_idx, feat_idx, cat_idx, n_cats
        INT_DTYPE X_int_tmp
        int n_samples = X_int.shape[0]
        int n_features = X_int.shape[1]
        double smooth_sum = smooth * y_mean
        cnp.int64_t max_n_cats = np.max(n_categories)
        double[::1] sums = np.empty(max_n_cats, dtype=np.float64)
        double[::1] counts = np.empty(max_n_cats, dtype=np.float64)
        list encodings = []
        double[::1] current_encoding
        # Gives access to encodings without gil
        vector[double*] encoding_vec

    encoding_vec.resize(n_features)
    for feat_idx in range(n_features):
        current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64)
        encoding_vec[feat_idx] = &current_encoding[0]
        encodings.append(np.asarray(current_encoding))

    with nogil:
        for feat_idx in range(n_features):
            n_cats = n_categories[feat_idx]

            for cat_idx in range(n_cats):
                sums[cat_idx] = smooth_sum
                counts[cat_idx] = smooth

            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]
                # A value of -1 marks an unknown category; skip it.
                if X_int_tmp == -1:
                    continue
                sums[X_int_tmp] += y[sample_idx]
                counts[X_int_tmp] += 1.0

            for cat_idx in range(n_cats):
                if counts[cat_idx] == 0:
                    # Only possible when smooth == 0 and the category received
                    # no samples; fall back to the global target mean.
                    encoding_vec[feat_idx][cat_idx] = y_mean
                else:
                    encoding_vec[feat_idx][cat_idx] = sums[cat_idx] / counts[cat_idx]

    return encodings
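
# A minimal usage sketch (illustrative; these private helpers are normally
# called by sklearn.preprocessing.TargetEncoder, which prepares the inputs):
#
#     import numpy as np
#     X_int = np.array([[0, 1], [1, 0], [1, 1], [-1, 0]], dtype=np.int64)
#     y = np.array([1.0, 0.0, 1.0, 0.0], dtype=np.float64)
#     n_categories = np.array([2, 2], dtype=np.int64)
#     encodings = _fit_encoding_fast(
#         X_int, y, n_categories, smooth=30.0, y_mean=float(y.mean())
#     )
#
# `encodings` is a list with one float64 array per feature, holding the
# encoded value of each category.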


def _fit_encoding_fast_auto_smooth(
    INT_DTYPE[:, ::1] X_int,
    const Y_DTYPE[:] y,
    cnp.int64_t[::1] n_categories,
    double y_mean,
    double y_variance,
):
    """Fit a target encoding on X_int and y with auto smoothing.

    This implementation uses Eq 5 and 6 from [1].

    [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
         categorical attributes in classification and prediction problems"
    """
    cdef:
        cnp.int64_t sample_idx, feat_idx, cat_idx, n_cats
        INT_DTYPE X_int_tmp
        double diff
        int n_samples = X_int.shape[0]
        int n_features = X_int.shape[1]
        cnp.int64_t max_n_cats = np.max(n_categories)
        double[::1] means = np.empty(max_n_cats, dtype=np.float64)
        cnp.int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64)
        double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64)
        double lambda_
        list encodings = []
        double[::1] current_encoding
        # Gives access to encodings without gil
        vector[double*] encoding_vec

    encoding_vec.resize(n_features)
    for feat_idx in range(n_features):
        current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64)
        encoding_vec[feat_idx] = &current_encoding[0]
        encodings.append(np.asarray(current_encoding))

    # TODO: parallelize this with OpenMP prange. When n_features >= n_threads,
    # it is probably best to parallelize the outer loop. When n_features is too
    # small, it would probably be better to parallelize the nested loops over
    # n_samples and n_cats, but the code to handle thread-local temporary
    # variables might be significantly more complex.
    with nogil:
        for feat_idx in range(n_features):
            n_cats = n_categories[feat_idx]

            for cat_idx in range(n_cats):
                means[cat_idx] = 0.0
                counts[cat_idx] = 0
                sum_of_squared_diffs[cat_idx] = 0.0

            # first pass to compute the mean
            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]

                # A value of -1 marks an unknown category; skip it.
                if X_int_tmp == -1:
                    continue
                counts[X_int_tmp] += 1
                means[X_int_tmp] += y[sample_idx]

            for cat_idx in range(n_cats):
                # A category with zero count yields a 0/0 division and a NaN
                # mean; this is intentional and caught by the isnan check on
                # lambda_ below.
                means[cat_idx] /= counts[cat_idx]

            # second pass to compute the sum of squared differences
            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]
                if X_int_tmp == -1:
                    continue
                diff = y[sample_idx] - means[X_int_tmp]
                sum_of_squared_diffs[X_int_tmp] += diff * diff

            for cat_idx in range(n_cats):
                lambda_ = (
                    y_variance * counts[cat_idx] /
                    (y_variance * counts[cat_idx] + sum_of_squared_diffs[cat_idx] /
                     counts[cat_idx])
                )
                if isnan(lambda_):
                    # A nan can happen when:
                    # 1. counts[cat_idx] == 0
                    # 2. y_variance == 0 and sum_of_squared_diffs[cat_idx] == 0
                    encoding_vec[feat_idx][cat_idx] = y_mean
                else:
                    encoding_vec[feat_idx][cat_idx] = (
                        lambda_ * means[cat_idx] + (1 - lambda_) * y_mean
                    )

    return encodings
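
# A minimal usage sketch for the auto-smoothing variant (illustrative;
# TargetEncoder derives y_mean and y_variance from the training targets):
#
#     import numpy as np
#     X_int = np.array([[0], [0], [1], [1]], dtype=np.int64)
#     y = np.array([0.0, 1.0, 1.0, 1.0], dtype=np.float64)
#     n_categories = np.array([2], dtype=np.int64)
#     encodings = _fit_encoding_fast_auto_smooth(
#         X_int, y, n_categories,
#         y_mean=float(y.mean()), y_variance=float(y.var()),
#     )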