from libc.math cimport isnan
from libcpp.vector cimport vector

cimport numpy as cnp

import numpy as np

cnp.import_array()


ctypedef fused INT_DTYPE:
    cnp.int64_t
    cnp.int32_t

ctypedef fused Y_DTYPE:
    cnp.int64_t
    cnp.int32_t
    cnp.float64_t
    cnp.float32_t


def _fit_encoding_fast(
    INT_DTYPE[:, ::1] X_int,
    const Y_DTYPE[:] y,
    cnp.int64_t[::1] n_categories,
    double smooth,
    double y_mean,
):
    """Fit a target encoding on X_int and y.

    This implementation uses Eq 7 from [1] to compute the encoding.
    As stated in the paper, Eq 7 is the same as Eq 3.

    [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
         categorical attributes in classification and prediction problems"
    """
    cdef:
        cnp.int64_t sample_idx, feat_idx, cat_idx, n_cats
        INT_DTYPE X_int_tmp
        int n_samples = X_int.shape[0]
        int n_features = X_int.shape[1]
        double smooth_sum = smooth * y_mean
        cnp.int64_t max_n_cats = np.max(n_categories)
        double[::1] sums = np.empty(max_n_cats, dtype=np.float64)
        double[::1] counts = np.empty(max_n_cats, dtype=np.float64)
        list encodings = []
        double[::1] current_encoding
        # Gives access to encodings without gil
        vector[double*] encoding_vec

    encoding_vec.resize(n_features)
    for feat_idx in range(n_features):
        current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64)
        encoding_vec[feat_idx] = &current_encoding[0]
        encodings.append(np.asarray(current_encoding))
    with nogil:
        for feat_idx in range(n_features):
            n_cats = n_categories[feat_idx]

            for cat_idx in range(n_cats):
                sums[cat_idx] = smooth_sum
                counts[cat_idx] = smooth

            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]
                # -1 are unknown categories, which are not counted
                if X_int_tmp == -1:
                    continue
                sums[X_int_tmp] += y[sample_idx]
                counts[X_int_tmp] += 1.0

            for cat_idx in range(n_cats):
                if counts[cat_idx] == 0:
                    encoding_vec[feat_idx][cat_idx] = y_mean
                else:
                    encoding_vec[feat_idx][cat_idx] = sums[cat_idx] / counts[cat_idx]

    return encodings


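# A minimal usage sketch (hypothetical example values; within scikit-learn
# this private helper is called by TargetEncoder during fit, with X already
# ordinally encoded and -1 marking unknown categories):
#
#     X_int = np.array([[0], [1], [1], [0]], dtype=np.int64)
#     y = np.asarray([10.0, 20.0, 30.0, 50.0])
#     encodings = _fit_encoding_fast(
#         X_int, y, np.array([2], dtype=np.int64), smooth=1.0, y_mean=y.mean()
#     )
#     # -> a list with one float64 array of per-category encodings per feature

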
def _fit_encoding_fast_auto_smooth(
    INT_DTYPE[:, ::1] X_int,
    const Y_DTYPE[:] y,
    cnp.int64_t[::1] n_categories,
    double y_mean,
    double y_variance,
):
    """Fit a target encoding on X_int and y with auto smoothing.

    This implementation uses Eq 5 and 6 from [1].

    [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
         categorical attributes in classification and prediction problems"
    """
    cdef:
        cnp.int64_t sample_idx, feat_idx, cat_idx, n_cats
        INT_DTYPE X_int_tmp
        double diff
        int n_samples = X_int.shape[0]
        int n_features = X_int.shape[1]
        cnp.int64_t max_n_cats = np.max(n_categories)
        double[::1] means = np.empty(max_n_cats, dtype=np.float64)
        cnp.int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64)
        double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64)
        double lambda_
        list encodings = []
        double[::1] current_encoding
        # Gives access to encodings without gil
        vector[double*] encoding_vec

    encoding_vec.resize(n_features)
    for feat_idx in range(n_features):
        current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64)
        encoding_vec[feat_idx] = &current_encoding[0]
        encodings.append(np.asarray(current_encoding))
    # TODO: parallelize this with OpenMP prange. When n_features >= n_threads,
    # it is probably good to parallelize the outer loop. When n_features is too
    # small, it would probably be better to parallelize the nested loops on
    # n_samples and n_cats, but the code to handle thread-local temporary
    # variables might be significantly more complex.
    with nogil:
        for feat_idx in range(n_features):
            n_cats = n_categories[feat_idx]

            for cat_idx in range(n_cats):
                means[cat_idx] = 0.0
                counts[cat_idx] = 0
                sum_of_squared_diffs[cat_idx] = 0.0

            # first pass to compute the mean
            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]
                # -1 are unknown categories, which are not counted
                if X_int_tmp == -1:
                    continue
                counts[X_int_tmp] += 1
                means[X_int_tmp] += y[sample_idx]

            for cat_idx in range(n_cats):
                means[cat_idx] /= counts[cat_idx]

            # second pass to compute the sum of squared differences
            for sample_idx in range(n_samples):
                X_int_tmp = X_int[sample_idx, feat_idx]
                if X_int_tmp == -1:
                    continue
                diff = y[sample_idx] - means[X_int_tmp]
                sum_of_squared_diffs[X_int_tmp] += diff * diff

            for cat_idx in range(n_cats):
                lambda_ = (
                    y_variance * counts[cat_idx] /
                    (y_variance * counts[cat_idx] +
                     sum_of_squared_diffs[cat_idx] / counts[cat_idx])
                )
                if isnan(lambda_):
                    # A nan can happen when:
                    # 1. counts[cat_idx] == 0
                    # 2. y_variance == 0 and sum_of_squared_diffs[cat_idx] == 0
                    encoding_vec[feat_idx][cat_idx] = y_mean
                else:
                    encoding_vec[feat_idx][cat_idx] = (
                        lambda_ * means[cat_idx] + (1 - lambda_) * y_mean
                    )

    return encodings
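

# The auto-smoothing variant is called the same way, passing the global target
# mean and variance instead of a fixed smooth value (hypothetical example,
# reusing X_int and y from the sketch above):
#
#     _fit_encoding_fast_auto_smooth(
#         X_int, y, np.array([2], dtype=np.int64),
#         y_mean=y.mean(), y_variance=y.var(),
#     )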