1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
|
# Author: Nicolas Hug
from cython.parallel import prange
from libc.math cimport isnan
from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C
def _map_to_bins(const X_DTYPE_C [:, :] data,
                 list binning_thresholds,
                 const unsigned char[::1] is_categorical,
                 const unsigned char missing_values_bin_idx,
                 int n_threads,
                 X_BINNED_DTYPE_C [::1, :] binned):
    """Map raw feature values to integer bin indices, one column at a time.

    Within a feature, a non-missing value x lands in bin i iff

        thresholds[i - 1] < x <= thresholds[i]

    The result is written into ``binned``; nothing is returned.

    Parameters
    ----------
    data : ndarray, shape (n_samples, n_features)
        The raw values to bin.
    binning_thresholds : list of arrays
        One array per feature holding the increasing numeric values that
        delimit that feature's bins.
    is_categorical : ndarray of unsigned char of shape (n_features,)
        Non-zero entries flag categorical features.
    missing_values_bin_idx : unsigned char
        Bin index written for missing entries, i.e. NaN values and, for
        categorical features, negative values.
    n_threads : int
        Number of OpenMP threads to use.
    binned : ndarray, shape (n_samples, n_features)
        Output array, filled in place. Must be Fortran-ordered so that each
        feature column is contiguous.
    """
    cdef int col

    # Features are handled sequentially; the parallelism lives inside the
    # per-column helper, which spreads the samples over the threads.
    for col in range(data.shape[1]):
        _map_col_to_bins(
            data[:, col],
            binning_thresholds[col],
            is_categorical[col],
            missing_values_bin_idx,
            n_threads,
            binned[:, col],
        )
cdef void _map_col_to_bins(
    const X_DTYPE_C [:] data,
    const X_DTYPE_C [:] binning_thresholds,
    const unsigned char is_categorical,
    const unsigned char missing_values_bin_idx,
    int n_threads,
    X_BINNED_DTYPE_C [:] binned
):
    """Bin a single feature column, writing the bin indices into ``binned``.

    Missing entries (NaN, or negative values when the feature is
    categorical) receive ``missing_values_bin_idx``. Every other value is
    located by binary search and receives the index of the first threshold
    that is greater than or equal to it; a value larger than every threshold
    receives ``binning_thresholds.shape[0]``.

    Samples are distributed over ``n_threads`` OpenMP threads with a static
    schedule, and the GIL is released for the duration of the loop.
    """
    cdef:
        int sample_idx
        int lo
        int hi
        int mid

    for sample_idx in prange(data.shape[0], schedule='static', nogil=True,
                             num_threads=n_threads):
        if (
            isnan(data[sample_idx]) or
            # LightGBM convention: a negative value in a categorical
            # feature is treated as a missing value.
            (is_categorical and data[sample_idx] < 0)
        ):
            binned[sample_idx] = missing_values_bin_idx
        else:
            # Known value: binary search over the half-open range [lo, hi)
            # for the leftmost threshold that is >= the value.
            lo = 0
            hi = binning_thresholds.shape[0]
            while lo < hi:
                # Same as (hi + lo - 1) // 2, written so the intermediate
                # sum cannot overflow.
                mid = lo + (hi - lo - 1) // 2
                if data[sample_idx] <= binning_thresholds[mid]:
                    hi = mid
                else:
                    lo = mid + 1

            binned[sample_idx] = lo
|