File: utils.pyx

"""This module contains utility routines."""
# Author: Nicolas Hug

from cython.parallel import prange

from ...base import is_classifier
from .binning import _BinMapper
from .common cimport G_H_DTYPE_C
from .common cimport Y_DTYPE_C


def get_equivalent_estimator(estimator, lib='lightgbm', n_classes=None):
    """Return an unfitted estimator from another lib with matching hyperparams.

    This utility function takes care of renaming the sklearn parameters into
    their LightGBM, XGBoost or CatBoost equivalent parameters.

    # unmapped XGB parameters:
    # - min_samples_leaf
    # - min_data_in_bin
    # - min_split_gain (there is min_split_loss though?)

    # unmapped CatBoost parameters:
    # - max_leaves
    # - min_*

    See the usage sketch after this function for a typical call.
    """

    if lib not in ('lightgbm', 'xgboost', 'catboost'):
        raise ValueError('accepted libs are lightgbm, xgboost, and catboost. '
                         'got {}'.format(lib))

    sklearn_params = estimator.get_params()

    if sklearn_params['loss'] == 'auto':
        raise ValueError('auto loss is not accepted. We need to know if '
                         'the problem is binary or multiclass classification.')
    if sklearn_params['early_stopping']:
        raise NotImplementedError('Early stopping should be deactivated.')

    lightgbm_loss_mapping = {
        'squared_error': 'regression_l2',
        'absolute_error': 'regression_l1',
        'log_loss': 'binary' if n_classes == 2 else 'multiclass',
        'gamma': 'gamma',
        'poisson': 'poisson',
    }

    lightgbm_params = {
        'objective': lightgbm_loss_mapping[sklearn_params['loss']],
        'learning_rate': sklearn_params['learning_rate'],
        'n_estimators': sklearn_params['max_iter'],
        'num_leaves': sklearn_params['max_leaf_nodes'],
        'max_depth': sklearn_params['max_depth'],
        'min_child_samples': sklearn_params['min_samples_leaf'],
        'reg_lambda': sklearn_params['l2_regularization'],
        'max_bin': sklearn_params['max_bins'],
        'min_data_in_bin': 1,
        'min_child_weight': 1e-3,  # alias for 'min_sum_hessian_in_leaf'
        'min_sum_hessian_in_leaf': 1e-3,
        'min_split_gain': 0,
        'verbosity': 10 if sklearn_params['verbose'] else -10,
        'boost_from_average': True,
        'enable_bundle': False,  # also makes feature order consistent
        'subsample_for_bin': _BinMapper().subsample,
        'poisson_max_delta_step': 1e-12,
    }

    if sklearn_params['loss'] == 'log_loss' and n_classes > 2:
        # LightGBM multiplies hessians by 2 in multiclass loss.
        lightgbm_params['min_sum_hessian_in_leaf'] *= 2
        # LightGBM 3.0 introduced a different scaling of the hessian for the
        # multiclass case. It is equivalent to scaling the learning rate.
        # See https://github.com/microsoft/LightGBM/pull/3256.
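        # For example, with n_classes=3 the learning rate below is multiplied
        # by 3 / 2 = 1.5 so that both libraries take comparably sized steps.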
        if n_classes is not None:
            lightgbm_params['learning_rate'] *= n_classes / (n_classes - 1)

    # XGB
    xgboost_loss_mapping = {
        'squared_error': 'reg:linear',
        'absolute_error': 'LEAST_ABSOLUTE_DEV_NOT_SUPPORTED',
        'log_loss': 'reg:logistic' if n_classes == 2 else 'multi:softmax',
        'gamma': 'reg:gamma',
        'poisson': 'count:poisson',
    }

    xgboost_params = {
        'tree_method': 'hist',
        'grow_policy': 'lossguide',  # so that we can set max_leaves
        'objective': xgboost_loss_mapping[sklearn_params['loss']],
        'learning_rate': sklearn_params['learning_rate'],
        'n_estimators': sklearn_params['max_iter'],
        'max_leaves': sklearn_params['max_leaf_nodes'],
        'max_depth': sklearn_params['max_depth'] or 0,
        'lambda': sklearn_params['l2_regularization'],
        'max_bin': sklearn_params['max_bins'],
        'min_child_weight': 1e-3,
        'verbosity': 2 if sklearn_params['verbose'] else 0,
        'silent': sklearn_params['verbose'] == 0,
        'n_jobs': -1,
    }

    # Catboost
    catboost_loss_mapping = {
        'squared_error': 'RMSE',
        # CatBoost does not support MAE when leaf_estimation_method is Newton
        'absolute_error': 'LEAST_ABSOLUTE_DEV_NOT_SUPPORTED',
        'log_loss': 'Logloss' if n_classes == 2 else 'MultiClass',
        'gamma': None,
        'poisson': 'Poisson',
    }

    catboost_params = {
        'loss_function': catboost_loss_mapping[sklearn_params['loss']],
        'learning_rate': sklearn_params['learning_rate'],
        'iterations': sklearn_params['max_iter'],
        'depth': sklearn_params['max_depth'],
        'reg_lambda': sklearn_params['l2_regularization'],
        'max_bin': sklearn_params['max_bins'],
        'feature_border_type': 'Median',
        'leaf_estimation_method': 'Newton',
        'verbose': bool(sklearn_params['verbose']),
    }

    if lib == 'lightgbm':
        from lightgbm import LGBMRegressor
        from lightgbm import LGBMClassifier
        if is_classifier(estimator):
            return LGBMClassifier(**lightgbm_params)
        else:
            return LGBMRegressor(**lightgbm_params)

    elif lib == 'xgboost':
        from xgboost import XGBRegressor
        from xgboost import XGBClassifier
        if is_classifier(estimator):
            return XGBClassifier(**xgboost_params)
        else:
            return XGBRegressor(**xgboost_params)

    else:
        from catboost import CatBoostRegressor
        from catboost import CatBoostClassifier
        if is_classifier(estimator):
            return CatBoostClassifier(**catboost_params)
        else:
            return CatBoostRegressor(**catboost_params)
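
# ---------------------------------------------------------------------------
# Hedged usage sketch (illustration only; not part of the original module).
# It shows a typical call to get_equivalent_estimator: build an unfitted
# scikit-learn histogram-GBDT estimator with an explicit loss and early
# stopping disabled, then request the LightGBM counterpart. The helper name
# _example_equivalent_estimator and the hyperparameter values below are
# assumptions chosen for illustration; lightgbm must be installed for the
# returned estimator to be usable.
def _example_equivalent_estimator():
    from sklearn.ensemble import HistGradientBoostingClassifier

    sk_est = HistGradientBoostingClassifier(
        loss='log_loss',
        max_iter=50,
        max_leaf_nodes=31,
        early_stopping=False,  # early stopping is rejected by the util above
    )
    # Returns an unfitted lightgbm.LGBMClassifier with matching hyperparameters.
    return get_equivalent_estimator(sk_est, lib='lightgbm', n_classes=2)
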


def sum_parallel(G_H_DTYPE_C [:] array, int n_threads):
    """Return the sum of ``array``, computed with an OpenMP parallel loop."""

    cdef:
        Y_DTYPE_C out = 0.
        int i = 0

    for i in prange(array.shape[0], schedule='static', nogil=True,
                    num_threads=n_threads):
        out += array[i]

    return out
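

# ---------------------------------------------------------------------------
# Hedged usage sketch (illustration only; not part of the original module).
# sum_parallel reduces a float32 gradient/hessian array with an OpenMP prange,
# accumulating into a float64 to limit rounding error. The helper name and the
# array below are assumptions chosen for illustration; the dtype must match
# G_H_DTYPE_C (float32 here) for the typed memoryview argument to accept it.
def _example_sum_parallel():
    import numpy as np

    gradients = np.ones(1000, dtype=np.float32)
    # Expected result: 1000.0, accumulated across 4 OpenMP threads.
    return sum_parallel(gradients, n_threads=4)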