File: utils.py

Package: scikit-learn 1.7.2+dfsg-3

"""This module contains utility routines."""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

from ...base import is_classifier
from .binning import _BinMapper


def get_equivalent_estimator(estimator, lib="lightgbm", n_classes=None):
    """Return an unfitted estimator from another lib with matching hyperparams.

    This utility function takes care of renaming the sklearn parameters to
    their LightGBM, XGBoost, or CatBoost equivalents.

    Notes
    -----
    Unmapped XGBoost parameters:

    - min_samples_leaf
    - min_data_in_bin
    - min_split_gain (XGBoost's min_split_loss may be the closest equivalent)

    Unmapped CatBoost parameters:

    - max_leaves
    - min_*
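
    Examples
    --------
    A minimal usage sketch (assumes lightgbm is installed; note that early
    stopping must be disabled for this function to accept the estimator)::

        from sklearn.ensemble import HistGradientBoostingRegressor

        est = HistGradientBoostingRegressor(max_iter=10, early_stopping=False)
        lgbm_est = get_equivalent_estimator(est, lib="lightgbm")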
    """

    if lib not in ("lightgbm", "xgboost", "catboost"):
        raise ValueError(
            "accepted libs are lightgbm, xgboost, and catboost. Got {}.".format(lib)
        )

    sklearn_params = estimator.get_params()

    if sklearn_params["loss"] == "auto":
        raise ValueError(
            "auto loss is not accepted. We need to know if "
            "the problem is binary or multiclass classification."
        )
    if sklearn_params["early_stopping"]:
        raise NotImplementedError("Early stopping should be deactivated.")

    lightgbm_loss_mapping = {
        "squared_error": "regression_l2",
        "absolute_error": "regression_l1",
        "log_loss": "binary" if n_classes == 2 else "multiclass",
        "gamma": "gamma",
        "poisson": "poisson",
    }

    lightgbm_params = {
        "objective": lightgbm_loss_mapping[sklearn_params["loss"]],
        "learning_rate": sklearn_params["learning_rate"],
        "n_estimators": sklearn_params["max_iter"],
        "num_leaves": sklearn_params["max_leaf_nodes"],
        "max_depth": sklearn_params["max_depth"],
        "min_data_in_leaf": sklearn_params["min_samples_leaf"],
        "reg_lambda": sklearn_params["l2_regularization"],
        "max_bin": sklearn_params["max_bins"],
        "min_data_in_bin": 1,
        "min_sum_hessian_in_leaf": 1e-3,
        "min_split_gain": 0,
        "verbosity": 10 if sklearn_params["verbose"] else -10,
        "boost_from_average": True,
        "enable_bundle": False,  # also makes feature order consistent
        "subsample_for_bin": _BinMapper().subsample,
        "poisson_max_delta_step": 1e-12,
        "feature_fraction_bynode": sklearn_params["max_features"],
    }

    if sklearn_params["loss"] == "log_loss" and n_classes > 2:
        # LightGBM multiplies hessians by 2 in its multiclass loss.
        lightgbm_params["min_sum_hessian_in_leaf"] *= 2
        # LightGBM 3.0 introduced a different scaling of the hessian for the
        # multiclass case, which is equivalent to scaling the learning rate.
        # See https://github.com/microsoft/LightGBM/pull/3256.
        lightgbm_params["learning_rate"] *= n_classes / (n_classes - 1)
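        # For instance, with n_classes=3 the learning rate is scaled by
        # 3 / 2 = 1.5 to compensate for LightGBM's hessian scaling.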

    # XGBoost
    xgboost_loss_mapping = {
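        # Note: newer XGBoost releases prefer "reg:squarederror";
        # "reg:linear" is the legacy alias and may emit a deprecation warning.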
        "squared_error": "reg:linear",
        "absolute_error": "LEAST_ABSOLUTE_DEV_NOT_SUPPORTED",
        "log_loss": "reg:logistic" if n_classes == 2 else "multi:softmax",
        "gamma": "reg:gamma",
        "poisson": "count:poisson",
    }

    xgboost_params = {
        "tree_method": "hist",
        "grow_policy": "lossguide",  # so that we can set max_leaves
        "objective": xgboost_loss_mapping[sklearn_params["loss"]],
        "learning_rate": sklearn_params["learning_rate"],
        "n_estimators": sklearn_params["max_iter"],
        "max_leaves": sklearn_params["max_leaf_nodes"],
        "max_depth": sklearn_params["max_depth"] or 0,
        "lambda": sklearn_params["l2_regularization"],
        "max_bin": sklearn_params["max_bins"],
        "min_child_weight": 1e-3,
        "verbosity": 2 if sklearn_params["verbose"] else 0,
        "silent": sklearn_params["verbose"] == 0,
        "n_jobs": -1,
        "colsample_bynode": sklearn_params["max_features"],
    }

    # CatBoost
    catboost_loss_mapping = {
        "squared_error": "RMSE",
        # CatBoost does not support MAE when leaf_estimation_method is Newton
        "absolute_error": "LEAST_ABSOLUTE_DEV_NOT_SUPPORTED",
        "log_loss": "Logloss" if n_classes == 2 else "MultiClass",
        "gamma": None,
        "poisson": "Poisson",
    }

    catboost_params = {
        "loss_function": catboost_loss_mapping[sklearn_params["loss"]],
        "learning_rate": sklearn_params["learning_rate"],
        "iterations": sklearn_params["max_iter"],
        "depth": sklearn_params["max_depth"],
        "reg_lambda": sklearn_params["l2_regularization"],
        "max_bin": sklearn_params["max_bins"],
        "feature_border_type": "Median",
        "leaf_estimation_method": "Newton",
        "verbose": bool(sklearn_params["verbose"]),
    }

    if lib == "lightgbm":
        from lightgbm import LGBMClassifier, LGBMRegressor

        if is_classifier(estimator):
            return LGBMClassifier(**lightgbm_params)
        else:
            return LGBMRegressor(**lightgbm_params)

    elif lib == "xgboost":
        from xgboost import XGBClassifier, XGBRegressor

        if is_classifier(estimator):
            return XGBClassifier(**xgboost_params)
        else:
            return XGBRegressor(**xgboost_params)

    else:
        from catboost import CatBoostClassifier, CatBoostRegressor

        if is_classifier(estimator):
            return CatBoostClassifier(**catboost_params)
        else:
            return CatBoostRegressor(**catboost_params)