import re
import numpy as np
import pytest
from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
from sklearn.ensemble._hist_gradient_boosting.splitting import (
Splitter,
compute_node_value,
)
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
n_threads = _openmp_effective_n_threads()
def is_increasing(a):
return (np.diff(a) >= 0.0).all()
def is_decreasing(a):
return (np.diff(a) <= 0.0).all()
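# Note: ties are allowed in both helpers since the checks use >= / <=. This
# matches the (weak) monotonicity enforced by the constraints, e.g.
# is_increasing([1.0, 2.0, 2.0]) is True.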
def assert_leaves_values_monotonic(predictor, monotonic_cst):
    # make sure the leaf values (from left to right) are either all increasing
    # or all decreasing (or neither), depending on the monotonic constraint.
nodes = predictor.nodes
def get_leaves_values():
"""get leaves values from left to right"""
values = []
def depth_first_collect_leaf_values(node_idx):
node = nodes[node_idx]
if node["is_leaf"]:
values.append(node["value"])
return
depth_first_collect_leaf_values(node["left"])
depth_first_collect_leaf_values(node["right"])
depth_first_collect_leaf_values(0) # start at root (0)
return values
values = get_leaves_values()
if monotonic_cst == MonotonicConstraint.NO_CST:
# some increasing, some decreasing
assert not is_increasing(values) and not is_decreasing(values)
elif monotonic_cst == MonotonicConstraint.POS:
# all increasing
assert is_increasing(values)
else: # NEG
# all decreasing
assert is_decreasing(values)
def assert_children_values_monotonic(predictor, monotonic_cst):
    # Make sure sibling values respect the monotonic constraints: the left
    # child's value should be lower (resp. greater) than the right child's if
    # the constraint is POS (resp. NEG).
    # Note that this property alone isn't enough to ensure full monotonicity,
    # since we also need to guarantee that all the descendants of the left
    # child won't be greater (resp. lower) than the right child, or its
    # descendants. That's why we need to bound the predicted values (this is
    # tested in assert_children_values_bounded; see also the sketch right
    # after this helper).
nodes = predictor.nodes
left_lower = []
left_greater = []
for node in nodes:
if node["is_leaf"]:
continue
left_idx = node["left"]
right_idx = node["right"]
if nodes[left_idx]["value"] < nodes[right_idx]["value"]:
left_lower.append(node)
elif nodes[left_idx]["value"] > nodes[right_idx]["value"]:
left_greater.append(node)
if monotonic_cst == MonotonicConstraint.NO_CST:
assert left_lower and left_greater
elif monotonic_cst == MonotonicConstraint.POS:
assert left_lower and not left_greater
else: # NEG
assert not left_lower and left_greater
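# Illustrative sketch (not one of the tests): with a POS constraint, ordering
# each pair of siblings is not sufficient on its own. In the hypothetical tree
# below, the left child (value 1) is lower than its right sibling (value 5),
# and both pairs of grandchildren are ordered too, yet the leaf values read
# from left to right are not monotonic because 7 > 4. Bounding children values
# by the siblings' middle value (checked in assert_children_values_bounded
# below) is what rules this out.
#
#          parent
#          /    \
#         1      5
#        / \    / \
#       0   7  4   6
assert not is_increasing([0, 7, 4, 6])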
def assert_children_values_bounded(grower, monotonic_cst):
    # Make sure that the values of the children of a node are bounded by the
    # middle value between that node and its sibling (if there is a monotonic
    # constraint). A small numeric illustration follows this helper.
    # As a bonus, we also check that the siblings' values are properly
    # ordered, which is slightly redundant with
    # assert_children_values_monotonic (but this check is done on the grower
    # nodes whereas assert_children_values_monotonic is done on the predictor
    # nodes).
if monotonic_cst == MonotonicConstraint.NO_CST:
return
def recursively_check_children_node_values(node, right_sibling=None):
if node.is_leaf:
return
if right_sibling is not None:
middle = (node.value + right_sibling.value) / 2
if monotonic_cst == MonotonicConstraint.POS:
assert node.left_child.value <= node.right_child.value <= middle
if not right_sibling.is_leaf:
assert (
middle
<= right_sibling.left_child.value
<= right_sibling.right_child.value
)
else: # NEG
assert node.left_child.value >= node.right_child.value >= middle
if not right_sibling.is_leaf:
assert (
middle
>= right_sibling.left_child.value
>= right_sibling.right_child.value
)
recursively_check_children_node_values(
node.left_child, right_sibling=node.right_child
)
recursively_check_children_node_values(node.right_child)
recursively_check_children_node_values(grower.root)
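# Small numeric illustration of the bound above (a sketch, not a test): with a
# POS constraint, if a node has value -2 and its right sibling has value 4,
# then middle = (-2 + 4) / 2 = 1. Every child of the node must stay <= 1 and
# every child of the right sibling must stay >= 1, which keeps all descendants
# of the left subtree below all descendants of the right subtree.
assert (-2 + 4) / 2 == 1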
@pytest.mark.parametrize("seed", range(3))
@pytest.mark.parametrize(
"monotonic_cst",
(
MonotonicConstraint.NO_CST,
MonotonicConstraint.POS,
MonotonicConstraint.NEG,
),
)
def test_nodes_values(monotonic_cst, seed):
    # Build a single tree with only one feature, and make sure the node
    # values respect the monotonic constraints.

    # Considering the following tree with a monotonic POS constraint, we
    # should have:
    #
    #           root
    #          /    \
    #         5     10    # middle = 7.5
    #        / \   /  \
    #       a   b c    d
    #
    # a <= b and c <= d  (assert_children_values_monotonic)
    # a, b <= middle <= c, d  (assert_children_values_bounded)
    # a <= b <= c <= d  (assert_leaves_values_monotonic)
    #
    # The last one is a consequence of the others, but it can't hurt to check.
rng = np.random.RandomState(seed)
n_samples = 1000
n_features = 1
X_binned = rng.randint(0, 255, size=(n_samples, n_features), dtype=np.uint8)
X_binned = np.asfortranarray(X_binned)
gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
hessians = np.ones(shape=1, dtype=G_H_DTYPE)
grower = TreeGrower(
X_binned, gradients, hessians, monotonic_cst=[monotonic_cst], shrinkage=0.1
)
grower.grow()
    # grow() will shrink the leaf values at the very end. For our comparison
    # tests, we need to revert the shrinkage of the leaves, else we would
    # compare the value of a leaf (shrunk) with a node (not shrunk) and the
    # test would not be correct.
    for leaf in grower.finalized_leaves:
        leaf.value /= grower.shrinkage
# We pass undefined binning_thresholds because we won't use predict anyway
predictor = grower.make_predictor(
binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1))
)
# The consistency of the bounds can only be checked on the tree grower
# as the node bounds are not copied into the predictor tree. The
# consistency checks on the values of node children and leaves can be
# done either on the grower tree or on the predictor tree. We only
# do those checks on the predictor tree as the latter is derived from
# the former.
assert_children_values_monotonic(predictor, monotonic_cst)
assert_children_values_bounded(grower, monotonic_cst)
assert_leaves_values_monotonic(predictor, monotonic_cst)
@pytest.mark.parametrize("use_feature_names", (True, False))
def test_predictions(global_random_seed, use_feature_names):
# Train a model with a POS constraint on the first feature and a NEG
# constraint on the second feature, and make sure the constraints are
# respected by checking the predictions.
# test adapted from lightgbm's test_monotone_constraint(), itself inspired
# by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html
rng = np.random.RandomState(global_random_seed)
n_samples = 1000
f_0 = rng.rand(n_samples) # positive correlation with y
    f_1 = rng.rand(n_samples)  # negative correlation with y
X = np.c_[f_0, f_1]
if use_feature_names:
pd = pytest.importorskip("pandas")
X = pd.DataFrame(X, columns=["f_0", "f_1"])
noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
y = 5 * f_0 + np.sin(10 * np.pi * f_0) - 5 * f_1 - np.cos(10 * np.pi * f_1) + noise
if use_feature_names:
monotonic_cst = {"f_0": +1, "f_1": -1}
else:
monotonic_cst = [+1, -1]
gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
gbdt.fit(X, y)
linspace = np.linspace(0, 1, 100)
sin = np.sin(linspace)
constant = np.full_like(linspace, fill_value=0.5)
    # We now assert that the predictions properly respect the constraints, on
    # each feature. When testing for a feature we need to set the other one
    # to a constant, because the monotonic constraints are only an "all else
    # being equal" type of constraint:
    # a constraint on the first feature only means that
    # x0 < x0' => f(x0, x1) <= f(x0', x1)
    # while x1 stays constant.
    # The constraint does not guarantee that
    # x0 < x0' => f(x0, x1) <= f(x0', x1')
    # A related partial-dependence sketch appears at the end of this test.
# First feature (POS)
# assert pred is all increasing when f_0 is all increasing
X = np.c_[linspace, constant]
pred = gbdt.predict(X)
assert is_increasing(pred)
# assert pred actually follows the variations of f_0
X = np.c_[sin, constant]
pred = gbdt.predict(X)
assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))
# Second feature (NEG)
# assert pred is all decreasing when f_1 is all increasing
X = np.c_[constant, linspace]
pred = gbdt.predict(X)
assert is_decreasing(pred)
# assert pred actually follows the inverse variations of f_1
X = np.c_[constant, sin]
pred = gbdt.predict(X)
assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()
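    # A related way to eyeball the "all else being equal" behaviour is through
    # partial dependence, which averages the predictions over the other
    # feature (sketch only, assuming the sklearn.inspection API; not asserted
    # here):
    #     from sklearn.inspection import partial_dependence
    #     pdp = partial_dependence(gbdt, np.c_[f_0, f_1], features=[0])
    #     assert is_increasing(pdp["average"][0])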
def test_input_error():
X = [[1, 2], [2, 3], [3, 4]]
y = [0, 1, 2]
gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, 0, -1])
with pytest.raises(
ValueError, match=re.escape("monotonic_cst has shape (3,) but the input data")
):
gbdt.fit(X, y)
for monotonic_cst in ([1, 3], [1, -3], [0.3, -0.7]):
gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
expected_msg = re.escape(
"must be an array-like of -1, 0 or 1. Observed values:"
)
with pytest.raises(ValueError, match=expected_msg):
gbdt.fit(X, y)
gbdt = HistGradientBoostingClassifier(monotonic_cst=[0, 1])
with pytest.raises(
ValueError,
match="monotonic constraints are not supported for multiclass classification",
):
gbdt.fit(X, y)
def test_input_error_related_to_feature_names():
pd = pytest.importorskip("pandas")
X = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2]})
y = np.array([0, 1, 0])
monotonic_cst = {"d": 1, "a": 1, "c": -1}
gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
expected_msg = re.escape(
"monotonic_cst contains 2 unexpected feature names: ['c', 'd']."
)
with pytest.raises(ValueError, match=expected_msg):
gbdt.fit(X, y)
monotonic_cst = {k: 1 for k in "abcdefghijklmnopqrstuvwxyz"}
gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
expected_msg = re.escape(
"monotonic_cst contains 24 unexpected feature names: "
"['c', 'd', 'e', 'f', 'g', '...']."
)
with pytest.raises(ValueError, match=expected_msg):
gbdt.fit(X, y)
monotonic_cst = {"a": 1}
gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
expected_msg = re.escape(
"HistGradientBoostingRegressor was not fitted on data with feature "
"names. Pass monotonic_cst as an integer array instead."
)
with pytest.raises(ValueError, match=expected_msg):
gbdt.fit(X.values, y)
monotonic_cst = {"b": -1, "a": "+"}
gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
expected_msg = re.escape("monotonic_cst['a'] must be either -1, 0 or 1. Got '+'.")
with pytest.raises(ValueError, match=expected_msg):
gbdt.fit(X, y)
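    # For reference, a valid feature-name based specification on this data
    # would look like the sketch below. Features not listed in the dict are
    # treated as unconstrained (equivalent to a 0 entry):
    #     HistGradientBoostingRegressor(monotonic_cst={"a": 1, "b": -1}).fit(X, y)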
def test_bounded_value_min_gain_to_split():
# The purpose of this test is to show that when computing the gain at a
# given split, the value of the current node should be properly bounded to
# respect the monotonic constraints, because it strongly interacts with
# min_gain_to_split. We build a simple example where gradients are [1, 1,
# 100, 1, 1] (hessians are all ones). The best split happens on the 3rd
# bin, and depending on whether the value of the node is bounded or not,
# the min_gain_to_split constraint is or isn't satisfied.
l2_regularization = 0
min_hessian_to_split = 0
min_samples_leaf = 1
n_bins = n_samples = 5
X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE)
sample_indices = np.arange(n_samples, dtype=np.uint32)
all_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE)
sum_gradients = all_gradients.sum()
sum_hessians = all_hessians.sum()
hessians_are_constant = False
builder = HistogramBuilder(
X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
)
n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32)
has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
monotonic_cst = np.array(
[MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
)
is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
missing_values_bin_idx = n_bins - 1
children_lower_bound, children_upper_bound = -np.inf, np.inf
min_gain_to_split = 2000
splitter = Splitter(
X_binned,
n_bins_non_missing,
missing_values_bin_idx,
has_missing_values,
is_categorical,
monotonic_cst,
l2_regularization,
min_hessian_to_split,
min_samples_leaf,
min_gain_to_split,
hessians_are_constant,
)
histograms = builder.compute_histograms_brute(sample_indices)
    # Since the gradient array is [1, 1, 100, 1, 1],
    # the max possible gain happens on the 3rd bin (or equivalently in the
    # 2nd) and is equal to about 1307, which is less than
    # min_gain_to_split = 2000, so the node is considered unsplittable
    # (gain = -1).
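    # Rough arithmetic behind the 1307 figure (a sketch, ignoring
    # implementation details): the best split puts the first 3 samples on the
    # left, so with l2_regularization = 0:
    #   G_L = 1 + 1 + 100 = 102, H_L = 3, G_R = 2, H_R = 2
    #   gain ~ G_L**2 / H_L + G_R**2 / H_R - G**2 / H
    #        = 102**2 / 3 + 2**2 / 2 - 104**2 / 5 ~ 3468 + 2 - 2163 ~ 1307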
current_lower_bound, current_upper_bound = -np.inf, np.inf
value = compute_node_value(
sum_gradients,
sum_hessians,
current_lower_bound,
current_upper_bound,
l2_regularization,
)
# the unbounded value is equal to -sum_gradients / sum_hessians
assert value == pytest.approx(-104 / 5)
split_info = splitter.find_node_split(
n_samples,
histograms,
sum_gradients,
sum_hessians,
value,
lower_bound=children_lower_bound,
upper_bound=children_upper_bound,
)
assert split_info.gain == -1 # min_gain_to_split not respected
    # Here again the max possible gain is on the 3rd bin but we now clip the
    # value of the node to [-10, inf).
    # This means the gain is now about 2430, which is more than the
    # min_gain_to_split constraint.
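    # Rough arithmetic behind the 2430 figure (same sketch as above): the
    # parent's contribution now uses the clipped value -10 instead of
    # -104 / 5 ~ -20.8:
    #   gain ~ 102**2 / 3 + 2**2 / 2 - 104 * 10 ~ 3468 + 2 - 1040 = 2430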
current_lower_bound, current_upper_bound = -10, np.inf
value = compute_node_value(
sum_gradients,
sum_hessians,
current_lower_bound,
current_upper_bound,
l2_regularization,
)
assert value == -10
split_info = splitter.find_node_split(
n_samples,
histograms,
sum_gradients,
sum_hessians,
value,
lower_bound=children_lower_bound,
upper_bound=children_upper_bound,
)
assert split_info.gain > min_gain_to_split