# test_parse_tree.py — from the xgboost 3.0.0 test suite.
import numpy as np
import pytest

import xgboost as xgb
from xgboost import testing as tm

pytestmark = pytest.mark.skipif(**tm.no_pandas())


dpath = 'demo/data/'
rng = np.random.RandomState(1994)


class TestTreesToDataFrame:
    def build_model(self, max_depth, num_round):
        dtrain, _ = tm.load_agaricus(__file__)
        param = {'max_depth': max_depth, 'objective': 'binary:logistic',
                 'verbosity': 1}
        num_round = num_round
        bst = xgb.train(param, dtrain, num_round)
        return bst

    def parse_dumped_model(self, booster, item_to_get, splitter):
        item_to_get += '='
        txt_dump = booster.get_dump(with_stats=True)
        tree_list = [tree.split('/n') for tree in txt_dump]
        split_trees = [tree[0].split(item_to_get)[1:] for tree in tree_list]
        res = sum([float(line.split(splitter)[0])
                   for tree in split_trees for line in tree])
        return res

    def test_trees_to_dataframe(self):
        bst = self.build_model(max_depth=5, num_round=10)
        gain_from_dump = self.parse_dumped_model(booster=bst,
                                                 item_to_get='gain',
                                                 splitter=',')
        cover_from_dump = self.parse_dumped_model(booster=bst,
                                                  item_to_get='cover',
                                                  splitter='\n')
        # method being tested
        df = bst.trees_to_dataframe()

        # test for equality of gains
        gain_from_df = df[df.Feature != 'Leaf'][['Gain']].sum()
        assert np.allclose(gain_from_dump, gain_from_df)

        # test for equality of covers
        cover_from_df = df.Cover.sum()
        assert np.allclose(cover_from_dump, cover_from_df)

    def run_tree_to_df_categorical(self, tree_method: str) -> None:
        X, y = tm.make_categorical(100, 10, 31, onehot=False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=10)
        df = booster.trees_to_dataframe()
        for _, x in df.iterrows():
            if x["Feature"] != "Leaf":
                assert len(x["Category"]) >= 1

    def test_tree_to_df_categorical(self) -> None:
        self.run_tree_to_df_categorical("approx")

    def run_split_value_histograms(self, tree_method) -> None:
        X, y = tm.make_categorical(1000, 10, 13, onehot=False)
        reg = xgb.XGBRegressor(tree_method=tree_method, enable_categorical=True)
        reg.fit(X, y)

        with pytest.raises(ValueError, match="doesn't"):
            reg.get_booster().get_split_value_histogram("3", bins=5)

    def test_split_value_histograms(self):
        self.run_split_value_histograms("approx")