File: partial-dependence-plot-with-categorical.py

package info (click to toggle)
scikit-optimize 0.10.2-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 7,672 kB
  • sloc: python: 10,659; javascript: 438; makefile: 136; sh: 6
file content (109 lines) | stat: -rw-r--r-- 3,660 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""
=================================================
Partial Dependence Plots with categorical values
=================================================

Sigurd Carlsen Feb 2019
Holger Nahrstaedt 2020

.. currentmodule:: skopt

Plot objective now supports optional use of partial dependence as well as
different methods of defining parameter values for dependency plots.
"""

print(__doc__)
import numpy as np

from skopt.plots import plot_objective

# Fix the NumPy RNG so the optimization run is reproducible.
np.random.seed(123)
# NOTE(review): `numpy` and `plot_objective` are imported a second time here —
# the duplicates are harmless (imports are idempotent) but redundant.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

from skopt import gp_minimize
from skopt.plots import plot_objective
from skopt.space import Categorical, Integer
#############################################################################
# objective function
# ==================
# Here we define a function that we evaluate.


def objective(params):
    """Return the negated mean cross-validation accuracy for one config.

    Parameters
    ----------
    params : list
        One value per dimension of the module-level ``SPACE``, in order.

    Returns
    -------
    float
        Negative mean ``cross_val_score`` on the breast-cancer dataset
        (negated because ``gp_minimize`` minimizes).
    """
    # Pair each sampled value with its dimension name, dropping the
    # placeholder 'dummy' dimensions, which are not valid
    # DecisionTreeClassifier keyword arguments.
    hyperparams = {}
    for dim, value in zip(SPACE, params):
        if dim.name != 'dummy':
            hyperparams[dim.name] = value
    model = DecisionTreeClassifier(**hyperparams)
    features, labels = load_breast_cancer(return_X_y=True)
    scores = cross_val_score(model, features, labels)
    return -np.mean(scores)


#############################################################################
# Bayesian optimization
# =====================

# Search space for the decision-tree hyperparameters.  The two 'dummy'
# categorical dimensions are not real DecisionTreeClassifier parameters;
# they exist only so the plots include categorical axes, and `objective`
# filters them out (by name) before building the classifier.
SPACE = [
    Integer(1, 20, name='max_depth'),
    Integer(2, 100, name='min_samples_split'),
    Integer(5, 30, name='min_samples_leaf'),
    Integer(1, 30, name='max_features'),
    Categorical(list('abc'), name='dummy'),
    Categorical(['gini', 'entropy'], name='criterion'),
    Categorical(list('def'), name='dummy'),
]

# Gaussian-process-based minimization with 20 evaluations of `objective`.
result = gp_minimize(objective, SPACE, n_calls=20)

#############################################################################
# Partial dependence plot
# =======================
#
# Here we see an example of using partial dependence. Even when setting
# n_points all the way down to 10 from the default of 40, this method is
# still very slow. This is because partial dependence calculates 250 extra
# predictions for each point on the plots.

_ = plot_objective(result, n_points=10)

#############################################################################
# Plot without partial dependence
# ===============================
# Here we plot without partial dependence. We see that it is a lot faster.
# Also the values for the other parameters are set to the default "result"
# which is the parameter set of the best observed value so far.
# (NOTE(review): the original text here referenced ``funny_func`` from a
# different example; in this script the values are simply the best observed
# tree hyperparameters.)

_ = plot_objective(result, sample_source='result', n_points=10)

#############################################################################
# Modify the shown minimum
# ========================
# Here we try with setting the other parameters to something other than
# "result". When dealing with categorical dimensions we can't use
# 'expected_minimum'. Therefore we try with "expected_minimum_random"
# which is a naive way of finding the minimum of the surrogate by only
# using random sampling. `n_minimum_search` sets the number of random samples,
# which is used to find the minimum

_ = plot_objective(
    result,
    n_points=10,
    sample_source='expected_minimum_random',
    minimum='expected_minimum_random',
    n_minimum_search=10000,
)

#############################################################################
# Set a minimum location
# ======================
# Lastly we can also define these parameters ourselves by
# passing a list as the pars argument:

# Each list below has one entry per dimension of SPACE, in the same order:
# four integers followed by the three categorical values.
_ = plot_objective(
    result,
    n_points=10,
    sample_source=[15, 4, 7, 15, 'b', 'entropy', 'e'],
    minimum=[15, 4, 7, 15, 'b', 'entropy', 'e'],
)