1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670
|
"""The underlying data structure used in gplearn.
The :mod:`gplearn._program` module contains the underlying representation of a
computer program. It is used for creating and evolving programs used in the
:mod:`gplearn.genetic` module.
"""
# Author: Trevor Stephens <trevorstephens.com>
#
# License: BSD 3 clause
from copy import copy
import numpy as np
from sklearn.utils.random import sample_without_replacement
from .functions import _Function
from .utils import check_random_state
class _Program(object):
"""A program-like representation of the evolved program.
This is the underlying data-structure used by the public classes in the
:mod:`gplearn.genetic` module. It should not be used directly by the user.
Parameters
----------
function_set : list
A list of valid functions to use in the program.
arities : dict
A dictionary of the form `{arity: [functions]}`. The arity is the
number of arguments that the function takes, the functions must match
those in the `function_set` parameter.
init_depth : tuple of two ints
The range of tree depths for the initial population of naive formulas.
Individual trees will randomly choose a maximum depth from this range.
When combined with `init_method='half and half'` this yields the well-
known 'ramped half and half' initialization method.
init_method : str
- 'grow' : Nodes are chosen at random from both functions and
terminals, allowing for smaller trees than `init_depth` allows. Tends
to grow asymmetrical trees.
- 'full' : Functions are chosen until the `init_depth` is reached, and
then terminals are selected. Tends to grow 'bushy' trees.
- 'half and half' : Trees are grown through a 50/50 mix of 'full' and
'grow', making for a mix of tree shapes in the initial population.
n_features : int
The number of features in `X`.
const_range : tuple of two floats
The range of constants to include in the formulas.
metric : _Fitness object
The raw fitness metric.
p_point_replace : float
The probability that any given node will be mutated during point
mutation.
parsimony_coefficient : float
This constant penalizes large programs by adjusting their fitness to
be less favorable for selection. Larger values penalize the program
more which can control the phenomenon known as 'bloat'. Bloat is when
evolution is increasing the size of programs without a significant
increase in fitness, which is costly for computation time and makes for
a less understandable final result. This parameter may need to be tuned
over successive runs.
random_state : RandomState instance
The random number generator. Note that ints, or None are not allowed.
The reason for this being passed is that during parallel evolution the
same program object may be accessed by multiple parallel processes.
transformer : _Function object, optional (default=None)
The function to transform the output of the program to probabilities,
only used for the SymbolicClassifier.
feature_names : list, optional (default=None)
Optional list of feature names, used purely for representations in
the `print` operation or `export_graphviz`. If None, then X0, X1, etc
will be used for representations.
program : list, optional (default=None)
The flattened tree representation of the program. If None, a new naive
random tree will be grown. If provided, it will be validated.
Attributes
----------
program : list
The flattened tree representation of the program.
raw_fitness_ : float
The raw fitness of the individual program.
fitness_ : float
The penalized fitness of the individual program.
oob_fitness_ : float
The out-of-bag raw fitness of the individual program for the held-out
samples. Only present when sub-sampling was used in the estimator by
specifying `max_samples` < 1.0.
parents : dict, or None
If None, this is a naive random program from the initial population.
Otherwise it includes meta-data about the program's parent(s) as well
as the genetic operations performed to yield the current program. This
is set outside this class by the controlling evolution loops.
depth_ : int
The maximum depth of the program tree.
length_ : int
The number of functions and terminals in the program.
"""
def __init__(self,
function_set,
arities,
init_depth,
init_method,
n_features,
const_range,
metric,
p_point_replace,
parsimony_coefficient,
random_state,
transformer=None,
feature_names=None,
program=None):
self.function_set = function_set
self.arities = arities
self.init_depth = (init_depth[0], init_depth[1] + 1)
self.init_method = init_method
self.n_features = n_features
self.const_range = const_range
self.metric = metric
self.p_point_replace = p_point_replace
self.parsimony_coefficient = parsimony_coefficient
self.transformer = transformer
self.feature_names = feature_names
self.program = program
if self.program is not None:
if not self.validate_program():
raise ValueError('The supplied program is incomplete.')
else:
# Create a naive random program
self.program = self.build_program(random_state)
self.raw_fitness_ = None
self.fitness_ = None
self.parents = None
self._n_samples = None
self._max_samples = None
self._indices_state = None
def build_program(self, random_state):
"""Build a naive random program.
Parameters
----------
random_state : RandomState instance
The random number generator.
Returns
-------
program : list
The flattened tree representation of the program.
"""
if self.init_method == 'half and half':
method = ('full' if random_state.randint(2) else 'grow')
else:
method = self.init_method
max_depth = random_state.randint(*self.init_depth)
# Start a program with a function to avoid degenerative programs
function = random_state.randint(len(self.function_set))
function = self.function_set[function]
program = [function]
terminal_stack = [function.arity]
while terminal_stack:
depth = len(terminal_stack)
choice = self.n_features + len(self.function_set)
choice = random_state.randint(choice)
# Determine if we are adding a function or terminal
if (depth < max_depth) and (method == 'full' or
choice <= len(self.function_set)):
function = random_state.randint(len(self.function_set))
function = self.function_set[function]
program.append(function)
terminal_stack.append(function.arity)
else:
# We need a terminal, add a variable or constant
if self.const_range is not None:
terminal = random_state.randint(self.n_features + 1)
else:
terminal = random_state.randint(self.n_features)
if terminal == self.n_features:
terminal = random_state.uniform(*self.const_range)
if self.const_range is None:
# We should never get here
raise ValueError('A constant was produced with '
'const_range=None.')
program.append(terminal)
terminal_stack[-1] -= 1
while terminal_stack[-1] == 0:
terminal_stack.pop()
if not terminal_stack:
return program
terminal_stack[-1] -= 1
# We should never get here
return None
def validate_program(self):
"""Rough check that the embedded program in the object is valid."""
terminals = [0]
for node in self.program:
if isinstance(node, _Function):
terminals.append(node.arity)
else:
terminals[-1] -= 1
while terminals[-1] == 0:
terminals.pop()
terminals[-1] -= 1
return terminals == [-1]
def __str__(self):
"""Overloads `print` output of the object to resemble a LISP tree."""
terminals = [0]
output = ''
for i, node in enumerate(self.program):
if isinstance(node, _Function):
terminals.append(node.arity)
output += node.name + '('
else:
if isinstance(node, int):
if self.feature_names is None:
output += 'X%s' % node
else:
output += self.feature_names[node]
else:
output += '%.3f' % node
terminals[-1] -= 1
while terminals[-1] == 0:
terminals.pop()
terminals[-1] -= 1
output += ')'
if i != len(self.program) - 1:
output += ', '
return output
def export_graphviz(self, fade_nodes=None):
"""Returns a string, Graphviz script for visualizing the program.
Parameters
----------
fade_nodes : list, optional
A list of node indices to fade out for showing which were removed
during evolution.
Returns
-------
output : string
The Graphviz script to plot the tree representation of the program.
"""
terminals = []
if fade_nodes is None:
fade_nodes = []
output = 'digraph program {\nnode [style=filled]\n'
for i, node in enumerate(self.program):
fill = '#cecece'
if isinstance(node, _Function):
if i not in fade_nodes:
fill = '#136ed4'
terminals.append([node.arity, i])
output += ('%d [label="%s", fillcolor="%s"] ;\n'
% (i, node.name, fill))
else:
if i not in fade_nodes:
fill = '#60a6f6'
if isinstance(node, int):
if self.feature_names is None:
feature_name = 'X%s' % node
else:
feature_name = self.feature_names[node]
output += ('%d [label="%s", fillcolor="%s"] ;\n'
% (i, feature_name, fill))
else:
output += ('%d [label="%.3f", fillcolor="%s"] ;\n'
% (i, node, fill))
if i == 0:
# A degenerative program of only one node
return output + '}'
terminals[-1][0] -= 1
terminals[-1].append(i)
while terminals[-1][0] == 0:
output += '%d -> %d ;\n' % (terminals[-1][1],
terminals[-1][-1])
terminals[-1].pop()
if len(terminals[-1]) == 2:
parent = terminals[-1][-1]
terminals.pop()
if not terminals:
return output + '}'
terminals[-1].append(parent)
terminals[-1][0] -= 1
# We should never get here
return None
def _depth(self):
"""Calculates the maximum depth of the program tree."""
terminals = [0]
depth = 1
for node in self.program:
if isinstance(node, _Function):
terminals.append(node.arity)
depth = max(len(terminals), depth)
else:
terminals[-1] -= 1
while terminals[-1] == 0:
terminals.pop()
terminals[-1] -= 1
return depth - 1
def _length(self):
"""Calculates the number of functions and terminals in the program."""
return len(self.program)
def execute(self, X):
"""Execute the program according to X.
Parameters
----------
X : {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
Returns
-------
y_hats : array-like, shape = [n_samples]
The result of executing the program on X.
"""
# Check for single-node programs
node = self.program[0]
if isinstance(node, float):
return np.repeat(node, X.shape[0])
if isinstance(node, int):
return X[:, node]
apply_stack = []
for node in self.program:
if isinstance(node, _Function):
apply_stack.append([node])
else:
# Lazily evaluate later
apply_stack[-1].append(node)
while len(apply_stack[-1]) == apply_stack[-1][0].arity + 1:
# Apply functions that have sufficient arguments
function = apply_stack[-1][0]
terminals = [np.repeat(t, X.shape[0]) if isinstance(t, float)
else X[:, t] if isinstance(t, int)
else t for t in apply_stack[-1][1:]]
intermediate_result = function(*terminals)
if len(apply_stack) != 1:
apply_stack.pop()
apply_stack[-1].append(intermediate_result)
else:
return intermediate_result
# We should never get here
return None
def get_all_indices(self, n_samples=None, max_samples=None,
random_state=None):
"""Get the indices on which to evaluate the fitness of a program.
Parameters
----------
n_samples : int
The number of samples.
max_samples : int
The maximum number of samples to use.
random_state : RandomState instance
The random number generator.
Returns
-------
indices : array-like, shape = [n_samples]
The in-sample indices.
not_indices : array-like, shape = [n_samples]
The out-of-sample indices.
"""
if self._indices_state is None and random_state is None:
raise ValueError('The program has not been evaluated for fitness '
'yet, indices not available.')
if n_samples is not None and self._n_samples is None:
self._n_samples = n_samples
if max_samples is not None and self._max_samples is None:
self._max_samples = max_samples
if random_state is not None and self._indices_state is None:
self._indices_state = random_state.get_state()
indices_state = check_random_state(None)
indices_state.set_state(self._indices_state)
not_indices = sample_without_replacement(
self._n_samples,
self._n_samples - self._max_samples,
random_state=indices_state)
sample_counts = np.bincount(not_indices, minlength=self._n_samples)
indices = np.where(sample_counts == 0)[0]
return indices, not_indices
def _indices(self):
"""Get the indices used to measure the program's fitness."""
return self.get_all_indices()[0]
def raw_fitness(self, X, y, sample_weight):
"""Evaluate the raw fitness of the program according to X, y.
Parameters
----------
X : {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
sample_weight : array-like, shape = [n_samples]
Weights applied to individual samples.
Returns
-------
raw_fitness : float
The raw fitness of the program.
"""
y_pred = self.execute(X)
if self.transformer:
y_pred = self.transformer(y_pred)
raw_fitness = self.metric(y, y_pred, sample_weight)
return raw_fitness
def fitness(self, parsimony_coefficient=None):
"""Evaluate the penalized fitness of the program according to X, y.
Parameters
----------
parsimony_coefficient : float, optional
If automatic parsimony is being used, the computed value according
to the population. Otherwise the initialized value is used.
Returns
-------
fitness : float
The penalized fitness of the program.
"""
if parsimony_coefficient is None:
parsimony_coefficient = self.parsimony_coefficient
penalty = parsimony_coefficient * len(self.program) * self.metric.sign
return self.raw_fitness_ - penalty
def get_subtree(self, random_state, program=None):
"""Get a random subtree from the program.
Parameters
----------
random_state : RandomState instance
The random number generator.
program : list, optional (default=None)
The flattened tree representation of the program. If None, the
embedded tree in the object will be used.
Returns
-------
start, end : tuple of two ints
The indices of the start and end of the random subtree.
"""
if program is None:
program = self.program
# Choice of crossover points follows Koza's (1992) widely used approach
# of choosing functions 90% of the time and leaves 10% of the time.
probs = np.array([0.9 if isinstance(node, _Function) else 0.1
for node in program])
probs = np.cumsum(probs / probs.sum())
start = np.searchsorted(probs, random_state.uniform())
stack = 1
end = start
while stack > end - start:
node = program[end]
if isinstance(node, _Function):
stack += node.arity
end += 1
return start, end
def reproduce(self):
"""Return a copy of the embedded program."""
return copy(self.program)
def crossover(self, donor, random_state):
"""Perform the crossover genetic operation on the program.
Crossover selects a random subtree from the embedded program to be
replaced. A donor also has a subtree selected at random and this is
inserted into the original parent to form an offspring.
Parameters
----------
donor : list
The flattened tree representation of the donor program.
random_state : RandomState instance
The random number generator.
Returns
-------
program : list
The flattened tree representation of the program.
"""
# Get a subtree to replace
start, end = self.get_subtree(random_state)
removed = range(start, end)
# Get a subtree to donate
donor_start, donor_end = self.get_subtree(random_state, donor)
donor_removed = list(set(range(len(donor))) -
set(range(donor_start, donor_end)))
# Insert genetic material from donor
return (self.program[:start] +
donor[donor_start:donor_end] +
self.program[end:]), removed, donor_removed
def subtree_mutation(self, random_state):
"""Perform the subtree mutation operation on the program.
Subtree mutation selects a random subtree from the embedded program to
be replaced. A donor subtree is generated at random and this is
inserted into the original parent to form an offspring. This
implementation uses the "headless chicken" method where the donor
subtree is grown using the initialization methods and a subtree of it
is selected to be donated to the parent.
Parameters
----------
random_state : RandomState instance
The random number generator.
Returns
-------
program : list
The flattened tree representation of the program.
"""
# Build a new naive program
chicken = self.build_program(random_state)
# Do subtree mutation via the headless chicken method!
return self.crossover(chicken, random_state)
def hoist_mutation(self, random_state):
"""Perform the hoist mutation operation on the program.
Hoist mutation selects a random subtree from the embedded program to
be replaced. A random subtree of that subtree is then selected and this
is 'hoisted' into the original subtrees location to form an offspring.
This method helps to control bloat.
Parameters
----------
random_state : RandomState instance
The random number generator.
Returns
-------
program : list
The flattened tree representation of the program.
"""
# Get a subtree to replace
start, end = self.get_subtree(random_state)
subtree = self.program[start:end]
# Get a subtree of the subtree to hoist
sub_start, sub_end = self.get_subtree(random_state, subtree)
hoist = subtree[sub_start:sub_end]
# Determine which nodes were removed for plotting
removed = list(set(range(start, end)) -
set(range(start + sub_start, start + sub_end)))
return self.program[:start] + hoist + self.program[end:], removed
def point_mutation(self, random_state):
"""Perform the point mutation operation on the program.
Point mutation selects random nodes from the embedded program to be
replaced. Terminals are replaced by other terminals and functions are
replaced by other functions that require the same number of arguments
as the original node. The resulting tree forms an offspring.
Parameters
----------
random_state : RandomState instance
The random number generator.
Returns
-------
program : list
The flattened tree representation of the program.
"""
program = copy(self.program)
# Get the nodes to modify
mutate = np.where(random_state.uniform(size=len(program)) <
self.p_point_replace)[0]
for node in mutate:
if isinstance(program[node], _Function):
arity = program[node].arity
# Find a valid replacement with same arity
replacement = len(self.arities[arity])
replacement = random_state.randint(replacement)
replacement = self.arities[arity][replacement]
program[node] = replacement
else:
# We've got a terminal, add a const or variable
if self.const_range is not None:
terminal = random_state.randint(self.n_features + 1)
else:
terminal = random_state.randint(self.n_features)
if terminal == self.n_features:
terminal = random_state.uniform(*self.const_range)
if self.const_range is None:
# We should never get here
raise ValueError('A constant was produced with '
'const_range=None.')
program[node] = terminal
return program, list(mutate)
depth_ = property(_depth)
length_ = property(_length)
indices_ = property(_indices)
|