1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
|
# Authors: Gilles Louppe <g.louppe@gmail.com>
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Brian Holt <bdholt1@gmail.com>
# Joel Nothman <joel.nothman@gmail.com>
# Arnaud Joly <arnaud.v.joly@gmail.com>
# Jacob Schreiber <jmschreiber91@gmail.com>
#
# License: BSD 3 clause
# See _criterion.pyx for implementation details.
import numpy as np
cimport numpy as np
ctypedef np.npy_float32 DTYPE_t # Type of X
ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight
ctypedef np.npy_intp SIZE_t # Type for indices and counters
ctypedef np.npy_int32 INT32_t # Signed 32 bit integer
ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer
cdef class Criterion:
# The criterion computes the impurity of a node and the reduction of
# impurity of a split on that node. It also computes the output statistics
# such as the mean in regression and class probabilities in classification.
# Internal structures
cdef DOUBLE_t* y # Values of y
cdef SIZE_t y_stride # Stride in y (since n_outputs >= 1)
cdef DOUBLE_t* sample_weight # Sample weights
cdef SIZE_t* samples # Sample indices in X, y
cdef SIZE_t start # samples[start:pos] are the samples in the left node
cdef SIZE_t pos # samples[pos:end] are the samples in the right node
cdef SIZE_t end
cdef SIZE_t n_outputs # Number of outputs
cdef SIZE_t n_samples # Number of samples
cdef SIZE_t n_node_samples # Number of samples in the node (end-start)
cdef double weighted_n_samples # Weighted number of samples (in total)
cdef double weighted_n_node_samples # Weighted number of samples in the node
cdef double weighted_n_left # Weighted number of samples in the left node
cdef double weighted_n_right # Weighted number of samples in the right node
cdef double* sum_total # For classification criteria, the sum of the
# weighted count of each label. For regression,
# the sum of w*y. sum_total[k] is equal to
# sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k],
# where k is output index.
cdef double* sum_left # Same as above, but for the left side of the split
cdef double* sum_right # same as above, but for the right side of the split
# The criterion object is maintained such that left and right collected
# statistics correspond to samples[start:pos] and samples[pos:end].
# Methods
cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
SIZE_t end) nogil except -1
cdef int reset(self) nogil except -1
cdef int reverse_reset(self) nogil except -1
cdef int update(self, SIZE_t new_pos) nogil except -1
cdef double node_impurity(self) nogil
cdef void children_impurity(self, double* impurity_left,
double* impurity_right) nogil
cdef void node_value(self, double* dest) nogil
cdef double impurity_improvement(self, double impurity) nogil
cdef double proxy_impurity_improvement(self) nogil
cdef class ClassificationCriterion(Criterion):
"""Abstract criterion for classification."""
cdef SIZE_t* n_classes
cdef SIZE_t sum_stride
cdef class RegressionCriterion(Criterion):
"""Abstract regression criterion."""
cdef double sq_sum_total
|