#' Print model summary
#'
#' Display basic information about a \code{gbm} object.
#'
#' Prints some information about the model object. In particular, this method
#' prints the call to \code{gbm()}, the type of loss function that was used,
#' and the total number of iterations.
#'
#' If cross-validation was performed, the 'best' number of trees as estimated
#' by cross-validation error is displayed. If a test set was used, the 'best'
#' number of trees as estimated by the test set error is displayed.
#'
#' The number of available predictors and the number of those having non-zero
#' influence on predictions are given (which might be of interest in data
#' mining applications).
#'
#' If multinomial, bernoulli or adaboost was used, the confusion matrix and
#' prediction accuracy are printed (objects being allocated to the class with
#' highest probability for multinomial and bernoulli). These classifications
#' are performed on the entire training data using the model with the 'best'
#' number of trees as described above, or the maximum number of trees if the
#' 'best' cannot be computed.
#'
#' If the 'distribution' was specified as gaussian, laplace, quantile or
#' t-distribution, a summary of the residuals is displayed. The residuals are
#' for the training data with the model at the 'best' number of trees, as
#' described above, or the maximum number of trees if the 'best' cannot be
#' computed.
#'
#' @aliases print.gbm show.gbm
#' @param x an object of class \code{gbm}.
#' @param \dots arguments passed to \code{print.default}.
#' @author Harry Southworth, Daniel Edwards
#' @seealso \code{\link{gbm}}
#' @keywords models nonlinear survival nonparametric
#' @examples
#'
#' data(iris)
#' iris.mod <- gbm(Species ~ ., distribution = "multinomial", data = iris,
#'                 n.trees = 2000, shrinkage = 0.01, cv.folds = 5,
#'                 verbose = FALSE, n.cores = 1)
#' iris.mod
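#'
#' # The 'best' cross-validation iteration reported by print() can also be
#' # obtained directly; this mirrors the gbm.perf() call made internally and
#' # reuses the 'iris.mod' object fitted above
#' gbm.perf(iris.mod, method = "cv", plot.it = FALSE)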
#' # data(lung)
#' # lung.mod <- gbm(Surv(time, status) ~ ., distribution = "coxph", data = lung,
#' #                 n.trees = 2000, shrinkage = 0.01, cv.folds = 5, verbose = FALSE)
#' # lung.mod
#' @rdname print.gbm
#' @export
print.gbm <- function(x, ...)
{
  if (!is.null(x$call)) { print(x$call) }

  # Describe the loss function; pairwise models also report their metric
  # and (optionally) max.rank
  dist.name <- x$distribution$name
  if (dist.name == "pairwise")
  {
    if (!is.null(x$distribution$max.rank) && x$distribution$max.rank > 0)
    {
      dist.name <- sprintf("pairwise (metric=%s, max.rank=%d)",
                           x$distribution$metric, x$distribution$max.rank)
    }
    else
    {
      dist.name <- sprintf("pairwise (metric=%s)", x$distribution$metric)
    }
  }
  cat(paste("A gradient boosted model with", dist.name, "loss function.\n"))
  cat(paste(length(x$train.error), "iterations were performed.\n"))

  # Determine the 'best' iteration: the test-set estimate, when available,
  # overrides the cross-validation estimate; otherwise fall back to the
  # total number of iterations
  best <- length(x$train.error)
  if (!is.null(x$cv.error))
  {
    best <- gbm.perf(x, plot.it = FALSE, method = "cv")
    cat(paste("The best cross-validation iteration was ", best, ".\n", sep = ""))
  }
  if (x$train.fraction < 1)
  {
    best <- gbm.perf(x, plot.it = FALSE, method = "test")
    cat(paste("The best test-set iteration was ", best, ".\n", sep = ""))
  }
  if (is.null(best))
  {
    best <- length(x$train.error)
  }

  # Count predictors with non-zero relative influence at the 'best' iteration
  ri <- relative.influence(x, n.trees = best)
  cat("There were", length(x$var.names), "predictors of which",
      sum(ri > 0), "had non-zero influence.\n")

  invisible()
}

#' @rdname print.gbm
#'
#' @export
show.gbm <- print.gbm

#' Summary of a gbm object
#'
#' Computes the relative influence of each variable in the gbm object.
#'
#' For \code{distribution="gaussian"} this returns exactly the reduction of
#' squared error attributable to each variable. For other loss functions this
#' returns the reduction attributable to each variable in the sum of squared
#' error of predicting the gradient on each iteration. It describes the
#' relative influence of each variable in reducing the loss function. See the
#' references below for exact details on the computation.
#'
#' @param object a \code{gbm} object created from an initial call to
#' \code{\link{gbm}}.
#' @param cBars the number of bars to plot. If \code{order=TRUE} then only the
#' variables with the \code{cBars} largest relative influence will appear in
#' the barplot. If \code{order=FALSE} then the first \code{cBars} variables
#' will appear in the plot. In either case, the function will return the
#' relative influence of all of the variables.
#' @param n.trees the number of trees used to compute the relative influence.
#' Only the first \code{n.trees} trees will be used.
#' @param plotit an indicator as to whether the plot is generated.
#' @param order an indicator as to whether the plotted and/or returned relative
#' influences are sorted.
#' @param method The function used to compute the relative influence.
#' \code{\link{relative.influence}} is the default and is the same as that
#' described in Friedman (2001). The other current (and experimental) choice is
#' \code{\link{permutation.test.gbm}}. This method randomly permutes each
#' predictor variable, one at a time, and computes the associated reduction in
#' predictive performance. This is similar to the variable importance measures
#' Breiman uses for random forests, but \code{gbm} currently computes it using
#' the entire training dataset (not the out-of-bag observations).
#' @param normalize if \code{FALSE} then \code{summary.gbm} returns the
#' unnormalized influence.
#' @param ... other arguments passed to the plot function.
#' @return Returns a data frame where the first component is the variable name
#' and the second is the computed relative influence (normalized to sum to 100
#' when \code{normalize=TRUE}).
#' @author Greg Ridgeway \email{gregridgeway@@gmail.com}
#' @seealso \code{\link{gbm}}
#' @references J.H. Friedman (2001). "Greedy Function Approximation: A Gradient
#' Boosting Machine," Annals of Statistics 29(5):1189-1232.
#'
#' L. Breiman (2001).
#' \url{https://www.stat.berkeley.edu/users/breiman/randomforest2001.pdf}.
#' @keywords hplot
#'
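#' @examples
#' \dontrun{
#' # A brief illustrative sketch on simulated Gaussian data; the variable
#' # names and tuning values below are arbitrary choices, not recommendations.
#' set.seed(101)
#' N <- 500
#' dat <- data.frame(x1 = runif(N), x2 = runif(N), x3 = runif(N))
#' dat$y <- dat$x1 + 2 * dat$x2 + rnorm(N, sd = 0.1)
#' mod <- gbm(y ~ ., distribution = "gaussian", data = dat,
#'            n.trees = 500, shrinkage = 0.05, cv.folds = 3, verbose = FALSE)
#'
#' # Relative influence at the cross-validated number of trees
#' best.iter <- gbm.perf(mod, method = "cv", plot.it = FALSE)
#' summary(mod, n.trees = best.iter)
#'
#' # Experimental permutation-based importance (see the 'method' argument)
#' summary(mod, method = permutation.test.gbm, plotit = FALSE)
#' }
#'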
#' @export summary.gbm
#' @export
summary.gbm <- function(object,
                        cBars = length(object$var.names),
                        n.trees = object$n.trees,
                        plotit = TRUE,
                        order = TRUE,
                        method = relative.influence,
                        normalize = TRUE,
                        ...)
{
  if (n.trees < 1)
  {
    stop("n.trees must be greater than 0.")
  }
  if (n.trees > object$n.trees)
  {
    warning("Exceeded total number of GBM terms. Results use n.trees=",
            object$n.trees, " terms.\n")
    n.trees <- object$n.trees
  }

  # Compute the relative influence with the requested method and clip any
  # negative values to zero
  rel.inf <- method(object, n.trees)
  rel.inf[rel.inf < 0] <- 0

  # Ordering of variables for the plot and the returned data frame
  if (order)
  {
    i <- order(-rel.inf)
  }
  else
  {
    i <- 1:length(rel.inf)
  }

  if (cBars == 0) cBars <- min(10, length(object$var.names))
  if (cBars > length(object$var.names)) cBars <- length(object$var.names)
  if (normalize) rel.inf <- 100 * rel.inf / sum(rel.inf)

  if (plotit)
  {
    # Horizontal bar chart of the (top) cBars variables
    barplot(rel.inf[i[cBars:1]],
            horiz = TRUE,
            col = rainbow(cBars, start = 3/6, end = 4/6),
            names = object$var.names[i[cBars:1]],
            xlab = "Relative influence", ...)
  }
  return(data.frame(var = object$var.names[i],
                    rel.inf = rel.inf[i]))
}