File: print.gbm.R

package info (click to toggle)
r-cran-gbm 2.2.2-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 1,244 kB
  • sloc: cpp: 7,368; ansic: 266; sh: 13; makefile: 9
file content (188 lines) | stat: -rw-r--r-- 7,489 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#' Print model summary
#' 
#' Display basic information about a \code{gbm} object.
#' 
#' Prints some information about the model object. In particular, this method
#' prints the call to \code{gbm()}, the type of loss function that was used,
#' and the total number of iterations.
#' 
#' If cross-validation was performed, the 'best' number of trees as estimated
#' by cross-validation error is displayed. If a test set was used, the 'best'
#' number of trees as estimated by the test set error is displayed.
#' 
#' The number of available predictors, and the number of those having non-zero
#' influence on predictions is given (which might be interesting in data mining
#' applications).
#' 
#' If multinomial, bernoulli or adaboost was used, the confusion matrix and
#' prediction accuracy are printed (objects being allocated to the class with
#' highest probability for multinomial and bernoulli). These classifications
#' are performed on the entire training data using the model with the 'best'
#' number of trees as described above, or the maximum number of trees if the
#' 'best' cannot be computed.
#' 
#' If the 'distribution' was specified as gaussian, laplace, quantile or
#' t-distribution, a summary of the residuals is displayed.  The residuals are
#' for the training data with the model at the 'best' number of trees, as
#' described above, or the maximum number of trees if the 'best' cannot be
#' computed.
#' 
#' @aliases print.gbm show.gbm
#' @param x an object of class \code{gbm}.
#' @param \dots arguments passed to \code{print.default}.
#' @author Harry Southworth, Daniel Edwards
#' @seealso \code{\link{gbm}}
#' @keywords models nonlinear survival nonparametric
#' @examples
#' 
#' data(iris)
#' iris.mod <- gbm(Species ~ ., distribution="multinomial", data=iris,
#'                  n.trees=2000, shrinkage=0.01, cv.folds=5,
#'                  verbose=FALSE, n.cores=1)
#' iris.mod
#' #data(lung)
#' #lung.mod <- gbm(Surv(time, status) ~ ., distribution="coxph", data=lung,
#' #                 n.trees=2000, shrinkage=0.01, cv.folds=5,verbose =FALSE)
#' #lung.mod

#' @rdname print.gbm
#' @export
print.gbm <- function(x, ... )
{
   if (!is.null(x$call)){ print(x$call) }
   dist.name <- x$distribution$name
   if (dist.name == "pairwise")
   {
      if (!is.null(x$distribution$max.rank) && x$distribution$max.rank > 0)
      {
           dist.name <- sprintf("pairwise (metric=%s, max.rank=%d)", x$distribution$metric,  x$distribution$max.rank)
      }
      else
      {
           dist.name <- sprintf("pairwise (metric=%s)", x$distribution$metric)
      }
   }
   cat( paste( "A gradient boosted model with", dist.name, "loss function.\n" ))
   cat( paste( length( x$train.error ), "iterations were performed.\n" ) )
   best <- length( x$train.error )
   if ( !is.null( x$cv.error ) )
   {
      best <- gbm.perf( x, plot.it = FALSE, method="cv" )
      cat( paste("The best cross-validation iteration was ", best, ".\n", sep = "" ) )
   }
   if ( x$train.fraction < 1 )
   {
      best <- gbm.perf( x, plot.it = FALSE, method="test" )
      cat( paste("The best test-set iteration was ", best, ".\n", sep = "" ) )
   }
   if ( is.null( best ) )
   {
      best <- length( x$train.error )
   }
   ri <- relative.influence( x, n.trees=best )
   cat( "There were", length( x$var.names ), "predictors of which",
       sum( ri > 0 ), "had non-zero influence.\n" )

   invisible()
}


#' @rdname print.gbm
#' 
#' @export
show.gbm <- print.gbm


#' Summary of a gbm object
#' 
#' Computes the relative influence of each variable in the gbm object.
#' 
#' For \code{distribution="gaussian"} this returns exactly the reduction of
#' squared error attributable to each variable. For other loss functions this
#' returns the reduction attributable to each variable in sum of squared error
#' in predicting the gradient on each iteration. It describes the relative
#' influence of each variable in reducing the loss function. See the references
#' below for exact details on the computation.
#' 
#' @param object a \code{gbm} object created from an initial call to
#' \code{\link{gbm}}.
#' @param cBars the number of bars to plot. If \code{order=TRUE} the only the
#' variables with the \code{cBars} largest relative influence will appear in
#' the barplot. If \code{order=FALSE} then the first \code{cBars} variables
#' will appear in the plot. In either case, the function will return the
#' relative influence of all of the variables.
#' @param n.trees the number of trees used to generate the plot. Only the first
#' \code{n.trees} trees will be used.
#' @param plotit an indicator as to whether the plot is generated.
#' @param order an indicator as to whether the plotted and/or returned relative
#' influences are sorted.
#' @param method The function used to compute the relative influence.
#' \code{\link{relative.influence}} is the default and is the same as that
#' described in Friedman (2001). The other current (and experimental) choice is
#' \code{\link{permutation.test.gbm}}. This method randomly permutes each
#' predictor variable at a time and computes the associated reduction in
#' predictive performance. This is similar to the variable importance measures
#' Breiman uses for random forests, but \code{gbm} currently computes using the
#' entire training dataset (not the out-of-bag observations).
#' @param normalize if \code{FALSE} then \code{summary.gbm} returns the
#' unnormalized influence.
#' @param ...  other arguments passed to the plot function.
#' @return Returns a data frame where the first component is the variable name
#' and the second is the computed relative influence, normalized to sum to 100.
#' @author Greg Ridgeway \email{gregridgeway@@gmail.com}
#' @seealso \code{\link{gbm}}
#' @references J.H. Friedman (2001). "Greedy Function Approximation: A Gradient
#' Boosting Machine," Annals of Statistics 29(5):1189-1232.
#' 
#' L. Breiman
#' (2001).\url{https://www.stat.berkeley.edu/users/breiman/randomforest2001.pdf}.
#' @keywords hplot
#' 
#' @export summary.gbm
#' @export
summary.gbm <- function(object,
                        cBars=length(object$var.names),
                        n.trees=object$n.trees,
                        plotit=TRUE,
                        order=TRUE,
                        method=relative.influence,
                        normalize=TRUE,
                        ...)
{
   if(n.trees < 1)
   {
      stop("n.trees must be greater than 0.")
   }
   if(n.trees > object$n.trees)
   {
      warning("Exceeded total number of GBM terms. Results use n.trees=",object$n.trees," terms.\n")
      n.trees <- object$n.trees
   }

   rel.inf <- method(object,n.trees)
   rel.inf[rel.inf<0] <- 0

   if(order)
   {
      i <- order(-rel.inf)
   }
   else
   {
      i <- 1:length(rel.inf)
   }
   if(cBars==0) cBars <- min(10,length(object$var.names))
   if(cBars>length(object$var.names)) cBars <- length(object$var.names)

   if(normalize) rel.inf <- 100*rel.inf/sum(rel.inf)

   if(plotit)
   {
      barplot(rel.inf[i[cBars:1]],
              horiz=TRUE,
              col=rainbow(cBars,start=3/6,end=4/6),
              names=object$var.names[i[cBars:1]],
              xlab="Relative influence",...)
   }
   return(data.frame(var=object$var.names[i],
                     rel.inf=rel.inf[i]))
}