File: varImp.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/varImp.R, R/varImp.train.R
\name{varImp}
\alias{varImp}
\alias{varImp.train}
\alias{varImp.earth}
\alias{varImp.rpart}
\alias{varImp.randomForest}
\alias{varImp.gbm}
\alias{varImp.regbagg}
\alias{varImp.classbagg}
\alias{varImp.pamrtrained}
\alias{varImp.lm}
\alias{varImp.mvr}
\alias{varImp.bagEarth}
\alias{varImp.bagFDA}
\alias{varImp.RandomForest}
\alias{varImp.rfe}
\alias{varImp.dsa}
\alias{varImp.fda}
\alias{varImp.multinom}
\alias{varImp.cubist}
\alias{varImp.plsda}
\alias{varImp.JRip}
\alias{varImp.PART}
\alias{varImp.nnet}
\alias{varImp.C5.0}
\alias{varImp.glmnet}
\alias{varImp.glm}
\alias{varImp.avNNet}
\alias{varImp.RRF}
\alias{varImp.gam}
\alias{varImp.Gam}
\title{Calculation of variable importance for regression and classification models}
\usage{
varImp(object, ...)

\method{varImp}{bagEarth}(object, ...)

\method{varImp}{bagFDA}(object, ...)

\method{varImp}{C5.0}(object, ...)

\method{varImp}{cubist}(object, weights = c(0.5, 0.5), ...)

\method{varImp}{dsa}(object, cuts = NULL, ...)

\method{varImp}{glm}(object, ...)

\method{varImp}{glmnet}(object, lambda = NULL, ...)

\method{varImp}{JRip}(object, ...)

\method{varImp}{multinom}(object, ...)

\method{varImp}{nnet}(object, ...)

\method{varImp}{avNNet}(object, ...)

\method{varImp}{PART}(object, ...)

\method{varImp}{RRF}(object, ...)

\method{varImp}{rpart}(object, surrogates = FALSE, competes = TRUE,
  ...)

\method{varImp}{randomForest}(object, ...)

\method{varImp}{gbm}(object, numTrees = NULL, ...)

\method{varImp}{classbagg}(object, ...)

\method{varImp}{regbagg}(object, ...)

\method{varImp}{pamrtrained}(object, threshold, data, ...)

\method{varImp}{lm}(object, ...)

\method{varImp}{mvr}(object, estimate = NULL, ...)

\method{varImp}{earth}(object, value = "gcv", ...)

\method{varImp}{RandomForest}(object, ...)

\method{varImp}{plsda}(object, ...)

\method{varImp}{fda}(object, value = "gcv", ...)

\method{varImp}{gam}(object, ...)

\method{varImp}{Gam}(object, ...)

\method{varImp}{train}(object, useModel = TRUE, nonpara = TRUE,
  scale = TRUE, ...)
}
\arguments{
\item{object}{an object corresponding to a fitted model}

\item{\dots}{parameters to pass to the specific \code{varImp} methods}

\item{weights}{a numeric vector of length two that weighs the usage of
variables in the rule conditions and the usage in the linear models (see
details below).}

\item{cuts}{the number of rule sets to use in the model (for \code{partDSA}
only)}

\item{lambda}{a single value of the penalty parameter}

\item{surrogates}{should surrogate splits contribute to the importance
calculation?}

\item{competes}{should competing splits contribute to the importance
calculation?}

\item{numTrees}{the number of iterations (trees) to use in a boosted tree
model}

\item{threshold}{the shrinkage threshold (\code{pamr} models only)}

\item{data}{the training set predictors (\code{pamr} models only)}

\item{estimate}{which estimate of performance should be used? See
\code{\link[pls]{mvrVal}}}

\item{value}{the statistic that will be used to calculate importance: either
\code{gcv}, \code{nsubsets}, or \code{rss}}

\item{useModel}{use a model-based technique for measuring variable
importance? This is only used for some models (lm, pls, rf, rpart, gbm, pam,
and mars)}

\item{nonpara}{should nonparametric methods be used to assess the
relationship between the features and response (only used with
\code{useModel = FALSE} and only passed to \code{filterVarImp}).}

\item{scale}{should the importance values be scaled to 0 and 100?}
}
\value{
A data frame with class \code{c("varImp.train", "data.frame")} for
\code{varImp.train} or a matrix for other models.
}
\description{
A generic method for calculating variable importance for objects produced
by \code{train}, along with model-specific methods.
}
\details{
For models that do not have corresponding \code{varImp} methods, see
\code{\link{filterVarImp}}.

Otherwise:

\bold{Linear Models}: the absolute value of the t--statistic for each model
parameter is used.
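
For example, a minimal sketch (the \code{mtcars} data and formula below are
only for illustration):
\preformatted{
library(caret)
fit <- lm(mpg ~ cyl + disp + hp + wt, data = mtcars)
## essentially abs(coef(summary(fit))[, "t value"]), intercept excluded
varImp(fit)
}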

\bold{\code{glmboost}} and \bold{\code{glmnet}}: the absolute values of the
coefficients corresponding to the tuned model are used.
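
For instance, a minimal sketch for \code{glmnet} (the data and the penalty
value below are purely illustrative):
\preformatted{
library(caret)
library(glmnet)
x <- as.matrix(mtcars[, -1])
y <- mtcars$mpg
fit <- glmnet(x, y)
## absolute coefficients extracted at a single penalty value
varImp(fit, lambda = 0.25)
}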

\bold{Random Forest}: \code{varImp.randomForest} and
\code{varImp.RandomForest} are wrappers around the importance functions from
the \pkg{randomForest} and \pkg{party} packages, respectively.
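
A minimal sketch, assuming the \pkg{randomForest} package is installed:
\preformatted{
library(caret)
library(randomForest)
set.seed(1)
fit <- randomForest(Species ~ ., data = iris, importance = TRUE)
## wraps randomForest::importance()
varImp(fit)
}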

\bold{Partial Least Squares}: the variable importance measure here is based
on weighted sums of the absolute regression coefficients. The weights are a
function of the reduction of the sums of squares across the number of PLS
components and are computed separately for each outcome. Therefore, the
contribution of each coefficient is weighted proportionally to the
reduction in the sums of squares.
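
A sketch for a model fit directly with the \pkg{pls} package (the data and
the number of components are illustrative):
\preformatted{
library(caret)
library(pls)
fit <- plsr(mpg ~ ., data = mtcars, ncomp = 3)
## weighted sums of the absolute coefficients across components
varImp(fit)
}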

\bold{Recursive Partitioning}: The reduction in the loss function (e.g. mean
squared error) attributed to each variable at each split is tabulated and
the sum is returned. Also, since there may be candidate variables that are
important but are not used in a split, the top competing variables are also
tabulated at each split. This can be turned off using the \code{maxcompete}
argument in \code{rpart.control}. This method does not currently provide
class--specific measures of importance when the response is a factor.
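
For example (using the \code{kyphosis} data from the \pkg{rpart} package for
illustration):
\preformatted{
library(caret)
library(rpart)
fit <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis)
varImp(fit)                   ## includes competing splits (the default)
varImp(fit, competes = FALSE) ## used splits only
}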

\bold{Bagged Trees}: The same methodology as a single tree is applied to all
bootstrapped trees and the total importance is returned.
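
A minimal sketch, assuming the \pkg{ipred} package is installed:
\preformatted{
library(caret)
library(ipred)
set.seed(1)
fit <- bagging(Species ~ ., data = iris, nbagg = 10)
## per-tree importances summed over the bootstrapped trees
varImp(fit)
}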

\bold{Boosted Trees}: \code{varImp.gbm} is a wrapper around the relative
influence function from that package (see the \pkg{gbm} package vignette).
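
For instance (the settings below are illustrative; \code{numTrees} limits
the number of iterations used in the calculation):
\preformatted{
library(caret)
library(gbm)
set.seed(1)
fit <- gbm(mpg ~ ., data = mtcars, distribution = "gaussian",
           n.trees = 100, n.minobsinnode = 5)
## importance based on the first 50 iterations only
varImp(fit, numTrees = 50)
}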

\bold{Multivariate Adaptive Regression Splines}: MARS models include a
backwards elimination feature selection routine that looks at reductions in
the generalized cross-validation (GCV) estimate of error. The \code{varImp}
function tracks the changes in model statistics, such as the GCV, for each
predictor and accumulates the reduction in the statistic when each
predictor's feature is added to the model. This total reduction is used as
the variable importance measure. If a predictor was never used in any of the
MARS basis functions in the final model (after pruning), it has an
importance value of zero. Prior to June 2008, the package used an internal
function for these calculations. Currently, \code{varImp} is a wrapper
around the \code{\link[earth]{evimp}} function in the \pkg{earth} package.
There are three statistics that can be used to estimate variable importance
in MARS models. Using \code{varImp(object, value = "gcv")} tracks the
reduction in the generalized cross-validation statistic as terms are added.
However, there are some cases when terms are retained in the model that
result in an increase in GCV. Negative variable importance values for MARS
are set to zero.  Alternatively, using \code{varImp(object, value = "rss")}
monitors the change in the residual sums of squares (RSS) as terms are
added, which will never be negative. Also, the option \code{varImp(object,
value = "nsubsets")} counts the number of model subsets (in the final,
pruned model) in which the variable is used.
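
A minimal sketch showing the three options (assuming the \pkg{earth} package
is installed):
\preformatted{
library(caret)
library(earth)
fit <- earth(mpg ~ ., data = mtcars)
varImp(fit, value = "gcv")       ## reduction in GCV (negatives set to zero)
varImp(fit, value = "rss")       ## reduction in the RSS
varImp(fit, value = "nsubsets")  ## number of subsets using the variable
}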

\bold{Nearest shrunken centroids}: The difference between the class
centroids and the overall centroid is used to measure the variable influence
(see \code{pamr.predict}). The larger the difference between the class
centroid and the overall center of the data, the larger the separation
between the classes. The training set predictors must be supplied when an
object of class \code{pamrtrained} is given to \code{varImp}.
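
A heavily hedged sketch (assuming the \pkg{pamr} package; note that
\pkg{pamr} stores predictors as rows and samples as columns, and the
\code{data} argument is assumed here to be that same predictor matrix):
\preformatted{
library(caret)
library(pamr)
set.seed(1)
dat <- list(x = t(as.matrix(iris[, 1:4])), y = iris$Species)
fit <- pamr.train(dat)
## the training set predictors and a shrinkage threshold are required
varImp(fit, threshold = 1, data = dat$x)
}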

\bold{Cubist}: The Cubist output contains variable usage statistics. It
gives the percentage of times each variable was used in a condition
and/or a linear model. Note that this output will probably be inconsistent
with the rules shown in the output from
\code{\link[Cubist]{summary.cubist}}. At each split of the tree, Cubist
saves a linear model (after feature selection) that is allowed to have terms
for each variable used in the current split or any split above it. Quinlan
(1992) discusses a smoothing algorithm where each model prediction is a
linear combination of the parent and child model along the tree. As such,
the final prediction is a function of all the linear models from the initial
node to the terminal node. The percentages shown in the Cubist output
reflect all the models involved in prediction (as opposed to the terminal
models shown in the output). The variable importance used here is a linear
combination of the usage in the rule conditions and the model.
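
For example, with equal weight on the rule conditions and the linear models
(the default):
\preformatted{
library(caret)
library(Cubist)
fit <- cubist(x = mtcars[, -1], y = mtcars$mpg)
varImp(fit, weights = c(0.5, 0.5))
}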

\bold{PART} and \bold{JRip}: For these rule-based models, the importance for
a predictor is simply the number of rules that involve the predictor.

\bold{C5.0}: C5.0 measures predictor importance by determining the
percentage of training set samples that fall into all the terminal nodes
after the split. For example, the predictor in the first split automatically
has an importance measurement of 100 percent since all samples are affected
by this split. Other predictors may be used frequently in splits, but if the
terminal nodes cover only a handful of training set samples, the importance
scores may be close to zero. The same strategy is applied to rule-based
models and boosted versions of the model. The underlying function can also
return the number of times each predictor was involved in a split by using
the option \code{metric = "splits"}.
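
A minimal sketch, assuming the \pkg{C50} package is installed:
\preformatted{
library(caret)
library(C50)
fit <- C5.0(Species ~ ., data = iris)
varImp(fit)                    ## percentage of samples covered (default)
varImp(fit, metric = "splits") ## split counts via C50::C5imp
}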

\bold{Neural Networks}: The method used here is based on Gevrey et al.
(2003), which uses combinations of the absolute values of the weights. For
classification models, the class-specific importances will be the same.
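
For instance:
\preformatted{
library(caret)
library(nnet)
set.seed(1)
fit <- nnet(Species ~ ., data = iris, size = 3, trace = FALSE)
## weight-based importance following Gevrey et al. (2003)
varImp(fit)
}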

\bold{Recursive Feature Elimination}: Variable importance is computed using
the ranking method used for feature selection. For the final subset size,
the importances for the models across all resamples are averaged to compute
an overall value.
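
A sketch using linear model functions for the RFE wrapper (the subset sizes
and resampling scheme are illustrative):
\preformatted{
library(caret)
set.seed(1)
ctrl <- rfeControl(functions = lmFuncs, method = "cv", number = 5)
prof <- rfe(x = mtcars[, -1], y = mtcars$mpg, sizes = c(2, 5),
            rfeControl = ctrl)
## importances averaged across resamples for the final subset size
varImp(prof)
}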

\bold{Feature Selection via Univariate Filters}: the percentage of
resamples in which a predictor was selected is determined. In other words,
an importance of 0.50 means that the predictor survived the filter in half
of the resamples.
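
Finally, a minimal sketch of the \code{train} method itself (assuming the
\pkg{rpart} package is installed):
\preformatted{
library(caret)
set.seed(1)
fit <- train(Species ~ ., data = iris, method = "rpart")
varImp(fit, scale = TRUE)     ## model-based importance, rescaled to 0-100
varImp(fit, useModel = FALSE) ## filter-based scores via filterVarImp
}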
}
\references{
Gevrey, M., Dimopoulos, I., & Lek, S. (2003). Review and
comparison of methods to study the contribution of variables in artificial
neural network models. Ecological Modelling, 160(3), 249-264.

Quinlan, J. (1992). Learning with continuous classes. Proceedings of the 5th
Australian Joint Conference On Artificial Intelligence, 343-348.
}
\author{
Max Kuhn
}
\keyword{models}