\name{rfcv}
\alias{rfcv}
\title{Random Forest Cross-Validation for feature selection}
\description{
This function shows the cross-validated prediction performance of
models with a sequentially reduced number of predictors (ranked by
variable importance) via a nested cross-validation procedure.
}
\usage{
rfcv(trainx, trainy, cv.fold=5, scale="log", step=0.5,
mtry=function(p) max(1, floor(sqrt(p))), recursive=FALSE, ...)
}
\arguments{
\item{trainx}{matrix or data frame containing columns of predictor
variables}
\item{trainy}{vector of response, must have length equal to the number
of rows in \code{trainx}}
\item{cv.fold}{number of folds in the cross-validation}
\item{scale}{if \code{"log"}, reduce a fixed proportion (\code{step})
of variables at each step, otherwise reduce \code{step} variables at a
time}
\item{step}{if \code{scale="log"}, the fraction of variables to remove
at each step; otherwise, the number of variables to remove at a time}
\item{mtry}{a function of the number of remaining predictor variables,
used to set the \code{mtry} parameter in each \code{randomForest} call}
\item{recursive}{whether variable importance is (re-)assessed at each
step of variable reduction}
\item{...}{other arguments passed on to \code{randomForest}}
}
\value{
A list with the following components:
\item{n.var}{vector of number of variables used at each step}
\item{error.cv}{corresponding vector of cross-validated error rates
(for classification) or mean squared errors (for regression) at each
step}
\item{predicted}{list with one component per value of \code{n.var},
each containing the cross-validated predictions at that step}
}
\references{
Svetnik, V., Liaw, A., Tong, C. and Wang, T., ``Application of Breiman's
Random Forest to Modeling Structure-Activity Relationships of
Pharmaceutical Molecules'', MCS 2004, Roli, F. and Windeatt, T. (Eds.)
pp. 334-343.
}
\seealso{
\code{\link{randomForest}}, \code{\link{importance}}
}
\examples{
set.seed(647)
myiris <- cbind(iris[1:4], matrix(runif(96 * nrow(iris)), nrow(iris), 96))
result <- rfcv(myiris, iris$Species, cv.fold=3)
with(result, plot(n.var, error.cv, log="x", type="o", lwd=2))
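## A minimal regression sketch (simulated data, not from the original
## examples): with a numeric response, error.cv holds cross-validated MSE.
set.seed(101)
X <- data.frame(matrix(rnorm(100 * 20), 100, 20))
y <- X[[1]] + 2 * X[[2]] + rnorm(100)
reg.result <- rfcv(X, y, cv.fold=3)
with(reg.result, plot(n.var, error.cv, log="x", type="o", lwd=2))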
## The following can take a while to run, so if you really want to try
## it, copy and paste the code into R.
\dontrun{
result <- replicate(5, rfcv(myiris, iris$Species), simplify=FALSE)
error.cv <- sapply(result, "[[", "error.cv")
matplot(result[[1]]$n.var, cbind(rowMeans(error.cv), error.cv), type="l",
lwd=c(2, rep(1, ncol(error.cv))), col=1, lty=1, log="x",
xlab="Number of variables", ylab="CV Error")
}
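\dontrun{
## Sketch (an illustrative assumption, not part of the original examples):
## supply a custom mtry function and re-rank variable importance at each
## reduction step via recursive=TRUE.
result.rec <- rfcv(myiris, iris$Species, cv.fold=3, recursive=TRUE,
                   mtry=function(p) max(1, floor(p/3)))
with(result.rec, plot(n.var, error.cv, log="x", type="o", lwd=2))
}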
}
\author{Andy Liaw}
\keyword{classif}
\keyword{regression}