%
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/selectByFilter.R
\name{sbf}
\alias{sbf}
\alias{sbf.default}
\alias{sbf.formula}
\alias{predict.sbf}
\alias{sbf.recipe}
\title{Selection By Filtering (SBF)}
\usage{
sbf(x, ...)
\method{sbf}{default}(x, y, sbfControl = sbfControl(), ...)
\method{sbf}{formula}(form, data, ..., subset, na.action, contrasts = NULL)
\method{sbf}{recipe}(x, data, sbfControl = sbfControl(), ...)
\method{predict}{sbf}(object, newdata = NULL, ...)
}
\arguments{
\item{x}{a data frame containing training data where samples are in rows and
features are in columns. For the recipes method, \code{x} is a recipe object.}
\item{\dots}{for \code{sbf}: arguments passed to the classification or
regression routine (such as \code{\link[randomForest]{randomForest}}). For
\code{predict.sbf}: arguments cannot be passed to the prediction function
using \code{predict.sbf} as it uses the function originally specified for
prediction.}
\item{y}{a numeric or factor vector containing the outcome for each sample.}
\item{sbfControl}{a list of values that define how this function acts. See
\code{\link{sbfControl}}. (NOTE: If given, this argument must be named.)}
\item{form}{A formula of the form \code{y ~ x1 + x2 + ...}}
\item{data}{Data frame from which variables specified in \code{formula} are
preferentially to be taken.}
\item{subset}{An index vector specifying the cases to be used in the
training sample. (NOTE: If given, this argument must be named.)}
\item{na.action}{A function to specify the action to be taken if NAs are
found. The default action is for the procedure to fail. An alternative is
na.omit, which leads to rejection of cases with missing values on any
required variable. (NOTE: If given, this argument must be named.)}
\item{contrasts}{a list of contrasts to be used for some or all the factors
appearing as variables in the model formula.}
\item{object}{an object of class \code{sbf}}
\item{newdata}{a matrix or data frame of predictors. The object must have
non-null column names}
}
\value{
for \code{sbf}, an object of class \code{sbf} with elements:
\item{pred}{if \code{sbfControl$saveDetails} is \code{TRUE}, this is a list
of predictions for the hold-out samples at each resampling iteration.
Otherwise it is \code{NULL}} \item{variables}{a list of variable names that
survived the filter at each resampling iteration} \item{results}{a data
frame of results aggregated over the resamples} \item{fit}{the final model
fit with only the filtered variables} \item{optVariables}{the names of the
variables that survived the filter using the training set} \item{call}{the
function call} \item{control}{the control object} \item{resample}{if
\code{sbfControl$returnResamp} is "all", a data frame of the resampled
performance measures. Otherwise, \code{NULL}} \item{metrics}{a character
vector of names of the performance measures} \item{dots}{a list of optional
arguments that were passed in}
For \code{predict.sbf}, a vector of predictions.
}
\description{
Model fitting after applying univariate filters
}
\details{
More details on this function can be found at
\url{https://topepo.github.io/caret/feature-selection-using-univariate-filters.html}.
This function can be used to get resampling estimates for models when
simple, filter-based feature selection is applied to the training data.
For each iteration of resampling, the predictor variables are univariately
filtered prior to modeling. Performance of this approach is estimated using
resampling. The same filter and model are then applied to the entire
training set and the final model (and final features) are saved.
\code{sbf} can be used with "explicit parallelism", where different
resamples (e.g. cross-validation group) can be split up and run on multiple
machines or processors. By default, \code{sbf} will use a single processor
on the host machine. As of version 4.99 of this package, the framework used
for parallel processing uses the \pkg{foreach} package. To run the resamples
in parallel, the code for \code{sbf} does not change; prior to the call to
\code{sbf}, a parallel backend is registered with \pkg{foreach} (see the
examples below).
The modeling and filtering techniques are specified in
\code{\link{sbfControl}}. Example functions are given in
\code{\link{lmSBF}}.
}
\examples{
\dontrun{
data(BloodBrain)
## Use a GAM as the filter, then fit a random forest model
RFwithGAM <- sbf(bbbDescr, logBBB,
sbfControl = sbfControl(functions = rfSBF,
verbose = FALSE,
method = "cv"))
RFwithGAM
predict(RFwithGAM, bbbDescr[1:10,])
## classification example with parallel processing
## library(doMC)
## Note: if the underlying model also uses foreach, the
## number of cores specified above will double (along with
## the memory requirements)
## registerDoMC(cores = 2)
data(mdrr)
mdrrDescr <- mdrrDescr[,-nearZeroVar(mdrrDescr)]
mdrrDescr <- mdrrDescr[, -findCorrelation(cor(mdrrDescr), .8)]
set.seed(1)
filteredNB <- sbf(mdrrDescr, mdrrClass,
sbfControl = sbfControl(functions = nbSBF,
verbose = FALSE,
method = "repeatedcv",
repeats = 5))
confusionMatrix(filteredNB)
}
}
\seealso{
\code{\link{sbfControl}}
}
\author{
Max Kuhn
}
\keyword{models}
%