% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/aaa.R, R/postResample.R, R/prec_rec.R
\name{defaultSummary}
\alias{defaultSummary}
\alias{postResample}
\alias{twoClassSummary}
\alias{prSummary}
\alias{getTrainPerf}
\alias{mnLogLoss}
\alias{R2}
\alias{RMSE}
\alias{multiClassSummary}
\alias{MAE}
\title{Calculates performance across resamples}
\usage{
defaultSummary(data, lev = NULL, model = NULL)
postResample(pred, obs)
twoClassSummary(data, lev = NULL, model = NULL)
mnLogLoss(data, lev = NULL, model = NULL)
multiClassSummary(data, lev = NULL, model = NULL)
prSummary(data, lev = NULL, model = NULL)
}
\arguments{
\item{data}{a data frame with columns \code{obs} and
\code{pred} for the observed and predicted outcomes. For metrics
that rely on class probabilities, such as
\code{twoClassSummary}, columns should also include predicted
probabilities for each class. See the \code{classProbs} argument
to \code{\link{trainControl}}.}
\item{lev}{a character vector of factor levels for the
response. In regression cases, this would be \code{NULL}.}
\item{model}{a character string for the model name (as taken
from the \code{method} argument of \code{\link{train}}).}
\item{pred}{A vector of numeric data (could be a factor)}
\item{obs}{A vector of numeric data (could be a factor)}
}
\value{
A vector of performance estimates.
}
\description{
Given two numeric vectors of data, the root mean squared error,
R-squared and mean absolute error are calculated. For two factors,
the overall agreement rate and Kappa are determined.
}
\details{
\code{postResample} is meant to be used with \code{apply}
across a matrix. For numeric data the code checks to see if the
standard deviation of either vector is zero. If so, the
correlation between those samples is assigned a value of zero.
\code{NA} values are ignored everywhere.

Note that many models have more predictors (or parameters) than
data points, so the typical mean squared error denominator (n -
p) does not apply. Root mean squared error is calculated using
\code{sqrt(mean((pred - obs)^2))}. Also, \eqn{R^2} is calculated
either as the square of the correlation between the observed and
predicted outcomes when \code{form = "corr"} or, when
\code{form = "traditional"}, as \deqn{ R^2 = 1-\frac{\sum (y_i -
\hat{y}_i)^2}{\sum (y_i - \bar{y})^2} }. Mean absolute error
is calculated using \code{mean(abs(pred - obs))}.
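As an illustration of these formulas, the sketch below (simulated
data, nothing package-specific; in recent versions of the package the
same call also reports the mean absolute error) computes the
quantities directly and compares them with \code{postResample}:
\preformatted{
set.seed(135)
obs  <- rnorm(30)
pred <- obs + rnorm(30, sd = 0.5)

sqrt(mean((pred - obs)^2))                           # root mean squared error
cor(obs, pred)^2                                     # R^2 as a squared correlation
1 - sum((obs - pred)^2) / sum((obs - mean(obs))^2)   # "traditional" R^2
mean(abs(pred - obs))                                # mean absolute error

postResample(pred, obs)             # the same statistics in one call
}
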
\code{defaultSummary} is the default function to compute
performance metrics in \code{\link{train}}. It is a wrapper
around \code{postResample}. The first argument is \code{data},
which is a \code{data.frame} with columns named \code{obs} and
\code{pred} for the observed and predicted outcome values
(either numeric data for regression or character values for
classification). The second argument is \code{lev}, a character
vector that has the outcome factor levels or \code{NULL} for a
regression model. The third argument is \code{model}, which can
be used if a summary metric is specific to a model function. If
other columns from the data are required to compute the summary
statistics, but should not be used in the model, the
\code{recipe} method for \code{\link{train}} can be used.
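For a regression outcome, a minimal sketch of such a data frame (the
simulated values here are purely illustrative) is:
\preformatted{
reg_dat <- data.frame(obs = rnorm(20))
reg_dat$pred <- reg_dat$obs + rnorm(20, sd = 0.25)
defaultSummary(reg_dat)   # a named vector of regression metrics
}
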
\code{twoClassSummary} computes sensitivity, specificity and
the area under the ROC curve. \code{mnLogLoss} computes the
minus log-likelihood of the multinomial distribution (without
the constant term): \deqn{ -logLoss = \frac{-1}{n}\sum_{i=1}^n
\sum_{j=1}^C y_{ij} \log(p_{ij}) } where the \code{y} values are
binary indicators for the classes and \code{p} are the predicted
class probabilities.
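Because the \code{y} values are binary indicators, the inner sum just
picks out the probability assigned to the observed class, so the
statistic can be reproduced directly. A hedged sketch with simulated
two-class data (the object names are illustrative only):
\preformatted{
set.seed(42)
lv  <- c("class1", "class2")
sim <- data.frame(obs = factor(sample(lv, 20, replace = TRUE), levels = lv),
                  class1 = runif(20))
sim$class2 <- 1 - sim$class1
sim$pred <- factor(ifelse(sim$class1 > .5, lv[1], lv[2]), levels = lv)

p_obs <- ifelse(sim$obs == "class1", sim$class1, sim$class2)
-mean(log(p_obs))           # the formula above, computed directly
mnLogLoss(sim, lev = lv)    # should agree for these non-extreme probabilities
}
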
\code{prSummary} (for precision and recall) computes values for
the default 0.50 probability cutoff as well as the area under
the precision-recall curve across all cutoffs and is labelled as
\code{"AUC"} in the output. If assumes that the first level of
the factor variables corresponds to a relevant result but the
\code{lev} argument can be used to change this.
\code{multiClassSummary} computes some overall measures of for
performance (e.g. overall accuracy and the Kappa statistic) and
several averages of statistics calculated from "one-versus-all"
configurations. For example, if there are three classes, three
sets of sensitivity values are determined and the average is
reported with the name "Mean_Sensitivity". The same is true
for a number of statistics generated by
\code{\link{confusionMatrix}}. With two classes, the basic
sensitivity is reported with the name "Sensitivity".

To use \code{twoClassSummary} and/or \code{mnLogLoss}, the
\code{classProbs} argument of \code{\link{trainControl}} should
be \code{TRUE}. \code{multiClassSummary} can be used without
class probabilities but some statistics (e.g. overall log loss
and the average of per-class area under the ROC curves) will not
be in the result set.

Other functions can be used via the \code{summaryFunction}
argument of \code{\link{trainControl}}. Custom functions must
have the same arguments as \code{defaultSummary}.
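As a hedged sketch for a regression model (the function name
\code{biggerSummary} and the extra median absolute error statistic are
illustrative, not part of the package), a custom function keeps the
same signature and returns a named numeric vector:
\preformatted{
biggerSummary <- function(data, lev = NULL, model = NULL) {
  c(defaultSummary(data, lev, model),
    MedAE = median(abs(data$pred - data$obs)))  # illustrative extra statistic
}

ctrl <- trainControl(method = "cv", number = 5,
                     summaryFunction = biggerSummary)
## then, for example:
## train(y ~ ., data = dat, method = "lm", trControl = ctrl, metric = "RMSE")
}
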
The function \code{getTrainPerf} returns a one row data frame
with the resampling results for the chosen model. The statistics
will have the prefix "\code{Train}" (e.g. "\code{TrainROC}").
There is also a column called "\code{method}" that echoes the
\code{method} argument of the call to \code{\link{train}}.
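A minimal sketch (the \code{mtcars} data and \code{method = "lm"} are
only illustrative choices):
\preformatted{
mod <- train(mpg ~ ., data = mtcars, method = "lm",
             trControl = trainControl(method = "cv", number = 5))
getTrainPerf(mod)   # one row, e.g. TrainRMSE, TrainRsquared, plus 'method'
}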
}
\examples{
predicted <- matrix(rnorm(50), ncol = 5)
observed <- rnorm(10)
apply(predicted, 2, postResample, obs = observed)

classes <- c("class1", "class2")
set.seed(1)
dat <- data.frame(obs = factor(sample(classes, 50, replace = TRUE)),
                  pred = factor(sample(classes, 50, replace = TRUE)),
                  class1 = runif(50))
dat$class2 <- 1 - dat$class1
defaultSummary(dat, lev = classes)
twoClassSummary(dat, lev = classes)
prSummary(dat, lev = classes)
mnLogLoss(dat, lev = classes)
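
## a hedged multi-class sketch: without class probability columns,
## multiClassSummary() only reports the statistics that do not need
## probabilities (e.g. accuracy, Kappa and the "Mean_" summaries)
classes3 <- c("a", "b", "c")
set.seed(2)
dat3 <- data.frame(obs  = factor(sample(classes3, 60, replace = TRUE), levels = classes3),
                   pred = factor(sample(classes3, 60, replace = TRUE), levels = classes3))
multiClassSummary(dat3, lev = classes3)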
}
\references{
Kvalseth. Cautionary note about \eqn{R^2}. American Statistician
(1985) vol. 39 (4) pp. 279-285
}
\seealso{
\code{\link{trainControl}}
}
\author{
Max Kuhn, Zachary Mayer
}
\keyword{utilities}