#' Create Data to Plot a Learning Curve
#'
#' For a given model, this function fits several versions on differently sized
#' subsets of the total training set and returns the performance results.
#'
#' This function creates a data set that can be used to plot how well the model
#' performs over differently sized versions of the training set. For each data
#' set size, the performance metrics are determined and saved. If
#' \code{test_prop == 0}, the apparent measure of performance (i.e.
#' re-predicting the training set) and the resampled estimate of performance
#' are available. Otherwise, the test set results are also added.
#'
#' If the model being fit has tuning parameters, the results are based on the
#' optimal settings determined by \code{\link{train}}.
#'
#' @param dat the training data
#' @param outcome a character string identifying the outcome column name
#' @param proportion the incremental proportions of the training set that are
#' used to fit the model
#' @param test_prop an optional proportion of the data to be reserved as a test
#' set for measuring performance (a value of 0 means no test set is used).
#' @param verbose a logical to print logs to the screen as models are fit
#' @param \dots options to pass to \code{\link{train}} to specify the model.
#' These should not include \code{x}, \code{y}, \code{formula}, or \code{data}.
#' If \code{trainControl} is used here, do not use \code{method = "none"}.
#' @return a data frame with columns for each performance metric calculated by
#' \code{\link{train}} as well as columns: \item{Training_Size }{the number of
#' data points used in the current model fit} \item{Data }{which data were used
#' to calculate performance. Values are "Resampling", "Training", and
#' (optionally) "Testing"} In the results, each data set size will have one row
#' for the apparent error rate, one row for the test set results (if used) and
#' as many rows as resamples (e.g. 10 rows if 10-fold CV is used).
#' @author Max Kuhn
#' @seealso \code{\link{train}}
#' @keywords models
#' @examples
#'
#' \dontrun{
#' set.seed(1412)
#' class_dat <- twoClassSim(1000)
#'
#' ctrl <- trainControl(classProbs = TRUE,
#'                      summaryFunction = twoClassSummary)
#'
#' set.seed(29510)
#' lda_data <-
#'   learning_curve_dat(dat = class_dat,
#'                      outcome = "Class",
#'                      test_prop = 1/4,
#'                      ## `train` arguments:
#'                      method = "lda",
#'                      metric = "ROC",
#'                      trControl = ctrl)
#'
#' ggplot(lda_data, aes(x = Training_Size, y = ROC, color = Data)) +
#'   geom_smooth(method = loess, span = .8) +
#'   theme_bw()
#' }
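#'
#' ## A similar sketch for a numeric outcome (illustrative, not part of the
#' ## original example); it assumes caret's SLC14_1() regression simulation,
#' ## whose outcome column is "y", and relies on RMSE, the default regression
#' ## metric.
#' \dontrun{
#' set.seed(9083)
#' reg_dat <- SLC14_1(500)
#'
#' set.seed(724)
#' lm_data <-
#'   learning_curve_dat(dat = reg_dat,
#'                      outcome = "y",
#'                      test_prop = 1/4,
#'                      ## `train` arguments:
#'                      method = "lm",
#'                      trControl = trainControl(method = "cv"))
#'
#' ggplot(lm_data, aes(x = Training_Size, y = RMSE, color = Data)) +
#'   geom_smooth(method = loess, span = .8) +
#'   theme_bw()
#' }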
#'
#' @export learning_curve_dat
learning_curve_dat <- function(dat,
                               outcome = NULL,
                               proportion = (1:10)/10,
                               test_prop = 0,
                               verbose = TRUE, ...) {
  if (is.null(outcome))
    stop("Please give a character string for the outcome column name")

  proportion <- sort(unique(proportion))
  n_size <- length(proportion)
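  ## Optionally hold out a test set (stratified on the outcome); the remaining
  ## rows are available for model fitting.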
  if (test_prop > 0) {
    for_model <- createDataPartition(dat[, outcome], p = 1 - test_prop, list = FALSE)
  } else for_model <- 1:nrow(dat)

  n <- length(for_model)
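  ## Holders for the resampled, test set, and apparent (re-prediction) results
  ## at each training set size.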
  resampled <- vector(mode = "list", length = n_size)
  tested <- if (test_prop > 0) resampled else NULL
  apparent <- resampled
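  ## Fit the model to an increasing proportion of the available training data.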
  for (i in seq(along.with = proportion)) {
    if (verbose)
      cat("Training for ", round(proportion[i] * 100, 1),
          "% (n = ", floor(n * proportion[i]), ")\n", sep = "")
    in_mod <- if (proportion[i] < 1) sample(for_model, size = floor(n * proportion[i])) else for_model

    mod <- train(x = dat[in_mod, colnames(dat) != outcome, drop = FALSE],
                 y = dat[in_mod, outcome],
                 ...)
    if (mod$control$method == "none")
      stop("`learning_curve_dat` uses resampling so please choose a value of ",
           "`method` that is not 'none'", call. = FALSE)

    if (i == 1) perf_names <- mod$perfNames
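    ## Resampled performance, merged with the optimal tuning parameter values.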
    resampled[[i]] <- merge(mod$resample, mod$bestTune)
    resampled[[i]]$Training_Size <- length(in_mod)
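    ## If a test set was held out, estimate performance on it with the same
    ## summary function used during resampling.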
    if (test_prop > 0) {
      if (!mod$control$classProbs) {
        test_preds <- extractPrediction(list(model = mod),
                                        testX = dat[-for_model, colnames(dat) != outcome, drop = FALSE],
                                        testY = dat[-for_model, outcome])
      } else {
        test_preds <- extractProb(list(model = mod),
                                  testX = dat[-for_model, colnames(dat) != outcome, drop = FALSE],
                                  testY = dat[-for_model, outcome])
      }
      test_perf <- mod$control$summaryFunction(test_preds, lev = mod$finalModel$obsLevels)
      test_perf <- as.data.frame(t(test_perf), stringsAsFactors = FALSE)
      test_perf$Training_Size <- length(in_mod)
      tested[[i]] <- test_perf
      try(rm(test_preds, test_perf), silent = TRUE)
    }
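    ## Apparent performance: re-predict the same rows used to fit the model.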
    if (!mod$control$classProbs) {
      app_preds <- extractPrediction(list(model = mod),
                                     testX = dat[in_mod, colnames(dat) != outcome, drop = FALSE],
                                     testY = dat[in_mod, outcome])
    } else {
      app_preds <- extractProb(list(model = mod),
                               testX = dat[in_mod, colnames(dat) != outcome, drop = FALSE],
                               testY = dat[in_mod, outcome])
    }
    app_perf <- mod$control$summaryFunction(app_preds, lev = mod$finalModel$obsLevels)
    app_perf <- as.data.frame(t(app_perf), stringsAsFactors = FALSE)
    app_perf$Training_Size <- length(in_mod)
    apparent[[i]] <- app_perf

    try(rm(mod, in_mod, app_preds, app_perf), silent = TRUE)
  }
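  ## Combine the results across training set sizes and label the data source.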
  resampled <- do.call("rbind", resampled)
  resampled <- resampled[, c(perf_names, "Training_Size")]
  resampled$Data <- "Resampling"

  apparent <- do.call("rbind", apparent)
  apparent <- apparent[, c(perf_names, "Training_Size")]
  apparent$Data <- "Training"

  out <- rbind(resampled, apparent)

  if (test_prop > 0) {
    tested <- do.call("rbind", tested)
    tested <- tested[, c(perf_names, "Training_Size")]
    tested$Data <- "Testing"
    out <- rbind(out, tested)
  }
  out
}