File: learning_curve.R

Package: r-cran-caret 7.0-1+dfsg-1
#' Create Data to Plot a Learning Curve
#'
#' For a given model, this function fits several versions of it on different
#' sizes of the total training set and returns the performance results.
#'
#' This function creates a data set that can be used to plot how well the model
#' performs over differently sized versions of the training set. For each data
#' set size, the performance metrics are determined and saved. If
#' \code{test_prop == 0}, the apparent measure of performance (i.e.
#' re-predicting the training set) and the resampled estimate of performance
#' are available. Otherwise, the test set results are also added.
#'
#' If the model being fit has tuning parameters, the results are based on the
#' optimal settings determined by \code{\link{train}}.
#'
#' @param dat the training data
#' @param outcome a character string identifying the outcome column name
#' @param proportion the incremental proportions of the training set that are
#' used to fit the model
#' @param test_prop an optional proportion of the data to be used to measure
#' performance.
#' @param verbose a logical to print logs to the screen as models are fit
#' @param \dots options to pass to \code{\link{train}} to specify the model.
#' These should not include \code{x}, \code{y}, \code{formula}, or \code{data}.
#' If \code{trainControl} is used here, do not use \code{method = "none"}.
#' @return a data frame with columns for each performance metric calculated by
#' \code{\link{train}} as well as columns: \item{Training_Size }{the number of
#' data points used in the current model fit} \item{Data }{which data were used
#' to calculate performance. Values are "Resampling", "Training", and
#' (optionally) "Testing"} In the results, each data set size will have one row
#' for the apparent error rate, one row for the test set results (if used) and
#' as many rows as resamples (e.g. 10 rows if 10-fold CV is used).
#' @author Max Kuhn
#' @seealso \code{\link{train}}
#' @keywords models
#' @examples
#'
#' \dontrun{
#' set.seed(1412)
#' class_dat <- twoClassSim(1000)
#'
#' ctrl <- trainControl(classProbs = TRUE,
#'                      summaryFunction = twoClassSummary)
#'
#' set.seed(29510)
#' lda_data <-
#'   learning_curve_dat(dat = class_dat,
#'                      outcome = "Class",
#'                      test_prop = 1/4,
#'                      ## `train` arguments:
#'                      method = "lda",
#'                      metric = "ROC",
#'                      trControl = ctrl)
#'
#'
#'
#' ggplot(lda_data, aes(x = Training_Size, y = ROC, color = Data)) +
#'   geom_smooth(method = loess, span = .8) +
#'   theme_bw()
#'  }
#'
#' @export learning_curve_dat
learning_curve_dat <- function(dat,
                               outcome = NULL,
                               proportion = (1:10)/10,
                               test_prop = 0,
                               verbose = TRUE, ...) {
  if(is.null(outcome))
    stop("Please give a character stirng for the outcome column name")
  proportion <- sort(unique(proportion))
  n_size <- length(proportion)

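  ## When a test proportion is given, hold out a stratified test set;
  ## the remaining rows are the pool used for model fitting.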
  if(test_prop > 0) {
    for_model <- createDataPartition(dat[, outcome], p = 1 - test_prop, list = FALSE)
  } else for_model <- 1:nrow(dat)

  n <- length(for_model)

  resampled <- vector(mode = "list", length = n_size)
  tested <- if(test_prop > 0) resampled else NULL
  apparent <- resampled
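  ## Fit one model per training-set proportion.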
  for(i in seq(along.with = proportion)) {
    if(verbose) cat("Training for ", round(proportion[i]*100, 1),
                    "% (n = ", floor(n*proportion[i]), ")\n", sep = "")
    in_mod <- if(proportion[i] < 1) sample(for_model, size = floor(n*proportion[i])) else for_model
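    ## Fit the model on the current subsample; the ... arguments
    ## (e.g. method, metric, trControl) are passed through to train().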
    mod <- train(x = dat[in_mod, colnames(dat) != outcome, drop = FALSE],
                 y = dat[in_mod, outcome],
                 ...)
    if (mod$control$method == "none")
      stop("`learning_curve_dat` uses resampling so please choose a value of ",
           "`method` that is not 'none'", call. = FALSE)

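    ## Capture the performance metric names on the first pass, then keep
    ## the resampling results for the optimal tuning parameters
    ## (merging on bestTune) and record the training-set size.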
    if(i == 1) perf_names <- mod$perfNames
    resampled[[i]] <- merge(mod$resample, mod$bestTune)
    resampled[[i]]$Training_Size <- length(in_mod)

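    ## If a test set was held out, score it with the same summary
    ## function used during resampling; class-probability models go
    ## through extractProb(), the others through extractPrediction().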
    if(test_prop > 0) {
      if(!mod$control$classProbs) {
        test_preds <- extractPrediction(list(model = mod),
                                        testX = dat[-for_model, colnames(dat) != outcome, drop = FALSE],
                                        testY = dat[-for_model, outcome])
      } else {
        test_preds <- extractProb(list(model = mod),
                                  testX = dat[-for_model, colnames(dat) != outcome, drop = FALSE],
                                  testY = dat[-for_model, outcome])
      }
      test_perf <- mod$control$summaryFunction(test_preds, lev = mod$finalModel$obsLevels)
      test_perf <- as.data.frame(t(test_perf), stringsAsFactors = FALSE)
      test_perf$Training_Size <- length(in_mod)
      tested[[i]] <- test_perf
      try(rm(test_preds, test_perf), silent = TRUE)
    }

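    ## Apparent (re-substitution) performance: re-predict the same
    ## rows that were used to fit the model.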
    if(!mod$control$classProbs) {
      app_preds <- extractPrediction(list(model = mod),
                                     testX = dat[in_mod, colnames(dat) != outcome, drop = FALSE],
                                     testY = dat[in_mod, outcome])
    } else {
      app_preds <- extractProb(list(model = mod),
                               testX = dat[in_mod, colnames(dat) != outcome, drop = FALSE],
                               testY = dat[in_mod, outcome])
    }
    app_perf <- mod$control$summaryFunction(app_preds, lev = mod$finalModel$obsLevels)
    app_perf <- as.data.frame(t(app_perf), stringsAsFactors = FALSE)
    app_perf$Training_Size <- length(in_mod)
    apparent[[i]] <- app_perf

    try(rm(mod, in_mod, app_preds, app_perf), silent = TRUE)
  }

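  ## Combine the per-size results, keep the performance columns plus
  ## Training_Size, and label each block via the Data column.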
  resampled <- do.call("rbind", resampled)
  resampled <- resampled[, c(perf_names, "Training_Size")]
  resampled$Data <- "Resampling"
  apparent <- do.call("rbind", apparent)
  apparent <- apparent[, c(perf_names, "Training_Size")]
  apparent$Data <- "Training"
  out <- rbind(resampled, apparent)
  if(test_prop > 0) {
    tested <- do.call("rbind", tested)
    tested <- tested[, c(perf_names, "Training_Size")]
    tested$Data <- "Testing"
    out <- rbind(out, tested)
  }
  out
}