File: bagFDA.R

package info (click to toggle)
r-cran-caret 7.0-1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,036 kB
  • sloc: ansic: 210; sh: 10; makefile: 2
file content (254 lines) | stat: -rw-r--r-- 8,225 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#' Bagged FDA
#' @aliases bagFDA print.bagFDA bagFDA.default bagFDA.formula
#'
#' @description A bagging wrapper for flexible discriminant analysis (FDA) using multivariate adaptive regression splines (MARS) basis functions
#'
#'
#' @param formula A formula of the form \code{y ~ x1 + x2 + ...}
#' @param x matrix or data frame of 'x' values for examples.
#' @param y matrix or data frame of numeric values outcomes.
#' @param weights (case) weights for each example - if missing defaults to 1.
#' @param data Data frame from which variables specified in  'formula' are
#'         preferentially to be taken.
#' @param subset An index vector specifying the cases to be used in the
#'         training sample.  (NOTE: If given, this argument must be
#'         named.)
#' @param na.action A function to specify the action to be taken if 'NA's are
#'         found. The default action is for the procedure to fail.  An
#'         alternative is na.omit, which leads to rejection of cases
#'         with missing values on any required variable.  (NOTE: If
#'         given, this argument must be named.)
#'
#' @param B the number of bootstrap samples
#'
#' @param keepX a logical: should the original training data be kept?
#'
#' @param \dots arguments passed to the \code{mars} function
#'
#' @details The function computes a FDA model for each bootstap sample.
#'
#' @return
#' A list with elements
#' \item{fit }{a list of \code{B} FDA fits}
#' \item{B }{the number of bootstrap samples}
#' \item{call }{the function call}
#' \item{x }{either \code{NULL} or the value of \code{x}, depending on the
#'   value of \code{keepX}}
#' \item{oob}{a matrix of performance estimates for each bootstrap sample}
#'
#' @references J. Friedman, ``Multivariate Adaptive Regression Splines'' (with discussion) (1991).  Annals of Statistics, 19/1, 1-141.
#'
#' @author Max Kuhn (\code{bagFDA.formula} is based on Ripley's \code{nnet.formula})
#'
#' @seealso \code{\link[mda]{fda}}, \code{\link{predict.bagFDA}}
#'
#' @examples
#' library(mlbench)
#' library(earth)
#' data(Glass)
#'
#' set.seed(36)
#' inTrain <- sample(1:dim(Glass)[1], 150)
#'
#' trainData <- Glass[ inTrain, ]
#' testData  <- Glass[-inTrain, ]
#'
#'
#' set.seed(3577)
#' baggedFit <- bagFDA(Type ~ ., trainData)
#' confusionMatrix(data = predict(baggedFit, testData[, -10]),
#'                 reference = testData[, 10])
#'
#' @keywords regression
#'
#' @export
"bagFDA" <-
function(x, ...)
   UseMethod("bagFDA")

#' @rdname bagFDA
#' @method bagFDA default
#' @importFrom stats predict
#' @export
"bagFDA.default" <-
function(x, y, weights = NULL, B = 50, keepX = TRUE, ...)
{
  requireNamespaceQuietStop("mda")
  requireNamespaceQuietStop("earth")
   if(!is.matrix(x)) x <- as.matrix(x)
   if(!is.vector(y) & !is.factor(y)) y <- as.vector(y)
   if(!is.vector(y) & !is.factor(y)) y <- factor(y[,1])
   if(is.null(weights)) weights <- rep(1, dim(x)[1])
   foo <- function(index, x, y, w, ...)
   {
      subX <- x[index,, drop = FALSE]
      subY <- y[index]
      tmp <- as.data.frame(subX, stringsAsFactors = FALSE)
      tmp$.outcome <- subY
      if(!is.null(w)) subW <- w[index]
      fit <- if(is.null(w))
        mda::fda(.outcome ~., data = tmp, method = earth::earth, ...) else
          mda::fda(.outcome ~., data = tmp, method = earth::earth, weights = subW, ...)
      fit$index <- index
      fit
   }

   oobFoo <- function(fit, x, y)
   {
      index <- fit$index
      subX <- x[-index,, drop = FALSE]
      subY <- y[-index]
      predY <- predict(fit, subX)
      postResample(predY, subY)
   }

   btSamples <- createResample(y, times = B)
   btFits <- lapply(btSamples, foo, x = x, y = y, w = weights, ...)
   oobList <- lapply(btFits, oobFoo, x = x, y = y)
   oob <- matrix(unlist(oobList), ncol = length(oobList[[1]]), byrow = TRUE)
   colnames(oob) <- names(oobList[[1]])
   if(keepX) x <- x else x <- NULL
   structure(list(fit = btFits, B = B, oob = oob, x = x, levels = levels(y),
                  weights = !is.null(weights), dots = list(...)), class = "bagFDA")
}

#' @rdname bagFDA
#' @method bagFDA formula
#' @importFrom stats contrasts model.matrix model.response model.weights na.omit
#' @export
"bagFDA.formula" <-
function (formula, data = NULL, B = 50, keepX = TRUE, ..., subset, weights = NULL, na.action = na.omit)
{

   if (!inherits(formula, "formula"))
     stop("method is only for formula objects")
   m <- match.call(expand.dots = FALSE)
   mIndex <- match(c("formula", "data", "subset", "weights", "na.action"), names(m), 0)
   m <- m[c(1, mIndex)]
   m$... <- NULL
   m$na.action <- na.action
   m[[1]] <- as.name("model.frame")
   m <- eval(m, parent.frame())
   Terms <- attr(m, "terms")
   attr(Terms, "intercept") <- 0
   y <- model.response(m)
   w <- model.weights(m)
   x <- model.matrix(Terms, m)
   cons <- attr(x, "contrast")
   xint <- match("(Intercept)", colnames(x), nomatch = 0)
   if (xint > 0)  x <- x[, -xint, drop = FALSE]

   out <- bagFDA.default(x = x, y = y, weights = weights, B = B, keepX = keepX, ...)
   out
}

#' @rdname bagFDA
#' @method print bagFDA
#' @export
"print.bagFDA" <-
function (x, ...)
{
    if(!is.null(x$x))cat("Data:\n   # variables:\t", dim(x$x)[2], "\n   # samples:\t", dim(x$x)[1], "\n")
    cat(
      "\nModel:",
      "\n   B:        \t", x$B,
      "\n   dimension:\t", x$fit[[1]]$dimension,
      "\n")
    if(x$weights) cat("case weights used\n")

    cat("\n")
    invisible(x)
}




#' @rdname predict.bagEarth
#' @importFrom stats predict
#' @export
"predict.bagFDA" <-
function(object, newdata = NULL, type = "class", ...)
{
  requireNamespaceQuietStop("mda")
  requireNamespaceQuietStop("earth")
   getTrainPred <- function(x)
     {
       oobIndex <- 1:nrow(x$fit$fitted.values)
       oobIndex <- oobIndex[!(oobIndex %in% unique(x$index))]
       tmp <- predict(x, type = "posterior")[oobIndex,,drop = FALSE]
       rownames(tmp) <- 1:nrow(tmp)
       out <- data.frame(pred = tmp,
                         sample = oobIndex,
                         check.rows = FALSE)
       colnames(out)[1:ncol(tmp)] <- names(x$prior)
       out
     }

   if(is.null(newdata) & !is.null(object$x)) newdata <- object$x

   if(is.null(newdata))
     {
       pred <- lapply(object$fit, getTrainPred)
    } else {
       pred <- lapply(object$fit,
                      function(x, y)
                      {
                        tmp <- predict(x, newdata = y, type = "posterior")
                        nms <- colnames(tmp)
                        tmp <- as.data.frame(tmp, stringsAsFactors = FALSE)
                        names(tmp) <- nms
                        tmp$sample <- 1:nrow(tmp)
                        tmp
                      },
                      y = newdata)
     }
   pred <- rbind.fill(pred)
   out <- ddply(pred, .(sample),
                function(x) colMeans(x[,seq(along.with = object$levels)], na.rm = TRUE))
   out <- out[,-1,drop = FALSE]
   rownames(out) <- rownames(newdata)
   predClass <- object$levels[apply(out, 1, which.max)]
   predClass <- factor(predClass, levels = object$levels)
   switch(type, class = predClass, probs = out, posterior = out)
}

#' @rdname summary.bagEarth
#' @method summary bagFDA
#' @importFrom stats quantile
#' @export
"summary.bagFDA" <-
function(object, ...)
{

   oobStat <- apply(object$oob, 2, function(x) quantile(x, probs = c(0, 0.025, .5, .975, 1)))

   numTerms <- unlist(lapply(object$fit, function(x) length(x$fit$selected.terms)))
   numVar <- unlist(lapply(
      object$fit,
      function(x)
      {
         sum(
            apply(
               x$fit$dirs,
               2,
               function(u) any(u != 0)))
      }))
   modelInfo <- cbind(numTerms, numVar)
   colnames(modelInfo) <- c("Num Terms", "Num Variables")
   out <- list(modelInfo = modelInfo, oobStat = oobStat)
   class(out) <- "summary.bagFDA"
   out
}

#' @export
"print.summary.bagFDA" <-
function(x, digits = max(3, getOption("digits") - 3), ...)
{
   oobStat <- apply(x$oob, 2, function(x) quantile(x, probs = c(0, 0.025, .25, .5, .75, .975, 1)))
   cat("Out of bag statistics:\n\n")
   print(x$oobStat, digits = digits)
   cat("\nModel Selection Statistics:\n\n")
   print(summary(x$modelInfo))
   cat("\n")
}