File: classDist.R

package info (click to toggle)
r-cran-caret 7.0-1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,036 kB
  • sloc: ansic: 210; sh: 10; makefile: 2
file content (173 lines) | stat: -rw-r--r-- 6,229 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#' Compute and predict the distances to class centroids
#'
#' @aliases classDist.default classDist predict.classDist
#' @description This function computes the class centroids and covariance matrix for a training set for determining Mahalanobis distances of samples to each class centroid.
#'
#'
#' @param x a matrix or data frame of predictor variables
#' @param y a numeric or factor vector of class labels
#' @param groups an integer for the number of bins for splitting a numeric outcome
#' @param pca a logical: should principal components analysis be  applied to the dataset prior to splitting the data by class?
#' @param keep an integer for the number of PCA components that should by used to predict new samples (\code{NULL} uses all within a tolerance of \code{sqrt(.Machine$double.eps)})
#' @param object an object of class \code{classDist}
#' @param newdata a matrix or data frame. If \code{vars} was previously specified, these columns should be in \code{newdata}
#' @param trans an optional function that can be applied to each class distance. \code{trans = NULL} will not apply a function
#' @param \dots optional arguments to pass (not currently used)
#'
#' @details
#' For factor outcomes, the data are split into groups for each class
#' and the mean and covariance matrix are calculated. These are then
#' used to compute Mahalanobis distances to the class centers (using
#' \code{predict.classDist} The function will check for non-singular matrices.
#'
#' For numeric outcomes, the data are split into roughly equal sized
#' bins based on \code{groups}. Percentiles are used to split the data.
#'
#' @return
#' for \code{classDist}, an object of class \code{classDist} with
#' elements:
#'   \item{values }{a list with elements for each class. Each element
#'                  contains a mean vector for the class centroid and the
#'                  inverse of the class covariance matrix}
#'   \item{classes}{a character vector of class labels}
#'   \item{pca}{the results of \code{\link[stats]{prcomp}} when
#'              \code{pca = TRUE}}
#'   \item{call}{the function call}
#'   \item{p}{the number of variables}
#'   \item{n}{a vector of samples sizes per class}
#'
#' For \code{predict.classDist}, a matrix with columns for each class.
#' The columns names are the names of the class with the prefix
#' \code{dist.}. In the case of numeric \code{y}, the class labels are
#' the percentiles. For example, of \code{groups = 9}, the variable names
#' would be \code{dist.11.11}, \code{dist.22.22}, etc.
#'
#' @author Max Kuhn
#'
#' @references Forina et al. CAIMAN brothers: A family of powerful classification and class modeling techniques. Chemometrics and Intelligent Laboratory Systems (2009) vol. 96 (2) pp. 239-245
#'
#' @seealso \code{\link[stats]{mahalanobis}}
#'
#' @examples
#' trainSet <- sample(1:150, 100)
#'
#' distData <- classDist(iris[trainSet, 1:4],
#'                       iris$Species[trainSet])
#'
#' newDist <- predict(distData,
#'                    iris[-trainSet, 1:4])
#'
#' splom(newDist, groups = iris$Species[-trainSet])
#'
#' @keywords manip
#' @export
classDist <- function (x, ...)  UseMethod("classDist")

#' @rdname classDist
#' @method classDist default
#' @importFrom stats cov predict quantile prcomp
#' @export
classDist.default <- function(x, y, groups = 5,
                              pca = FALSE,
                              keep = NULL,
                              ...)
{
  if(is.numeric(y))
    {
      y <- cut(y,
               unique(quantile(y, probs = seq(0, 1, length.out = groups + 1))),
               include.lowest = TRUE)
      classLabels <- paste(round((1:groups)/groups*100, 2))
      y <- factor(y)
      cuts <- levels(y)
    } else {
      classLabels <- levels(y)
      cuts <- NULL
    }

  p <- ncol(x)

  if(pca)
    {
      pca <- prcomp(x, center = TRUE, scale. = TRUE,
                    tol = sqrt(.Machine$double.eps))
      keep <- min(keep, ncol(pca$rotation))
      if(!is.null(keep)) pca$rotation <- pca$rotation[, 1:keep, drop = FALSE]
      x <- as.data.frame(predict(pca, newdata = x), stringsAsFactors = FALSE)
    } else pca <- NULL

  x <- split(x, y)

  getStats <- function(u)
    {
      if(nrow(u) < ncol(u))
        stop("there must be more rows than columns for this class")
      A <- try(cov(u), silent = TRUE)
      if(inherits(A, "try-error"))
        stop("Cannot compute the covariance matrix")
      A <- try(solve(A), silent = TRUE)
      if(inherits(A, "try-error"))
        stop("Cannot invert the covariance matrix")
      list(means = colMeans(u, na.rm = TRUE),
           A = A)
    }
  structure(
            list(values = lapply(x, getStats),
                 classes = classLabels,
                 cuts = cuts,
                 pca = pca,
                 call = match.call(),
                 p = p,
                 n = unlist(lapply(x, nrow))),
            class = "classDist")
}

#' @export
print.classDist <- function(x, ...)
  {
    printCall(x$call)

    if(!is.null(x$cuts))
      {
        cat("Classes based on", length(x$cuts) - 1,
            "cuts of the data\n")
        paste(x$cuts, collapse = " ")
        cat("\n")
      }

    if(!is.null(x$pca)) cat("PCA applied,",
                            ncol(x$pca$rotation),
                            "components retained\n\n")
    cat("# predictors variables:", x$p, "\n")
    cat("# samples:",
        paste(
              paste(x$n,
                    ifelse(is.null(x$cuts), " (", " "),
                    names(x$n),
                    ifelse(is.null(x$cuts), ")", ""),
                    sep = ""),
              collapse = ", "),
        "\n")
    invisible(x)
  }

#' @rdname classDist
#' @method predict classDist
#' @importFrom stats mahalanobis predict
#' @export
predict.classDist <- function(object, newdata, trans = log, ...)
{
  if(!is.null(object$pca))
    {
      newdata <- predict(object$pca, newdata = newdata)
    }

  pred <- function(a, x) mahalanobis(x, center = a$means, cov = a$A, inverted = TRUE)

  out <- lapply(object$values, pred, x = newdata)
  out <- do.call("cbind", out)
  colnames(out) <- paste("dist.", object$classes, sep = "")

  if(!is.null(trans)) out <- apply(out, 2, trans)
  out
}