File: nearZeroVar.R

#' Identification of near zero variance predictors
#' 
#' \code{nearZeroVar} diagnoses predictors that have one unique value (i.e. are
#' zero variance predictors) or predictors that have both of the following
#' characteristics: they have very few unique values relative to the number of
#' samples and the ratio of the frequency of the most common value to the
#' frequency of the second most common value is large. \code{checkConditionalX}
#' looks at the distribution of the columns of \code{x} conditioned on the
#' levels of \code{y} and identifies columns of \code{x} that are sparse within
#' groups of \code{y}.
#' 
#' For example, a near-zero variance predictor is one that, for 1000 samples,
#' has two distinct values, with 999 of the samples sharing the same value.
#' 
#' To be flagged, first the ratio of the frequency of the most prevalent value
#' to the frequency of the second most prevalent value (the ``frequency
#' ratio'') must be above \code{freqCut}. Second, the ``percent of unique
#' values,'' the number of unique values divided by the total number of
#' samples (times 100), must also be below \code{uniqueCut}.
#' 
#' In the above example, the frequency ratio is 999 and the unique value
#' percentage is 0.2.
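#' 
#' A minimal sketch of how these two metrics are computed (the vector
#' \code{dat} below is illustrative and not part of the original
#' documentation):
#' \preformatted{
#' dat <- c(rep(0, 999), 1)
#' tab <- sort(table(dat), decreasing = TRUE)
#' tab[1] / tab[2]                          # frequency ratio: 999
#' 100 * length(unique(dat)) / length(dat)  # percent unique: 0.2
#' }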
#' 
#' Checking the conditional distribution of \code{x} may be needed for some
#' models, such as naive Bayes, where the conditional distributions should
#' have at least one data point within each class.
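#' 
#' As a minimal illustration (the data below are made up for this sketch and
#' are not part of the original documentation), a predictor that is constant
#' within one class is flagged by \code{checkConditionalX}:
#' \preformatted{
#' y <- factor(rep(c("a", "b"), each = 5))
#' x <- data.frame(x1 = c(rep(0, 5), 0:4))  # x1 is constant when y == "a"
#' checkConditionalX(x, y)                  # returns 1, the position of x1
#' }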
#' 
#' \code{nzv} is the original version of the function.
#' 
#' @aliases nearZeroVar nzv checkResamples checkConditionalX
#' @param x a numeric vector or matrix, or a data frame with all numeric data
#' @param freqCut the cutoff for the ratio of the most common value to the
#' second most common value
#' @param uniqueCut the cutoff for the percentage of distinct values out of the
#' number of total samples
#' @param saveMetrics a logical. If false, the positions of the zero- or
#' near-zero variance predictors are returned. If true, a data frame with
#' predictor information is returned.
#' @param names a logical. If false, column indexes are returned. If true,
#' column names are returned.
#' @param y a factor vector with at least two levels
#' @param index a list. Each element corresponds to the training set samples in
#' \code{x} for a given resample
#' @param foreach should the \pkg{foreach} package be used for the
#' computations? If \code{TRUE}, less memory should be used.
#' @param allowParallel should parallel processing via the \pkg{foreach}
#' package be used for the computations? If \code{TRUE}, more memory will be
#' used but execution time should be shorter (see the sketch at the end of
#' the examples).
#' @return For \code{nearZeroVar}: if \code{saveMetrics = FALSE}, a vector of
#' integers corresponding to the column positions of the problematic
#' predictors. If \code{saveMetrics = TRUE}, a data frame with columns:
#' \item{freqRatio }{the ratio of frequencies for the most common value over
#' the second most common value} \item{percentUnique }{the percentage of unique
#' data points out of the total number of data points} \item{zeroVar }{a vector
#' of logicals for whether the predictor has only one distinct value} \item{nzv
#' }{a vector of logicals for whether the predictor is a near zero variance
#' predictor}
#' 
#' For \code{checkResamples} or \code{checkConditionalX}, a vector of column
#' indices for predictors with empty conditional distributions in at least
#' one class of \code{y}.
#' @author Max Kuhn, with speed improvements to nearZeroVar by Allan Engelhardt
#' @keywords utilities
#' @examples
#' 
#' nearZeroVar(iris[, -5], saveMetrics = TRUE)
#' 
#' data(BloodBrain)
#' nearZeroVar(bbbDescr)
#' nearZeroVar(bbbDescr, names = TRUE)
#' 
#' 
#' set.seed(1)
#' classes <- factor(rep(letters[1:3], each = 30))
#' x <- data.frame(x1 = rep(c(0, 1), 45),
#'                 x2 = c(rep(0, 10), rep(1, 80)))
#' 
#' lapply(x, table, y = classes)
#' checkConditionalX(x, classes)
#' 
#' folds <- createFolds(classes, k = 3, returnTrain = TRUE)
#' x$x3 <- x$x1
#' x$x3[folds[[1]]] <- 0
#' 
#' checkResamples(folds, x, classes)
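#' 
#' ## Sketch (not part of the original examples) of the foreach/parallel path;
#' ## it assumes the doParallel package is installed, so it is not run by default.
#' \dontrun{
#' library(doParallel)
#' cl <- makeCluster(2)
#' registerDoParallel(cl)
#' nearZeroVar(bbbDescr, foreach = TRUE, allowParallel = TRUE)
#' stopCluster(cl)
#' }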
#' 
#' 
#' 
#' @export nearZeroVar
nearZeroVar <- function (x, freqCut = 95/5, uniqueCut = 10, saveMetrics = FALSE, names = FALSE, foreach = FALSE, allowParallel = TRUE) {

  if(!foreach) return(nzv(x, freqCut = freqCut, uniqueCut = uniqueCut, saveMetrics = saveMetrics, names = names))

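  ## choose the foreach operator (%dopar% vs. %do%) depending on whether
  ## parallel execution was requested and a backend with >1 worker is registered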
  `%op%` <- getOper(foreach && allowParallel && getDoParWorkers() > 1)

  if(saveMetrics) {
    res <- foreach(name = colnames(x), .combine=rbind) %op% {
      r <- nzv(x[[name]], freqCut = freqCut, uniqueCut = uniqueCut, saveMetrics = TRUE)
      r[,"column" ] <-  name
      r
    }
    res <- res[, c(5, 1, 2, 3, 4)]
    rownames(res) <- as.character(res$column)
    res$column <- NULL
  } else {
    res <- foreach(name = colnames(x), .combine=c) %op% {
      r <- nzv(x[[name]], freqCut = freqCut, uniqueCut = uniqueCut, saveMetrics = FALSE)
      ## r is either integer(0) (column not flagged) or 1 (flagged); it is never 0
      if (length(r) > 0 && r == 1) TRUE else FALSE
    }
    res <- which(res)
    if(names){
      res <- colnames(x)[res]
    }
  }
  res
}

#' @export
nzv <- function (x, freqCut = 95/5, uniqueCut = 10, saveMetrics = FALSE, names = FALSE)
{
  if (is.null(dim(x))) x <- matrix(x, ncol = 1)
  freqRatio <- apply(x, 2, function(data) {
    t <- table(data[!is.na(data)])
    if (length(t) <= 1) {
      return(0)
    }
    w <- which.max(t)
    max(t, na.rm = TRUE) / max(t[-w], na.rm = TRUE)
  })
  lunique <- apply(x, 2, function(data) length(unique(data[!is.na(data)])))
  percentUnique <- 100 * lunique / apply(x, 2, length)
  zeroVar <- (lunique == 1) | apply(x, 2, function(data) all(is.na(data)))
  if (saveMetrics)
  {
    out <- data.frame(freqRatio = freqRatio,
                      percentUnique = percentUnique,
                      zeroVar = zeroVar,
                      nzv = (freqRatio > freqCut & percentUnique <= uniqueCut) | zeroVar)
  }
  else {
    out <- which((freqRatio > freqCut & percentUnique <= uniqueCut) | zeroVar)
    names(out) <- NULL
    if(names){
      out <- colnames(x)[out]
    }
  }
  out
}

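## Internal (unexported) helper used by checkConditionalX: drops the ".outcome"
## column and returns the positions of columns with fewer than two distinct values.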
zeroVar <- function(x)
{
  x <- x[,colnames(x) != ".outcome", drop = FALSE]
  which(apply(x, 2, function(x) length(unique(x)) < 2))
}

#' @rdname nearZeroVar
#' @export
checkConditionalX <- function(x, y)
{
  x$.outcome <- y
  unique(unlist(dlply(x, .(.outcome), zeroVar)))
}

#' @rdname nearZeroVar
#' @export
checkResamples <- function(index, x, y)
{
  if(!is.factor(y)) stop("y must be a factor")
  if(length(levels(y)) < 2) stop("y must have at least 2 levels")
  wrap <- function(index, x, y) checkConditionalX(x[index,,drop=FALSE], y[index])
  unique(unlist(lapply(index, wrap, x = x, y = y)))
}