File: dummyVar.R

package info (click to toggle)
r-cran-caret 7.0-1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,036 kB
  • sloc: ansic: 210; sh: 10; makefile: 2
file content (320 lines) | stat: -rw-r--r-- 12,396 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
#' Create A Full Set of Dummy Variables
#'
#' \code{dummyVars} creates a full set of dummy variables (i.e. less than full
#' rank parameterization)
#'
#' Most of the \code{\link[stats]{contrasts}} functions in R produce full rank
#' parameterizations of the predictor data. For example,
#' \code{\link[stats]{contr.treatment}} creates a reference cell in the data
#' and defines dummy variables for all factor levels except those in the
#' reference cell. For example, if a factor with 5 levels is used in a model
#' formula alone, \code{\link[stats]{contr.treatment}} creates columns for the
#' intercept and all the factor levels except the first level of the factor.
#' For the data in the Example section below, this would produce:
#' \preformatted{ (Intercept) dayTue dayWed dayThu dayFri daySat daySun
#'            1      0      0      0      0      0      0
#'            1      0      0      0      0      0      0
#'            1      0      0      0      0      0      0
#'            1      0      1      0      0      0      0
#'            1      0      1      0      0      0      0
#'            1      0      0      0      1      0      0
#'            1      0      0      0      0      1      0
#'            1      0      0      0      0      1      0
#'            1      0      0      0      1      0      0}
#'
#' In some situations, there may be a need for dummy variables for all the
#' levels of the factor. For the same example:
#' \preformatted{ dayMon dayTue dayWed dayThu dayFri daySat daySun
#'       1      0      0      0      0      0      0
#'       1      0      0      0      0      0      0
#'       1      0      0      0      0      0      0
#'       0      0      1      0      0      0      0
#'       0      0      1      0      0      0      0
#'       0      0      0      0      1      0      0
#'       0      0      0      0      0      1      0
#'       0      0      0      0      0      1      0
#'       0      0      0      0      1      0      0}
#'
#' Given a formula and initial data set, the class \code{dummyVars} gathers all
#' the information needed to produce a full set of dummy variables for any data
#' set. It uses \code{contr.ltfr} as the base function to do this.
#'
#' \code{class2ind} is most useful for converting a factor outcome vector to a
#' matrix (or vector) of dummy variables.
#'
#' @aliases dummyVars dummyVars.default predict.dummyVars contr.dummy
#' contr.ltfr class2ind
#' @param formula An appropriate R model formula, see References
#' @param data A data frame with the predictors of interest
#' @param sep An optional separator between factor variable names and their
#' levels. Use \code{sep = NULL} for no separator (i.e. normal behavior of
#' \code{\link[stats]{model.matrix}} as shown in the Details section)
#' @param levelsOnly A logical; \code{TRUE} means to completely remove the
#' variable names from the column names
#' @param fullRank A logical; should a full rank or less than full rank
#' parameterization be used? If \code{TRUE}, factors are encoded to be
#' consistent with \code{\link[stats]{model.matrix}} and the resulting there
#' are no linear dependencies induced between the columns.
#' @param object An object of class \code{dummyVars}
#' @param newdata A data frame with the required columns
#' @param na.action A function determining what should be done with missing
#' values in \code{newdata}. The default is to predict \code{NA}.
#' @param n A vector of levels for a factor, or the number of levels.
#' @param contrasts A logical indicating whether contrasts should be computed.
#' @param sparse A logical indicating if the result should be sparse.
#' @param x A factor vector.
#' @param ... additional arguments to be passed to other methods
#' @return The output of \code{dummyVars} is a list of class 'dummyVars' with
#' elements \item{call }{the function call} \item{form }{the model formula}
#' \item{vars }{names of all the variables in the model} \item{facVars }{names
#' of all the factor variables in the model} \item{lvls }{levels of any factor
#' variables} \item{sep }{\code{NULL} or a character separator} \item{terms
#' }{the \code{\link[stats]{terms.formula}} object} \item{levelsOnly }{a
#' logical}
#'
#' The \code{predict} function produces a data frame.
#'
#' \code{class2ind} returns a matrix (or a vector if \code{drop2nd = TRUE}).
#'
#' \code{contr.ltfr} generates a design matrix.
#' @author \code{contr.ltfr} is a small modification of
#' \code{\link[stats]{contr.treatment}} by Max Kuhn
#' @seealso \code{\link[stats]{model.matrix}}, \code{\link[stats]{contrasts}},
#' \code{\link[stats]{formula}}
#' @references
#' \url{https://cran.r-project.org/doc/manuals/R-intro.html#Formulae-for-statistical-models}
#' @keywords models
#' @examples
#' when <- data.frame(time = c("afternoon", "night", "afternoon",
#'                             "morning", "morning", "morning",
#'                             "morning", "afternoon", "afternoon"),
#'                    day = c("Mon", "Mon", "Mon",
#'                            "Wed", "Wed", "Fri",
#'                            "Sat", "Sat", "Fri"),
#'                            stringsAsFactors = TRUE)
#'
#' levels(when$time) <- list(morning="morning",
#'                           afternoon="afternoon",
#'                           night="night")
#' levels(when$day) <- list(Mon="Mon", Tue="Tue", Wed="Wed", Thu="Thu",
#'                          Fri="Fri", Sat="Sat", Sun="Sun")
#'
#' ## Default behavior:
#' model.matrix(~day, when)
#'
#' mainEffects <- dummyVars(~ day + time, data = when)
#' mainEffects
#' predict(mainEffects, when[1:3,])
#'
#' when2 <- when
#' when2[1, 1] <- NA
#' predict(mainEffects, when2[1:3,])
#' predict(mainEffects, when2[1:3,], na.action = na.omit)
#'
#'
#' interactionModel <- dummyVars(~ day + time + day:time,
#'                               data = when,
#'                               sep = ".")
#' predict(interactionModel, when[1:3,])
#'
#' noNames <- dummyVars(~ day + time + day:time,
#'                      data = when,
#'                      levelsOnly = TRUE)
#' predict(noNames, when)
#'
#' head(class2ind(iris$Species))
#'
#' two_levels <- factor(rep(letters[1:2], each = 5))
#' class2ind(two_levels)
#' class2ind(two_levels, drop2nd = TRUE)
#' @export dummyVars
"dummyVars" <-
  function(formula, ...){
    UseMethod("dummyVars")
  }

#' @rdname dummyVars
#' @method dummyVars default
#' @importFrom stats as.formula model.frame
#' @export
dummyVars.default <- function (formula, data, sep = ".", levelsOnly = FALSE, fullRank = FALSE, ...)
{
  formula <- as.formula(formula)
  if(!is.data.frame(data)) data <- as.data.frame(data, stringsAsFactors = FALSE)

  vars <- all.vars(formula)
  if(any(vars == "."))
  {
    vars <- vars[vars != "."]
    vars <- unique(c(vars, colnames(data)))
  }
  isFac <- unlist(lapply(data[,vars,drop = FALSE], is.factor))
  if(sum(isFac) > 0)
  {
    facVars <- vars[isFac]
    lvls <- lapply(data[,facVars,drop = FALSE], levels)
    if(levelsOnly)
    {
      tabs <- table(unlist(lvls))
      if(any(tabs > 1))
      {
        stop(paste("You requested `levelsOnly = TRUE` but",
                   "the following levels are not unique",
                   "across predictors:",
                   paste(names(tabs)[tabs > 1], collapse = ", ")))
      }
    }
  } else {
    facVars <- NULL
    lvls <- NULL
  }
  trms <- attr(model.frame(formula, data), "terms")
  out <- list(call = match.call(),
              form = formula,
              vars = vars,
              facVars = facVars,
              lvls = lvls,
              sep = sep,
              terms = trms,
              levelsOnly = levelsOnly,
              fullRank = fullRank)
  class(out) <- "dummyVars"
  out

}

#' @rdname dummyVars
#' @method print dummyVars
#' @export
print.dummyVars <- function(x, ...)
{
  cat("Dummy Variable Object\n\n")
  cat("Formula: ")
  print(x$form)
  cat(length(x$vars),  " variables, ", length(x$facVars), " factors\n", sep = "")
  if(!is.null(x$sep) & !x$levelsOnly) cat("Variables and levels will be separated by '",
                                          x$sep, "'\n", sep = "")
  if(x$levelsOnly) cat("Factor variable names will be removed\n")
  if(x$fullRank) cat("A full rank encoding is used") else cat("A less than full rank encoding is used")
  cat("\n")
  invisible(x)
}

#' @rdname dummyVars
#' @method predict dummyVars
#' @importFrom stats delete.response model.frame model.matrix na.pass
#' @export
predict.dummyVars <- function(object, newdata, na.action = na.pass, ...)
{
  if(is.null(newdata)) stop("newdata must be supplied")
  if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = FALSE)
  if(!all(object$vars %in% names(newdata))) stop(
    paste("Variable(s)",
          paste("'", object$vars[!object$vars %in% names(newdata)],
                "'", sep = "",
                collapse = ", "),
          "are not in newdata"))
  Terms <- object$terms
  Terms <- delete.response(Terms)
  if(!object$fullRank)
  {
    oldContr <- options("contrasts")$contrasts
    newContr <- oldContr
    newContr["unordered"] <- "contr.ltfr"
    options(contrasts = newContr)
    on.exit(options(contrasts = oldContr))
  }
  m <- model.frame(Terms, newdata, na.action = na.action, xlev = object$lvls)

  x <- model.matrix(Terms, m)

  cnames <- colnames(x)
  if(object$levelsOnly) {
    for(i in object$facVars) {
      for(j in object$lvls[[i]]) {
        from_text <- paste0(i, j)
        cnames[which(cnames == from_text)] <- j
      }
    }
  }
  if(!is.null(object$sep) & !object$levelsOnly) {
    for(i in object$facVars[order(-nchar(object$facVars))]) {
      ## the default output form model.matrix is NAMElevel with no separator.
      for(j in object$lvls[[i]]) {
        from_text <- paste0(i, j)
        to_text <- paste(i, j, sep = object$sep)
        pos = which(cnames == from_text)
        # If there are several identical NAMElevel matching (example: "X1" with level "11" and "X11" with level "1")
        if (length(pos) > 1) {
          # If the level j is not the first level of the feature i
          if (which(object$lvls[[i]] == j) > 1) {
            # Then we just have to test for the preceding NAMElevel being NAME(level-1)
            cnames[pos][cnames[pos-1] == paste(i, object$lvls[[i]][which(object$lvls[[i]] == j)-1], sep = object$sep)] <- to_text
          } else {
            # Otherwise, we have to test for the preceding NAMElevel being (NAME-1)(last_level)
            cnames[pos][cnames[pos-1] == paste(object$facVars[order(-nchar(object$facVars))][which(object$facVars[order(-nchar(object$facVars))] == i) - 1], utils::tail(object$lvls[[object$facVars[order(-nchar(object$facVars))][which(object$facVars[order(-nchar(object$facVars))] == i) - 1]]],n=1), sep = object$sep)] <- to_text
          }
        } else {
          # Otherwise simply replace the last occurence of the pattern
          cnames[pos] <- to_text
        }
      }
    }
  }
  colnames(x) <- cnames
  x[, colnames(x) != "(Intercept)", drop = FALSE]
}

#' @rdname dummyVars
#' @export
contr.ltfr <- function (n, contrasts = TRUE, sparse = FALSE)
{
  if (is.numeric(n) && length(n) == 1L) {
    if (n > 1L)
      levels <- as.character(seq_len(n))
    else stop("not enough degrees of freedom to define contrasts")
  }
  else {
    levels <- as.character(n)
    n <- length(n)
  }
  contr <- .RDiag(levels, sparse = sparse)
  if (contrasts) {
    if (n < 2L) stop(gettextf("contrasts not defined for %d degrees of freedom", n - 1L), domain = NA)
  }
  contr
}

#' @export
contr.dummy <- function(n, ...)
{
  if (is.numeric(n) && length(n) == 1L) {
    if (n > 1L)
      levels <- as.character(seq_len(n))
    else stop("not enough degrees of freedom to define contrasts")
  }
  else {
    levels <- as.character(n)
    n <- length(n)
  }
  out <- diag(n)
  rownames(out) <- levels
  colnames(out) <- levels
  out
}

#' @rdname dummyVars
#' @importFrom stats model.matrix
#' @export
#' @param drop2nd A logical: if the factor has two levels, should a single binary vector be returned?
class2ind <- function(x, drop2nd = FALSE) {
  if(!is.factor(x)) stop("'x' should be a factor")
  y <- model.matrix(~ x - 1)
  colnames(y) <- gsub("^x", "", colnames(y))
  attributes(y)$assign <- NULL
  attributes(y)$contrasts <- NULL
  if(length(levels(x)) == 2 & drop2nd) {
    y <- y[,1]
  }
  y
}