File: detect_lin_dep_alias.R

package info (click to toggle)
r-cran-plm 2.6-2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 6,032 kB
  • sloc: sh: 13; makefile: 4
file content (299 lines) | stat: -rw-r--r-- 12,368 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# functions to aid in detecting linear dependent columns in the (transformed) 
# model matrix or estimated plm models:
#  * detect.lindep
#  * alias (the latter is a wrapper around alias.lm)
#
# doc file provides an extensive example how linear dependence can arise after
# the data transformation, e. g., for within transformation

### detect.lindep.matrix, .data.frame, .plm




#' Functions to detect linear dependence
#' 
#' Little helper functions to aid users to detect linear dependent columns in a
#' two-dimensional data structure, especially in a (transformed) model matrix -
#' typically useful in interactive mode during model building phase.
#' 
#' 
#' Linear dependence of columns/variables is (usually) readily avoided when
#' building one's model.  However, linear dependence is sometimes not obvious
#' and harder to detect for less experienced applied statisticians. The so
#' called "dummy variable trap" is a common and probably the best--known
#' fallacy of this kind (see e. g. Wooldridge (2016), sec. 7-2.). When building
#' linear models with `lm` or `plm`'s `pooling` model, linear
#' dependence in one's model is easily detected, at times post hoc.
#' 
#' However, linear dependence might also occur after some transformations of
#' the data, albeit it is not present in the untransformed data. The within
#' transformation (also called fixed effect transformation) used in the
#' `"within"` model can result in such linear dependence and this is
#' harder to come to mind when building a model. See **Examples** for two
#' examples of linear dependent columns after the within transformation: ex. 1)
#' the transformed variables have the opposite sign of one another; ex. 2) the
#' transformed variables are identical.
#' 
#' During `plm`'s model estimation, linear dependent columns and their
#' corresponding coefficients in the resulting object are silently dropped,
#' while the corresponding model frame and model matrix still contain the
#' affected columns.  The plm object contains an element `aliased` which
#' indicates any such aliased coefficients by a named logical.
#' 
#' Both functions, `detect.lindep` and `alias`, help to
#' detect linear dependence and accomplish almost the same:
#' `detect.lindep` is a stand alone implementation while
#' `alias` is a wrapper around
#' [stats::alias.lm()], extending the `alias`
#' generic to classes `"plm"` and `"pdata.frame"`.
#' `alias` hinges on the availability of the package
#' \CRANpkg{MASS} on the system. Not all arguments of `alias.lm`
#' are supported.  Output of `alias` is more informative as it
#' gives the linear combination of dependent columns (after data
#' transformations, i. e., after (quasi)-demeaning) while
#' `detect.lindep` only gives columns involved in the linear
#' dependence in a simple format (thus being more suited for automatic
#' post--processing of the information).
#' 
#' @aliases detect.lindep
#' @param object for `detect.lindep`: an object which should be checked
#' for linear dependence (of class `"matrix"`, `"data.frame"`, or
#' `"plm"`); for `alias`: either an estimated model of class
#' `"plm"` or a `"pdata.frame"`. Usually, one wants to input a model
#' matrix here or check an already estimated plm model,
#' @param suppressPrint for `detect.lindep` only: logical indicating
#' whether a message shall be printed; defaults to printing the message, i. e.,
#' to `suppressPrint = FALSE`,
#' @param model (see `plm`),
#' @param effect (see `plm`),
#' @param \dots further arguments.
#' @return For `detect.lindep`: A named numeric vector containing column
#' numbers of the linear dependent columns in the object after data
#' transformation, if any are present. `NULL` if no linear dependent
#' columns are detected.
#' 
#' For `alias`: return value of [stats::alias.lm()] run on the
#' (quasi-)demeaned model, i. e., the information outputted applies to
#' the transformed model matrix, not the original data.
#' @note function `detect.lindep` was called `detect_lin_dep`
#'     initially but renamed for naming consistency later.
#' @export
#' @author Kevin Tappe
#' @seealso [stats::alias()], [stats::model.matrix()] and especially
#'     `plm`'s [model.matrix()] for (transformed) model matrices,
#'     plm's [model.frame()].
#' @references
#'
#' \insertRef{WOOL:13}{plm}
#'
#' @keywords manip array
#' @examples
#'
#' ### Example 1 ###
#' # prepare the data
#' data("Cigar" , package = "plm")
#' Cigar[ , "fact1"] <- c(0,1)
#' Cigar[ , "fact2"] <- c(1,0)
#' Cigar.p <- pdata.frame(Cigar)
#' 
#' # setup a formula and a model frame
#' form <- price ~ 0 + cpi + fact1 + fact2
#' mf <- model.frame(Cigar.p, form)
#' # no linear dependence in the pooling model's model matrix
#' # (with intercept in the formula, there would be linear depedence)
#' detect.lindep(model.matrix(mf, model = "pooling"))
#' # linear dependence present in the FE transformed model matrix
#' modmat_FE <- model.matrix(mf, model = "within")
#' detect.lindep(modmat_FE)
#' mod_FE <- plm(form, data = Cigar.p, model = "within")
#' detect.lindep(mod_FE) 
#' alias(mod_FE) # => fact1 == -1*fact2
#' plm(form, data = mf, model = "within")$aliased # "fact2" indicated as aliased
#' 
#' # look at the data: after FE transformation fact1 == -1*fact2
#' head(modmat_FE)
#' all.equal(modmat_FE[ , "fact1"], -1*modmat_FE[ , "fact2"])
#' 
#' ### Example 2 ###
#' # Setup the data:
#' # Assume CEOs stay with the firms of the Grunfeld data
#' # for the firm's entire lifetime and assume some fictional
#' # data about CEO tenure and age in year 1935 (first observation
#' # in the data set) to be at 1 to 10 years and 38 to 55 years, respectively.
#' # => CEO tenure and CEO age increase by same value (+1 year per year).
#' data("Grunfeld", package = "plm")
#' set.seed(42)
#' # add fictional data
#' Grunfeld$CEOtenure <- c(replicate(10, seq(from=s<-sample(1:10,  1), to=s+19, by=1)))
#' Grunfeld$CEOage    <- c(replicate(10, seq(from=s<-sample(38:65, 1), to=s+19, by=1)))
#' 
#' # look at the data
#' head(Grunfeld, 50)
#' 
#' form <- inv ~ value + capital + CEOtenure + CEOage
#' mf <- model.frame(pdata.frame(Grunfeld), form)
#' # no linear dependent columns in original data/pooling model
#' modmat_pool <- model.matrix(mf, model="pooling")
#' detect.lindep(modmat_pool)
#' mod_pool <- plm(form, data = Grunfeld, model = "pooling")
#' alias(mod_pool)
#' 
#' # CEOtenure and CEOage are linear dependent after FE transformation
#' # (demeaning per individual)
#' modmat_FE <- model.matrix(mf, model="within")
#' detect.lindep(modmat_FE)
#' mod_FE <- plm(form, data = Grunfeld, model = "within")
#' detect.lindep(mod_FE)
#' alias(mod_FE)
#' 
#' # look at the transformed data: after FE transformation CEOtenure == 1*CEOage
#' head(modmat_FE, 50)
#' all.equal(modmat_FE[ , "CEOtenure"], modmat_FE[ , "CEOage"])
#' 
detect.lindep <- function(object, ...) {
  UseMethod("detect.lindep")
}

#' @rdname detect.lindep
#' @method detect.lindep matrix
#' @export
detect.lindep.matrix <- function(object, suppressPrint = FALSE, ...) {
  if (!inherits(object, "matrix")) {
    stop("Input 'object' must be a matrix. Presumably, one wants a model matrix
         generated by some 'model.matrix' function.")}
  # do rank reduction to detect lin. dep. columns
  rank_rec <- sapply(seq_len(ncol(object)), function(col) qr(object[ , -col])$rank)
  if (length(rank_rec) == 0 || diff(range(rank_rec)) == 0) {
    # return NULL if there is no linear dep. (length(rank_rec) == 0 to detect matrix with 0 columns)
    num <- NULL
  } else {
    num <- which(rank_rec == max(rank_rec))
    names(num) <- colnames(object)[num]
  }
  
  if(!suppressPrint) {
    if(is.null(num)) {
      print("No linear dependent column(s) detected.")
    } else {
      print(paste0("Suspicious column number(s): ", paste(num,        collapse = ", ")))
      print(paste0("Suspicious column name(s):   ", paste(names(num), collapse = ", ")))
    }
    return(invisible(num))
  }
  return(num)
}

#' @rdname detect.lindep
#' @method detect.lindep data.frame
#' @export
detect.lindep.data.frame <- function(object, suppressPrint = FALSE, ...) {
  if (!inherits(object, "data.frame")) {
    stop("Input 'object' must be a data.frame")}

  return(detect.lindep.matrix(as.matrix(object), suppressPrint = suppressPrint, ...))
}

#' @rdname detect.lindep
#' @method detect.lindep plm
#' @export
detect.lindep.plm <- function(object, suppressPrint = FALSE, ...) {
  if (!inherits(object, "plm")) {
    stop("Input 'object' must be of class \"plm\"")}

  return(detect.lindep.matrix(model.matrix(object), suppressPrint = suppressPrint, ...))
}


### alias.plm, alias.pFormula
# This is just a wrapper function to allow to apply the generic stats::alias on
# plm objects and pFormulas with the _transformed data_ (the transformed model.matrix).
# NB: arguments 'model' and 'effect' are not treated here.


#' @rdname detect.lindep
#' @export
alias.plm <- function(object, ...) {
  dots <- list(...)
  if (!is.null(dots$inst.method)) stop("alias.plm/alias.pFormula: IV not supported")
  if (length(formula(object))[2] == 2) stop("alias.plm/alias.pFormula: IV not supported")
  
  # catch unsupported alias.lm args and convert
  if (!is.null(dots[["partial"]])) {
    if (dots[["partial"]]) {
      dots[["partial"]] <- FALSE
      warning("alias.plm/alias.pFormula: arg partial = TRUE not supported, changed to FALSE")
    }
  } 
  if (!is.null(dots[["partial.pattern"]])) {
    if (dots[["partial.pattern"]]) {
      dots[["partial.pattern"]] <- FALSE
      warning("alias.plm/alias.pFormula: arg partial.pattern = TRUE not supported, changed to FALSE")
    }
  }
  
  X <- model.matrix(object)
  y <- pmodel.response(object)
  
  lm.fit.obj <- lm.fit(X, y)
  class(lm.fit.obj) <- "lm"
  lm.fit.obj$terms <- deparse(object$formula)
  
  ## use lm.fit rather than lm():
  ## could estimate lm model with lm(), but takes more resources and 
  ## need to remove extra classes "formula" for lm to prevent warning
  # form <- object$formula
  # form <- setdiff(class(form), c("pFormula", "Formula"))
  # Xdf <- as.data.frame(X)
  # ydf <- as.data.frame(y)
  # names(ydf) <- names(object$model)[1]
  # data <- cbind(ydf, Xdf)
  # lmobj <- lm(form, data = data)
  # return(stats::alias(lmobj))
  
  return(stats::alias(lm.fit.obj, ... = dots))
}

## alias.pFormula <- function(object, data,
##                            model = c("pooling", "within", "Between", "between",
##                                      "mean", "random", "fd"),
##                            effect = c("individual", "time", "twoways"),
##                            ...) {
##   dots <- list(...)
##   if (!is.null(dots$inst.method)) stop("alias.plm/alias.pFormula: IV not supported")
##   model <- match.arg(model)
##   effect <- match.arg(effect)
##   formula <- object

##   # check if object is already pFormula, try to convert if not    
##   if (!inherits(formula, "pFormula")) formula <- pFormula(formula)
  
##   # check if data is already a model frame, convert to if not
##   if (is.null(attr(data, "terms"))) {
##     data <- model.frame.pFormula(pFormula(formula), data)
##   }

##   plmobj <- plm(formula, data = data, model = model, effect = effect, ...)
## #  print(summary(plmobj))
##   return(alias(plmobj, ...))
## }


#' @rdname detect.lindep
#' @export
alias.pdata.frame <- function(object,
                              model = c("pooling", "within", "Between", "between",
                                        "mean", "random", "fd"),
                              effect = c("individual", "time", "twoways"),
                              ...) {
    dots <- list(...)
    if (!is.null(dots$inst.method)) stop("alias.plm/alias.pFormula: IV not supported")
    model <- match.arg(model)
    effect <- match.arg(effect)
      # check if data is already a model frame, if not exit
    if (is.null(attr(object, "terms")))
        stop("the argument must be a model.frame")
    formula <- attr(object, "formula")
    plmobj <- plm(formula, data = object, model = model, effect = effect, ...)
    return(alias(plmobj, ...))
}