File: demean.R

package info (click to toggle)
r-cran-datawizard 0.6.5%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,736 kB
  • sloc: sh: 13; makefile: 2
file content (400 lines) | stat: -rw-r--r-- 17,168 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
#' Compute group-meaned and de-meaned variables
#'
#' @description
#'
#' `demean()` computes group- and de-meaned versions of a variable that can be
#' used in regression analysis to model the between- and within-subject effect.
#' `degroup()` is more generic in terms of the centering-operation. While
#' `demean()` always uses mean-centering, `degroup()` can also use the mode or
#' median for centering.
#'
#' @param x A data frame.
#' @param select Character vector (or formula) with names of variables to select
#'   that should be group- and de-meaned.
#' @param group Character vector (or formula) with the name of the variable that
#'   indicates the group- or cluster-ID.
#' @param center Method for centering. `demean()` always performs
#'   mean-centering, while `degroup()` can use `center = "median"` or
#'   `center = "mode"` for median- or mode-centering, and also `"min"`
#'   or `"max"`.
#' @param suffix_demean,suffix_groupmean String value, will be appended to the
#'   names of the group-meaned and de-meaned variables of `x`. By default,
#'   de-meaned variables will be suffixed with `"_within"` and
#'   grouped-meaned variables with `"_between"`.
#' @param add_attributes Logical, if `TRUE`, the returned variables gain
#'   attributes to indicate the within- and between-effects. This is only
#'   relevant when printing `model_parameters()` - in such cases, the
#'   within- and between-effects are printed in separated blocks.
#' @inheritParams center
#'
#' @return
#'
#' A data frame with the group-/de-meaned variables, which get the suffix
#' `"_between"` (for the group-meaned variable) and `"_within"` (for the
#' de-meaned variable) by default.
#'
#' @seealso If grand-mean centering (instead of centering within-clusters)
#'   is required, see [center()].
#'
#' @details
#'
#'   \subsection{Heterogeneity Bias}{
#'     Mixed models include different levels of sources of variability, i.e.
#'     error terms at each level. When macro-indicators (or level-2 predictors,
#'     or higher-level units, or more general: *group-level predictors that
#'     **vary** within and across groups*) are included as fixed effects (i.e.
#'     treated as covariate at level-1), the variance that is left unaccounted for
#'     this covariate will be absorbed into the error terms of level-1 and level-2
#'     (\cite{Bafumi and Gelman 2006; Gelman and Hill 2007, Chapter 12.6.}):
#'     \dQuote{Such covariates contain two parts: one that is specific to the
#'     higher-level entity that does not vary between occasions, and one that
#'     represents the difference between occasions, within higher-level entities}
#'     (\cite{Bell et al. 2015}). Hence, the error terms will be correlated with
#'     the covariate, which violates one of the assumptions of mixed models
#'     (iid, independent and identically distributed error terms). This bias is
#'     also called the *heterogeneity bias* (\cite{Bell et al. 2015}). To
#'     resolve this problem, level-2 predictors used as (level-1) covariates should
#'     be separated into their "within" and "between" effects by "de-meaning" and
#'     "group-meaning": After demeaning time-varying predictors, \dQuote{at the
#'     higher level, the mean term is no longer constrained by Level 1 effects,
#'     so it is free to account for all the higher-level variance associated
#'     with that variable} (\cite{Bell et al. 2015}).
#'   }
#'
#'   \subsection{Panel data and correlating fixed and group effects}{
#'     `demean()` is intended to create group- and de-meaned variables
#'     for panel regression models (fixed effects models), or for complex
#'     random-effect-within-between models (see \cite{Bell et al. 2015, 2018}),
#'     where group-effects (random effects) and fixed effects correlate (see
#'     \cite{Bafumi and Gelman 2006}). This can happen, for instance, when
#'     analyzing panel data, which can lead to *Heterogeneity Bias*. To
#'     control for correlating predictors and group effects, it is recommended
#'     to include the group-meaned and de-meaned version of *time-varying covariates*
#'     (and group-meaned version of *time-invariant covariates* that are on
#'     a higher level, e.g. level-2 predictors) in the model. By this, one can
#'     fit complex multilevel models for panel data, including time-varying
#'     predictors, time-invariant predictors and random effects.
#'    }
#'
#'   \subsection{Why mixed models are preferred over fixed effects models}{
#'     A mixed models approach can model the causes of endogeneity explicitly
#'     by including the (separated) within- and between-effects of time-varying
#'     fixed effects and including time-constant fixed effects. Furthermore,
#'     mixed models also include random effects, thus a mixed models approach
#'     is superior to classic fixed-effects models, which lack information of
#'     variation in the group-effects or between-subject effects. Furthermore,
#'     fixed effects regression cannot include random slopes, which means that
#'     fixed effects regressions are neglecting \dQuote{cross-cluster differences
#'     in the effects of lower-level controls (which) reduces the precision of
#'     estimated context effects, resulting in unnecessarily wide confidence
#'     intervals and low statistical power} (\cite{Heisig et al. 2017}).
#'   }
#'
#'   \subsection{Terminology}{
#'     The group-meaned variable is simply the mean of an independent variable
#'     within each group (or id-level or cluster) represented by `group`.
#'     It represents the cluster-mean of an independent variable. The regression
#'     coefficient of a group-meaned variable is the *between-subject-effect*.
#'     The de-meaned variable is then the centered version of the group-meaned
#'     variable. De-meaning is sometimes also called person-mean centering or
#'     centering within clusters. The regression coefficient of a de-meaned
#'     variable represents the *within-subject-effect*.
#'   }
#'
#'   \subsection{De-meaning with continuous predictors}{
#'     For continuous time-varying predictors, the recommendation is to include
#'     both their de-meaned and group-meaned versions as fixed effects, but not
#'     the raw (untransformed) time-varying predictors themselves. The de-meaned
#'     predictor should also be included as random effect (random slope). In
#'     regression models, the coefficient of the de-meaned predictors indicates
#'     the within-subject effect, while the coefficient of the group-meaned
#'     predictor indicates the between-subject effect.
#'   }
#'
#'   \subsection{De-meaning with binary predictors}{
#'     For binary time-varying predictors, there are two recommendations. First
#'     is to include the raw (untransformed) binary predictor as fixed effect
#'     only and the *de-meaned* variable as random effect (random slope).
#'     The alternative would be to add the de-meaned version(s) of binary
#'     time-varying covariates as additional fixed effect as well (instead of
#'     adding it as random slope). Centering time-varying binary variables to
#'     obtain within-effects (level 1) isn't necessary. They have a sensible
#'     interpretation when left in the typical 0/1 format (\cite{Hoffmann 2015,
#'     chapter 8-2.I}). `demean()` will thus coerce categorical time-varying
#'     predictors to numeric to compute the de- and group-meaned versions for
#'     these variables, where the raw (untransformed) binary predictor and the
#'     de-meaned version should be added to the model.
#'   }
#'
#'   \subsection{De-meaning of factors with more than 2 levels}{
#'     Factors with more than two levels are demeaned in two ways: first, these
#'     are also converted to numeric and de-meaned; second, dummy variables
#'     are created (binary, with 0/1 coding for each level) and these binary
#'     dummy-variables are de-meaned in the same way (as described above).
#'     Packages like \pkg{panelr} internally convert factors to dummies before
#'     demeaning, so this behaviour can be mimicked here.
#'   }
#'
#'   \subsection{De-meaning interaction terms}{ There are multiple ways to deal
#'   with interaction terms of within- and between-effects. A classical approach
#'   is to simply use the product term of the de-meaned variables (i.e.
#'   introducing the de-meaned variables as interaction term in the model
#'   formula, e.g. `y ~ x_within * time_within`). This approach, however,
#'   might be subject to bias (see \cite{Giesselmann & Schmidt-Catran 2020}).
#'     \cr \cr
#'     Another option is to first calculate the product term and then apply the
#'     de-meaning to it. This approach produces an estimator \dQuote{that reflects
#'     unit-level differences of interacted variables whose moderators vary
#'     within units}, which is desirable if *no* within interaction of
#'     two time-dependent variables is required. \cr \cr
#'     A third option, when the interaction should result in a genuine within
#'     estimator, is to "double de-mean" the interaction terms
#'     (\cite{Giesselmann & Schmidt-Catran 2018}), however, this is currently
#'     not supported by `demean()`. If this is required, the `wmb()`
#'     function from the \pkg{panelr} package should be used. \cr \cr
#'     To de-mean interaction terms for within-between models, simply specify
#'     the term as interaction for the `select`-argument, e.g.
#'     `select = "a*b"` (see 'Examples').
#'   }
#'
#'   \subsection{Analysing panel data with mixed models using lme4}{
#'     A description of how to translate the
#'     formulas described in *Bell et al. 2018* into R using `lmer()`
#'     from \pkg{lme4} can be found in
#'     [this vignette](https://easystats.github.io/parameters/articles/demean.html).
#'   }
#'
#' @references
#'
#'   - Bafumi J, Gelman A. 2006. Fitting Multilevel Models When Predictors
#'   and Group Effects Correlate. In. Philadelphia, PA: Annual meeting of the
#'   American Political Science Association.
#'
#'   - Bell A, Fairbrother M, Jones K. 2019. Fixed and Random Effects
#'   Models: Making an Informed Choice. Quality & Quantity (53); 1051-1074
#'
#'   - Bell A, Jones K. 2015. Explaining Fixed Effects: Random Effects
#'   Modeling of Time-Series Cross-Sectional and Panel Data. Political Science
#'   Research and Methods, 3(1), 133–153.
#'
#'   - Gelman A, Hill J. 2007. Data Analysis Using Regression and
#'   Multilevel/Hierarchical Models. Analytical Methods for Social Research.
#'   Cambridge, New York: Cambridge University Press
#'
#'   - Giesselmann M, Schmidt-Catran, AW. 2020. Interactions in fixed
#'   effects regression models. Sociological Methods & Research, 1–28.
#'   https://doi.org/10.1177/0049124120914934
#'
#'   - Heisig JP, Schaeffer M, Giesecke J. 2017. The Costs of Simplicity:
#'   Why Multilevel Models May Benefit from Accounting for Cross-Cluster
#'   Differences in the Effects of Controls. American Sociological Review 82
#'   (4): 796–827.
#'
#'   - Hoffman L. 2015. Longitudinal analysis: modeling within-person
#'   fluctuation and change. New York: Routledge
#'
#' @examples
#'
#' data(iris)
#' iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID
#' iris$binary <- as.factor(rbinom(150, 1, .35)) # binary variable
#'
#' x <- demean(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID")
#' head(x)
#'
#' x <- demean(iris, select = c("Sepal.Length", "binary", "Species"), group = "ID")
#' head(x)
#'
#'
#' # demean interaction term x*y
#' dat <- data.frame(
#'   a = c(1, 2, 3, 4, 1, 2, 3, 4),
#'   x = c(4, 3, 3, 4, 1, 2, 1, 2),
#'   y = c(1, 2, 1, 2, 4, 3, 2, 1),
#'   ID = c(1, 2, 3, 1, 2, 3, 1, 2)
#' )
#' demean(dat, select = c("a", "x*y"), group = "ID")
#'
#' # or in formula-notation
#' demean(dat, select = ~ a + x * y, group = ~ID)
#'
#' @export
demean <- function(x,
                   select,
                   group,
                   suffix_demean = "_within",
                   suffix_groupmean = "_between",
                   add_attributes = TRUE,
                   verbose = TRUE) {
  degroup(
    x = x,
    select = select,
    group = group,
    center = "mean",
    suffix_demean = suffix_demean,
    suffix_groupmean = suffix_groupmean,
    add_attributes = add_attributes,
    verbose = verbose
  )
}






#' @rdname demean
#' @export
degroup <- function(x,
                    select,
                    group,
                    center = "mean",
                    suffix_demean = "_within",
                    suffix_groupmean = "_between",
                    add_attributes = TRUE,
                    verbose = TRUE) {
  # ugly tibbles again...
  x <- as.data.frame(x)

  center <- match.arg(tolower(center), choices = c("mean", "median", "mode", "min", "max"))

  if (inherits(select, "formula")) {
    # formula to character, remove "~", split at "+"
    select <- trimws(unlist(strsplit(gsub("~", "", insight::safe_deparse(select), fixed = TRUE), "+", fixed = TRUE)))
  }

  if (inherits(group, "formula")) {
    group <- all.vars(group)
  }

  interactions_no <- select[!grepl("(\\*|\\:)", select)]
  interactions_yes <- select[grepl("(\\*|\\:)", select)]

  if (length(interactions_yes)) {
    interaction_terms <- lapply(strsplit(interactions_yes, "*", fixed = TRUE), trimws)
    product <- lapply(interaction_terms, function(i) do.call(`*`, x[, i]))
    new_dat <- as.data.frame(stats::setNames(product, gsub("\\s", "", gsub("*", "_", interactions_yes, fixed = TRUE))))
    x <- cbind(x, new_dat)
    select <- c(interactions_no, colnames(new_dat))
  }

  not_found <- setdiff(select, colnames(x))

  if (length(not_found) && isTRUE(verbose)) {
    insight::format_alert(
      sprintf(
        "%i variables were not found in the dataset: %s\n",
        length(not_found),
        paste0(not_found, collapse = ", ")
      )
    )
  }

  select <- intersect(colnames(x), select)

  # get data to demean...
  dat <- x[, c(select, group)]


  # find categorical predictors that are coded as factors
  categorical_predictors <- sapply(dat[select], is.factor)

  # convert binary predictors to numeric
  if (any(categorical_predictors)) {
    # convert categorical to numeric, and then demean
    dat[select[categorical_predictors]] <- lapply(
      dat[select[categorical_predictors]],
      function(i) as.numeric(i) - 1
    )
    # convert categorical to dummy, and demean each binary dummy
    for (i in select[categorical_predictors]) {
      if (nlevels(x[[i]]) > 2) {
        for (j in levels(x[[i]])) {
          # create vector with zeros
          f <- rep(0, nrow(x))
          # for each matching level, set dummy to 1
          f[x[[i]] == j] <- 1
          dummy <- data.frame(f)
          # colnames = variable name + factor level
          # also add new dummy variables to "select"
          colnames(dummy) <- sprintf("%s_%s", i, j)
          select <- c(select, sprintf("%s_%s", i, j))
          # add to data
          dat <- cbind(dat, dummy)
        }
      }
    }
    # tell user...
    if (isTRUE(verbose)) {
      insight::print_color(
        sprintf(
          "Categorical predictors (%s) have been coerced to numeric values to compute de- and group-meaned variables.\n",
          paste0(names(categorical_predictors)[categorical_predictors], collapse = ", ")
        ),
        "yellow"
      )
    }
  }


  # group variables, then calculate the mean-value
  # for variables within each group (the group means). assign
  # mean values to a vector of same length as the data

  if (center == "mode") {
    x_gm_list <- lapply(select, function(i) {
      stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) distribution_mode(stats::na.omit(.gm)))
    })
  } else if (center == "median") {
    x_gm_list <- lapply(select, function(i) {
      stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) stats::median(.gm, na.rm = TRUE))
    })
  } else if (center == "min") {
    x_gm_list <- lapply(select, function(i) {
      stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) min(.gm, na.rm = TRUE))
    })
  } else if (center == "max") {
    x_gm_list <- lapply(select, function(i) {
      stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) max(.gm, na.rm = TRUE))
    })
  } else {
    x_gm_list <- lapply(select, function(i) {
      stats::ave(dat[[i]], dat[[group]], FUN = function(.gm) mean(.gm, na.rm = TRUE))
    })
  }

  names(x_gm_list) <- select


  # create de-meaned variables by subtracting the group mean from each individual value

  x_dm_list <- lapply(select, function(i) dat[[i]] - x_gm_list[[i]])
  names(x_dm_list) <- select


  # convert to data frame and add suffix to column names

  x_gm <- as.data.frame(x_gm_list)
  x_dm <- as.data.frame(x_dm_list)

  colnames(x_dm) <- sprintf("%s%s", colnames(x_dm), suffix_demean)
  colnames(x_gm) <- sprintf("%s%s", colnames(x_gm), suffix_groupmean)

  if (isTRUE(add_attributes)) {
    x_dm[] <- lapply(x_dm, function(i) {
      attr(i, "within-effect") <- TRUE
      i
    })
    x_gm[] <- lapply(x_gm, function(i) {
      attr(i, "between-effect") <- TRUE
      i
    })
  }

  cbind(x_gm, x_dm)
}


#' @rdname demean
#' @export
detrend <- degroup