File: adjust.R

package info (click to toggle)
r-cran-datawizard 1.0.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,300 kB
  • sloc: sh: 13; makefile: 2
file content (198 lines) | stat: -rw-r--r-- 7,221 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#' Adjust data for the effect of other variable(s)
#'
#' This function can be used to adjust the data for the effect of other
#' variables present in the dataset. It is based on an underlying fitting of
#' regressions models, allowing for quite some flexibility, such as including
#' factors as random effects in mixed models (multilevel partialization),
#' continuous variables as smooth terms in general additive models (non-linear
#' partialization) and/or fitting these models under a Bayesian framework. The
#' values returned by this function are the residuals of the regression models.
#' Note that a regular correlation between two "adjusted" variables is
#' equivalent to the partial correlation between them.
#'
#' @param data A data frame.
#' @param effect Character vector of column names to be adjusted for (regressed
#'   out). If `NULL` (the default), all variables will be selected.
#' @param multilevel If `TRUE`, the factors are included as random factors.
#'   Else, if `FALSE` (default), they are included as fixed effects in the
#'   simple regression model.
#' @param additive If `TRUE`, continuous variables as included as smooth terms
#'   in additive models. The goal is to regress-out potential non-linear
#'   effects.
#' @param bayesian If `TRUE`, the models are fitted under the Bayesian framework
#'   using `rstanarm`.
#' @param keep_intercept If `FALSE` (default), the intercept of the model is
#'   re-added. This avoids the centering around 0 that happens by default
#'   when regressing out another variable (see the examples below for a
#'   visual representation of this).
#' @inheritParams extract_column_names
#' @inheritParams standardize
#'
#' @return A data frame comparable to `data`, with adjusted variables.
#'
#' @examplesIf all(insight::check_if_installed(c("bayestestR", "rstanarm", "gamm4"), quietly = TRUE))
#' adjusted_all <- adjust(attitude)
#' head(adjusted_all)
#' adjusted_one <- adjust(attitude, effect = "complaints", select = "rating")
#' head(adjusted_one)
#' \donttest{
#' adjust(attitude, effect = "complaints", select = "rating", bayesian = TRUE)
#' adjust(attitude, effect = "complaints", select = "rating", additive = TRUE)
#' attitude$complaints_LMH <- cut(attitude$complaints, 3)
#' adjust(attitude, effect = "complaints_LMH", select = "rating", multilevel = TRUE)
#' }
#'
#' # Generate data
#' data <- bayestestR::simulate_correlation(n = 100, r = 0.7)
#' data$V2 <- (5 * data$V2) + 20 # Add intercept
#'
#' # Adjust
#' adjusted <- adjust(data, effect = "V1", select = "V2")
#' adjusted_icpt <- adjust(data, effect = "V1", select = "V2", keep_intercept = TRUE)
#'
#' # Visualize
#' plot(
#'   data$V1, data$V2,
#'   pch = 19, col = "blue",
#'   ylim = c(min(adjusted$V2), max(data$V2)),
#'   main = "Original (blue), adjusted (green), and adjusted - intercept kept (red) data"
#' )
#' abline(lm(V2 ~ V1, data = data), col = "blue")
#' points(adjusted$V1, adjusted$V2, pch = 19, col = "green")
#' abline(lm(V2 ~ V1, data = adjusted), col = "green")
#' points(adjusted_icpt$V1, adjusted_icpt$V2, pch = 19, col = "red")
#' abline(lm(V2 ~ V1, data = adjusted_icpt), col = "red")
#'
#' @export
adjust <- function(data,
                   effect = NULL,
                   select = is.numeric,
                   exclude = NULL,
                   multilevel = FALSE,
                   additive = FALSE,
                   bayesian = FALSE,
                   keep_intercept = FALSE,
                   ignore_case = FALSE,
                   regex = FALSE,
                   verbose = FALSE) {
  # make sure column names are syntactically valid
  .check_dataframe_names(data, action = "error")

  # check for formula notation, convert to character vector
  if (inherits(effect, "formula")) {
    effect <- all.vars(effect)
  }

  # Find predictors
  if (is.null(effect)) {
    effect <- names(data)
  }

  if (is.null(select)) {
    select <- is.numeric
  }

  select <- .select_nse(select,
    data,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # Factors
  formula_random <- NULL
  facs <- names(data[effect][!vapply(data[effect], is.numeric, logical(1L))])
  if (length(facs) >= 1 && multilevel) {
    if (additive) {
      formula_random <- stats::as.formula(paste("~", paste(paste0("(1|", facs, ")"), collapse = " + ")))
    } else {
      formula_random <- paste("+", paste(paste0("(1|", facs, ")"), collapse = " + "))
    }
    effect <- effect[!effect %in% facs]
  }

  # Fit models
  out <- data.frame(.ID = seq_len(nrow(data)))
  for (var in select) {
    predictors <- effect[effect != var]
    if (additive) {
      predictors_num <- names(data[predictors][vapply(data[predictors], is.numeric, logical(1L))])
      predictors[predictors == predictors_num] <- paste0("s(", predictors_num, ")")
    }
    formula_predictors <- paste(c("1", predictors), collapse = " + ")
    model_formula <- paste(var, "~", formula_predictors)

    x <- .model_adjust_for(
      data = data[unique(c(var, effect, facs))],
      model_formula = model_formula,
      multilevel = multilevel,
      additive = additive,
      bayesian = bayesian,
      formula_random = formula_random,
      keep_intercept = keep_intercept
    )
    out[var] <- x
  }
  out[names(data)[!names(data) %in% names(out)]] <- data[names(data)[!names(data) %in% names(out)]]
  out[names(data)]
}

#' @rdname adjust
#' @export
data_adjust <- adjust


#' @keywords internal
.model_adjust_for <- function(data,
                              model_formula,
                              multilevel = FALSE,
                              additive = FALSE,
                              bayesian = FALSE,
                              formula_random = NULL,
                              keep_intercept = FALSE) {
  # Additive -----------------------
  if (additive) {
    # Bayesian
    if (bayesian) {
      insight::check_if_installed("rstanarm")
      model <- rstanarm::stan_gamm4(stats::as.formula(model_formula), random = formula_random, data = data, refresh = 0)
      # Frequentist
    } else {
      insight::check_if_installed("gamm4")
      model <- gamm4::gamm4(stats::as.formula(model_formula), random = formula_random, data = data)
    }

    # Linear -------------------------
  } else if (bayesian) {
    # Bayesian
    insight::check_if_installed("rstanarm")
    if (multilevel) {
      model <- rstanarm::stan_lmer(paste(model_formula, formula_random), data = data, refresh = 0)
    } else {
      model <- rstanarm::stan_glm(model_formula, data = data, refresh = 0)
    }
  } else if (multilevel) {
    # Frequentist
    insight::check_if_installed("lme4")
    model <- lme4::lmer(paste(model_formula, formula_random), data = data)
  } else {
    model <- stats::lm(model_formula, data = data)
  }

  adjusted <- insight::get_residuals(model)

  # Re-add intercept if need be
  if (keep_intercept) {
    intercept <- insight::get_intercept(model)
    if (length(intercept) > 1) intercept <- stats::median(intercept) # For bayesian model
    if (is.na(intercept)) intercept <- 0
    adjusted <- adjusted + intercept
  }

  # Deal with missing data
  out <- rep(NA, nrow(data))
  out[stats::complete.cases(data)] <- as.vector(adjusted)

  out
}