File: REmargins.R

package info (click to toggle)
r-cran-mertools 0.6.2-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,716 kB
  • sloc: sh: 13; makefile: 2
file content (186 lines) | stat: -rw-r--r-- 9,452 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#' Calculate the predicted value for each observation across the distribution
#' of the random effect terms.
#'
#' \code{REmargins} calculates the average predicted value for each row of a
#' new data frame across the distribution of \code{\link{expectedRank}} for a
#' merMod object. This allows the user to make meaningful comparisons about the
#' influence of random effect terms on the scale of the response variable,
#' for user-defined inputs, and accounting for the variability in grouping terms.
#'
#' The function simulates the
#'
#' The function predicts the response at every level in the random effect term
#' specified by the user. Then, the expected rank of each group level is binned
#' to the number of bins specified by the user. Finally, a weighted mean of the
#' fitted value for all observations in each bin of the expected ranks is
#' calculated using the inverse of the variance as the weight -- so that less
#' precise estimates are downweighted in the calculation of the mean for the bin.
#' Finally, a standard error for the bin mean is calculated.
#'
#' @param merMod An object of class merMod
#'
#' @param newdata a data frame of observations to calculate group-level differences
#' for
#'
#' @param groupFctr The name of the grouping factor over which the random
#'   coefficient of interest varies.  This is the variable to the right of the
#'   pipe, \code{|}, in the [g]lmer formula. This parameter is optional, if not
#'   specified, it will perform the calculation for the first effect listed
#'   by \code{ranef}. If the length is > 1 then the combined effect of all
#'   listed groups will calculated and marginalized over co-occurences of those
#'   groups if desired.
#'
#' @param term The name of the random coefficient of interest. This is the
#'   variable to the left of the pipe, \code{|}, in the [g]lmer formula. Partial
#'   matching is attempted on the intercept term so the following character
#'   strings will all return rankings based on the intercept (\emph{provided that
#'   they do not match the name of another random coefficient for that factor}):
#'   \code{c("(Intercept)", "Int", "intercep", ...)}.
#'
#' @param breaks an integer representing the number of bins to divide the group
#' effects into, the default is 3.
#' @param .parallel, logical should parallel computation be used, default is TRUE
#'
#' @param ... additional arguments to pass to \code{\link{predictInterval}}
#'
#' @return A data.frame with all unique combinations of the number of cases, rows
#' in the newdata element:
#'   \describe{
#'     \item{...}{The columns of the original data taken from \code{newdata}}
#'     \item{case}{The row number of the observation from newdata. Each row in newdata will be
#'     repeated for all unique levels of the grouping_var, term, and breaks.}
#'     \item{grouping_var}{The grouping variable the random effect is being marginalized over.}
#'     \item{term}{The term for the grouping variable the random effect is being marginalized over.}
#'     \item{breaks}{The ntile of the effect size for \code{grouping_var} and \code{term}}
#'     \item{original_group_level}{The original grouping value for this \code{case}}
#'     \item{fit_combined}{The predicted value from \code{predictInterval} for this case simulated
#'     at the Nth ntile of the expected rank distribution of \code{grouping_var} and \code{term}}
#'     \item{upr_combined}{The upper bound of the predicted value.}
#'     \item{lwr_combined}{The lower bound of the predicted value.}
#'     \item{fit_XX}{For each grouping term in newdata the predicted value is decomposed into its
#'     fit components via predictInterval and these are all returned here}
#'     \item{upr_XX}{The upper bound for the effect of each grouping term}
#'     \item{lwr_XX}{The lower bound for the effect of each grouping term}
#'     \item{fit_fixed}{The predicted fit with all the grouping terms set to 0 (average)}
#'     \item{upr_fixed}{The upper bound fit with all the grouping terms set to 0 (average)}
#'     \item{lwr_fixed}{The lower bound fit with all the grouping terms set to 0 (average)}
#'   }
#'
#'
#' @references
#' Gatz, DF and Smith, L. The Standard Error of a Weighted Mean Concentration.
#' I. Bootstrapping vs other methods. \emph{Atmospheric Environment}.
#' 1995;11(2)1185-1193. Available at
#' \url{https://www.sciencedirect.com/science/article/pii/135223109400210C}
#'
#' Cochran, WG. 1977. Sampling Techniques (3rd Edition). Wiley, New York.
#'
#' @seealso \code{\link{expectedRank}}, \code{\link{predictInterval}}
#' @importFrom stats reshape
#' @examples
#' \donttest{
#' fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
#' mfx <- REmargins(merMod = fm1, newdata = sleepstudy[1:10,])
#'
#' # You can also pass additional arguments to predictInterval through REimpact
#'  g1 <- lmer(y ~ lectage + studage + (1|d) + (1|s), data=InstEval)
#'  margin_df <- REmargins(g1, newdata = InstEval[20:25, ], groupFctr = c("s"),
#'                         breaks = 4)
#'  margin_df <- REmargins(g1, newdata = InstEval[20:25, ], groupFctr = c("d"),
#'                          breaks = 3)
#' }
#' @export
REmargins <- function(merMod, newdata = NULL, groupFctr = NULL, term = NULL, breaks = 4,
                      .parallel = FALSE, ...){
  # Validate inputs
  if (is.null(groupFctr)) {
    # If the user doesn't tell us which term to use, we take the first term
    groupFctr <- names(ranef(merMod))[1]
  }
  if (is.null(newdata)) {
    # If the user doesn't give us data, we take the whole dataset
    # TODO - how does performance scale to a large number of observations?
    newdata <- merMod@frame
  }

  # If the user doesn't tell us what term to use, we take all the terms
  if (is.null(term)) {
    term <- names(ranef(merMod)[[groupFctr]])
    # Sub out intercept
    term[term == "(Intercept)"] <- "Intercept"
  }

  # This is a rough way to break the ER distribution into quantiles
  brks <- ceiling(seq(1, 100, by = 100/breaks))
  # Fallback so we always take a 99th percentile effect (for the maximum)
  if (!99 %in% brks) {
    brks <- c(brks, 99)
  }
  # Inputs are validated - now we get the effect distribution
  # Generate the expected rank distribution
  ER_DF <- expectedRank(merMod, groupFctr = groupFctr, term = term)
  # With many effects there is a lot of duplication - drop duplicated pctER
  ER_DF <- ER_DF[!duplicated(ER_DF[, c("groupFctr", "term", "pctER")]), ]

  # Now we create a data frame to capture the factor levels of each groupFctr that
  # correspond to the right break in the expectedRank distribution of the random
  # effect grouping factor and term
  par_df <- expand.grid("grouping_var" = groupFctr, "term" = term, "breaks" = 1:breaks)

  # Keep only factor levels that have effects at the margins
  # Need to match closest value here
  # Find N closest values
  # Drop duplicates
    # For each combination build an index of candidate rows/effect levels
  # Then choose the level that has the most precise estimate within a
  # tolerance of the effect size
  for (trm in term) {
    for (i in seq_along(brks)) {
      # Compute each terms distance from the break
      rank_dist <- abs(ER_DF$pctER[ER_DF$term == trm] - brks[i])
      # Get the index for the rank that minimizes the distance
      # TODO - how to break ties here?
      tmp <- which(rank_dist %in% rank_dist[order(rank_dist)][1])
      # Store the result in the par_df object
      par_df$groupLevel[par_df$breaks == i & par_df$term == trm] <- ER_DF$groupLevel[tmp]
    }
  }
  # Get ready to expand the data
  sim_data <- as.data.frame(lapply(newdata, rep, each = nrow(par_df)))
  # sim_data now repeats each row of newdata by the number of rows in par_df
  # case labels the rows with an integer for later mapping
  sim_data$case <- rep(1:nrow(newdata), each = nrow(par_df))
  sim_data <- cbind(sim_data, par_df)
  sim_data$original_group_level <- sim_data[, groupFctr]
  sim_data[, groupFctr] <- sim_data$groupLevel
  sim_data$groupLevel <- NULL
  #

  # Maybe strongly recommend parallel here?
  if (.parallel & requireNamespace("foreach", quietly = TRUE)) {
    # TODO use future here
      setup_parallel()
        out <- predictInterval(merMod, newdata = sim_data,
                               which = "all", ...)
        out_w <- stats::reshape(out, direction = "wide",
                                idvar = "obs", timevar = "effect",
                                v.names = c("fit", "upr", "lwr"), sep = "_")
        out_w$obs <- NULL
        sim_data <- cbind(sim_data, out_w)
    } else if ( .parallel & !requireNamespace("foreach", quietly = TRUE)) {
      warning("foreach package is unavailable, parallel computing not available")
    } else {
      out <- predictInterval(merMod, newdata = sim_data,
                             which = "all", ...)
      out_w <- stats::reshape(out, direction = "wide",
                       idvar = "obs", timevar = "effect",
                       v.names = c("fit", "upr", "lwr"), sep = "_")
      out_w$obs <- NULL
      sim_data <- cbind(sim_data, out_w)
    }
  # Case is the number of the row in newdata
  # obs is the variance among the selected random effects to marginalize over
  # So we want to collapse by case if we can

  return(sim_data)
}