1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221
|
#' @title Rescale design weights for multilevel analysis
#' @name rescale_weights
#'
#' @description Most functions to fit multilevel and mixed effects models only
#' allow to specify frequency weights, but not design (i.e. sampling or
#' probability) weights, which should be used when analyzing complex samples
#' and survey data. `rescale_weights()` implements an algorithm proposed
#' by \cite{Asparouhov (2006)} and \cite{Carle (2009)} to rescale design
#' weights in survey data to account for the grouping structure of multilevel
#' models, which then can be used for multilevel modelling.
#'
#' @param data A data frame.
#' @param group Variable names (as character vector, or as formula), indicating
#' the grouping structure (strata) of the survey data (level-2-cluster
#' variable). It is also possible to create weights for multiple group
#' variables; in such cases, each created weighting variable will be suffixed
#' by the name of the group variable.
#' @param probability_weights Variable indicating the probability (design or
#' sampling) weights of the survey data (level-1-weight).
#' @param nest Logical, if `TRUE` and `group` indicates at least two
#' group variables, then groups are "nested", i.e. groups are now a
#' combination from each group level of the variables in `group`.
#'
#' @return `data`, including the new weighting variables: `pweights_a`
#' and `pweights_b`, which represent the rescaled design weights to use
#' in multilevel models (use these variables for the `weights` argument).
#'
#' @details
#'
#' Rescaling is based on two methods: For `pweights_a`, the sample weights
#' `probability_weights` are adjusted by a factor that represents the proportion
#' of group size divided by the sum of sampling weights within each group. The
#' adjustment factor for `pweights_b` is the sum of sample weights within each
#' group divided by the sum of squared sample weights within each group (see
#' Carle (2009), Appendix B). In other words, `pweights_a` "scales the weights
#' so that the new weights sum to the cluster sample size" while `pweights_b`
#' "scales the weights so that the new weights sum to the effective cluster
#' size".
#'
#' Regarding the choice between scaling methods A and B, Carle suggests that
#' "analysts who wish to discuss point estimates should report results based on
#' weighting method A. For analysts more interested in residual between-group
#' variance, method B may generally provide the least biased estimates". In
#' general, it is recommended to fit a non-weighted model and weighted models
#' with both scaling methods and when comparing the models, see whether the
#' "inferential decisions converge", to gain confidence in the results.
#'
#' Though the bias of scaled weights decreases with increasing group size,
#' method A is preferred when insufficient or low group size is a concern.
#'
#' The group ID and probably PSU may be used as random effects (e.g. nested
#' design, or group and PSU as varying intercepts), depending on the survey
#' design that should be mimicked.
#'
#' @references
#' - Carle A.C. (2009). Fitting multilevel models in complex survey data
#' with design weights: Recommendations. BMC Medical Research Methodology
#' 9(49): 1-13
#'
#' - Asparouhov T. (2006). General Multi-Level Modeling with Sampling
#' Weights. Communications in Statistics - Theory and Methods 35: 439-460
#'
#' @examples
#' if (require("lme4")) {
#' data(nhanes_sample)
#' head(rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR"))
#'
#' # also works with multiple group-variables
#' head(rescale_weights(nhanes_sample, c("SDMVSTRA", "SDMVPSU"), "WTINT2YR"))
#'
#' # or nested structures.
#' x <- rescale_weights(
#' data = nhanes_sample,
#' group = c("SDMVSTRA", "SDMVPSU"),
#' probability_weights = "WTINT2YR",
#' nest = TRUE
#' )
#' head(x)
#'
#' nhanes_sample <- rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR")
#'
#' glmer(
#' total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)) + (1 | SDMVPSU),
#' family = poisson(),
#' data = nhanes_sample,
#' weights = pweights_a
#' )
#' }
#' @export
rescale_weights <- function(data, group, probability_weights, nest = FALSE) {
if (inherits(group, "formula")) {
group <- all.vars(group)
}
# check if weight has missings. we need to remove them first,
# and add back weights to correct cases later
weight_missings <- which(is.na(data[[probability_weights]]))
weight_non_na <- which(!is.na(data[[probability_weights]]))
if (length(weight_missings) > 0) {
data_tmp <- data[weight_non_na, ]
} else {
data_tmp <- data
}
# sort id
data_tmp$.bamboozled <- 1:nrow(data_tmp)
if (nest && length(group) < 2) {
insight::format_warning(
sprintf(
"Only one group variable selected, no nested structure possible. Rescaling weights for grout '%s' now.",
group
)
)
nest <- FALSE
}
if (nest) {
out <- .rescale_weights_nested(data_tmp, group, probability_weights, nrow(data), weight_non_na)
} else {
out <- lapply(group, function(i) {
x <- .rescale_weights(data_tmp, i, probability_weights, nrow(data), weight_non_na)
if (length(group) > 1) {
colnames(x) <- sprintf(c("pweight_a_%s", "pweight_b_%s"), i)
}
x
})
}
do.call(cbind, list(data, out))
}
# rescale weights, for one or more group variables ----------------------------
.rescale_weights <- function(x, group, probability_weights, n, weight_non_na) {
# compute sum of weights per group
design_weights <- .data_frame(
group = sort(unique(x[[group]])),
sum_weights_by_group = tapply(x[[probability_weights]], as.factor(x[[group]]), sum),
sum_squared_weights_by_group = tapply(x[[probability_weights]]^2, as.factor(x[[group]]), sum),
n_per_group = as.vector(table(x[[group]]))
)
colnames(design_weights)[1] <- group
x <- merge(x, design_weights, by = group, sort = FALSE)
# restore original order
x <- x[order(x$.bamboozled), ]
x$.bamboozled <- NULL
# multiply the original weight by the fraction of the
# sampling unit total population based on Carle 2009
w_a <- x[[probability_weights]] * x$n_per_group / x$sum_weights_by_group
w_b <- x[[probability_weights]] * x$sum_weights_by_group / x$sum_squared_weights_by_group
out <- data.frame(
pweights_a = rep(as.numeric(NA), times = n),
pweights_b = rep(as.numeric(NA), times = n)
)
out$pweights_a[weight_non_na] <- w_a
out$pweights_b[weight_non_na] <- w_b
out
}
# rescale weights, for nested groups ----------------------------
.rescale_weights_nested <- function(x, group, probability_weights, n, weight_non_na) {
groups <- expand.grid(lapply(group, function(i) sort(unique(x[[i]]))))
colnames(groups) <- group
# compute sum of weights per group
design_weights <- cbind(
groups,
.data_frame(
sum_weights_by_group = unlist(as.list(tapply(
x[[probability_weights]], lapply(group, function(i) {
as.factor(x[[i]])
}), sum
))),
sum_squared_weights_by_group = unlist(as.list(tapply(
x[[probability_weights]]^2, lapply(group, function(i) {
as.factor(x[[i]])
}), sum
))),
n_per_group = unlist(as.list(table(x[, group])))
)
)
x <- merge(x, design_weights, by = group, sort = FALSE)
# restore original order
x <- x[order(x$.bamboozled), ]
x$.bamboozled <- NULL
# multiply the original weight by the fraction of the
# sampling unit total population based on Carle 2009
w_a <- x[[probability_weights]] * x$n_per_group / x$sum_weights_by_group
w_b <- x[[probability_weights]] * x$sum_weights_by_group / x$sum_squared_weights_by_group
out <- data.frame(
pweights_a = rep(as.numeric(NA), times = n),
pweights_b = rep(as.numeric(NA), times = n)
)
out$pweights_a[weight_non_na] <- w_a
out$pweights_b[weight_non_na] <- w_b
out
}
|