1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
|
# ---------------------------------------
# Author: Alexander Kowarik
# ---------------------------------------
#' Regression Imputation
#'
#' Impute missing values based on a regression model.
#'
#'
#' [lm()] is used for family "normal" and [glm()] for all other families.
#' (robust=TRUE: [lmrob()], [glmrob()])
#'
#' @param formula model formula to impute one variable
#' @param data A data.frame containing the data
#' @param family family argument for [glm()]. `"AUTO"` (the default) tries to choose
#' automatically and is the only really tested option!!!
#' @param robust `TRUE`/`FALSE` if robust regression should be used. See details.
#' @param mod_cat `TRUE`/`FALSE` if `TRUE` for categorical variables the level with
#' the highest prediction probability is selected, otherwise it is sampled
#' according to the probabilities.
#' @param imp_var `TRUE`/`FALSE` if a `TRUE`/`FALSE` variables for each imputed
#' variable should be created show the imputation status
#' @param imp_suffix suffix used for TF imputation variables
#' @return the imputed data set.
#' @author Alexander Kowarik
#' @references A. Kowarik, M. Templ (2016) Imputation with
#' R package VIM. *Journal of
#' Statistical Software*, 74(7), 1-16.
#' @keywords manip
#' @family imputation methods
#' @examples
#'
#' data(sleep)
#' sleepImp1 <- regressionImp(Dream+NonD~BodyWgt+BrainWgt,data=sleep)
#' sleepImp2 <- regressionImp(Sleep+Gest+Span+Dream+NonD~BodyWgt+BrainWgt,data=sleep)
#'
#' data(testdata)
#' imp_testdata1 <- regressionImp(b1+b2~x1+x2,data=testdata$wna)
#' imp_testdata3 <- regressionImp(x1~x2,data=testdata$wna,robust=TRUE)
#'
#' @export
regressionImp <- function(formula, data, family = "AUTO", robust = FALSE, imp_var = TRUE, imp_suffix = "imp", mod_cat = FALSE) {
check_data(data)
data <- as.data.frame(data)
formchar <- as.character(formula)
lhs <- gsub(" ", "", strsplit(formchar[2], "\\+")[[1]])
rhs <- formchar[3]
rhs2 <- gsub(" ", "", strsplit(rhs, "\\+")[[1]])
#Missings in RHS variables
TFna2 <- apply(subset(data, select = rhs2), 1, function(x) !any(is.na(x)))
for(lhsV in lhs) {
form <- as.formula(paste(lhsV, "~", rhs))
lhs_vector <- data[[lhsV]]
#Check if there are missings in this LHS variable
if (!any(is.na(lhs_vector))) {
cat(paste0("No missings in ", lhsV, ".\n"))
} else {
if (!inherits(family, "function") & !inherits(family, "family")){
if (family == "AUTO") {
TFna <- TFna2 & !is.na(lhs_vector)
if (is.numeric(lhs_vector)) {
nLev <- 0
if (robust) {
fn <- lmrob
} else {
fn <- lm
}
mod <- fn(form, data = data[TFna, ])
} else if (inherits(lhs_vector, "factor") || inherits(lhs_vector, "character")) {
if (inherits(lhs_vector, "character")) {
dataset[[lhsV]] <- as.factor(dataset[[lhsV]])
lhs_vector <- data[[lhsV]]
}
nLev <- length(levels(lhs_vector))
if (nLev == 2) {
fam <- binomial
if (robust) {
fn <- glmrob
} else {
fn <- glm
}
mod <- fn(form, data = data[TFna, ], family = fam)
} else {
## TODO: what to do if this clause is executed and !all(!TFna3) ?
co <- capture.output(mod <- multinom(form, data[TFna, ]))
}
}
if (imp_var) {
if (imp_var %in% colnames(data)) {
data[, paste(lhsV, "_", imp_suffix, sep = "")] <- as.logical(data[, paste(lhsV, "_", imp_suffix, sep = "")])
warning(paste("The following TRUE/FALSE imputation status variables will be updated:",
paste(lhsV, "_", imp_suffix, sep = "")))
} else {
data$NEWIMPTFVARIABLE <- is.na(lhs_vector)
colnames(data)[ncol(data)] <- paste(lhsV, "_", imp_suffix, sep = "")
}
}
TFna1 <- is.na(lhs_vector)
TFna3 <- TFna1 & TFna2
if (all(!TFna3)) {
#Check if there are missings in this LHS variable where theR RHS variables are "not missing"
cat(paste0("No missings in ", lhsV, " with valid values in the predictor variables.\n"))
} else {
tmp <- data[TFna3, ]
tmp[, lhsV] <- 1
if (nLev > 2) {
if (mod_cat) {
pre <- predict(mod, newdata = tmp)
}else{
pre <- predict(mod, newdata = tmp, type = "probs")
pre <- levels(data[, lhsV])[apply(pre, 1, function(x) sample(1:length(x), 1, prob = x))]
}
} else if (nLev == 2) {
if (mod_cat) {
pre <- predict(mod, newdata = tmp, type = "response")
pre <- levels(lhs_vector)[as.numeric(pre>.5)+1]
} else {
pre <- predict(mod, newdata = tmp, type = "response")
pre <- levels(lhs_vector)[sapply(pre, function(x) sample(1:2, 1, prob = c(1 - x, x)))]
}
} else {
pre <- predict(mod, newdata = tmp)
}
if (sum(TFna1) > sum(TFna3))
cat(paste("There still missing values in variable", lhsV, ". Probably due to missing values in the regressors.\n"))
data[TFna3, lhsV] <- pre
}
}
}else{
TFna1 <- is.na(lhs_vector)
TFna3 <- TFna1 & TFna2
tmp <- data[TFna3, ]
tmp[, lhsV] <- 1
if(robust)
mod <- glmrob(form, data = data, family = family)
else
mod <- glm(form, data = data, family = family)
pre <- predict(mod, newdata = tmp, type = "response")
data[TFna3, lhsV] <- pre
}
}
}
invisible(data)
}
|