File: regressionImp.R

package info (click to toggle)
r-cran-vim 6.2.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 1,556 kB
  • sloc: cpp: 141; sh: 12; makefile: 2
file content (146 lines) | stat: -rw-r--r-- 5,848 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# ---------------------------------------
# Author: Alexander Kowarik
# ---------------------------------------



#' Regression Imputation
#'
#' Impute missing values based on a regression model.
#'
#'
#' [lm()] is used for family "normal" and [glm()] for all other families.
#' (robust=TRUE: [lmrob()], [glmrob()])
#'
#' @param formula model formula to impute one variable
#' @param data A data.frame containing the data
#' @param family family argument for [glm()]. `"AUTO"` (the default) tries to choose
#' automatically and is the only really tested option!!!
#' @param robust `TRUE`/`FALSE` if robust regression should be used. See details.
#' @param mod_cat `TRUE`/`FALSE` if `TRUE` for categorical variables the level with
#' the highest prediction probability is selected, otherwise it is sampled
#' according to the probabilities.
#' @param imp_var `TRUE`/`FALSE` if a `TRUE`/`FALSE` variables for each imputed
#' variable should be created show the imputation status
#' @param imp_suffix suffix used for TF imputation variables
#' @return the imputed data set.
#' @author Alexander Kowarik
#' @references A. Kowarik, M. Templ (2016) Imputation with
#' R package VIM.  *Journal of
#' Statistical Software*, 74(7), 1-16.
#' @keywords manip
#' @family imputation methods
#' @examples
#'
#' data(sleep)
#' sleepImp1 <- regressionImp(Dream+NonD~BodyWgt+BrainWgt,data=sleep)
#' sleepImp2 <- regressionImp(Sleep+Gest+Span+Dream+NonD~BodyWgt+BrainWgt,data=sleep)
#'
#' data(testdata)
#' imp_testdata1 <- regressionImp(b1+b2~x1+x2,data=testdata$wna)
#' imp_testdata3 <- regressionImp(x1~x2,data=testdata$wna,robust=TRUE)
#'
#' @export
regressionImp <- function(formula, data, family = "AUTO", robust = FALSE, imp_var = TRUE, imp_suffix = "imp", mod_cat = FALSE) {
  check_data(data)
  data <- as.data.frame(data)
  formchar <- as.character(formula)
  lhs <- gsub(" ", "", strsplit(formchar[2], "\\+")[[1]])
  rhs <- formchar[3]
  rhs2 <- gsub(" ", "", strsplit(rhs, "\\+")[[1]])
  #Missings in RHS variables
  TFna2 <- apply(subset(data, select = rhs2), 1, function(x) !any(is.na(x)))
  for(lhsV in lhs) {
    form <- as.formula(paste(lhsV, "~", rhs))
    lhs_vector <- data[[lhsV]]
    #Check if there are missings in this LHS variable
    if (!any(is.na(lhs_vector))) {
      cat(paste0("No missings in ", lhsV, ".\n"))
    } else {
      if (!inherits(family, "function") & !inherits(family, "family")){
        if (family == "AUTO") {
          TFna <- TFna2 & !is.na(lhs_vector)
          if (is.numeric(lhs_vector)) {
            nLev <- 0
            if (robust) {
              fn <- lmrob
            } else {
              fn <- lm
            }
            mod <- fn(form, data = data[TFna, ])
          } else if (inherits(lhs_vector, "factor") || inherits(lhs_vector, "character")) {
            if (inherits(lhs_vector, "character")) {
              dataset[[lhsV]] <- as.factor(dataset[[lhsV]])
              lhs_vector <- data[[lhsV]]
            }
            nLev <- length(levels(lhs_vector))
            if (nLev == 2) {
              fam <- binomial
              if (robust) {
                fn <- glmrob
              } else {
                fn <- glm
              }
              mod <- fn(form, data = data[TFna, ], family = fam)
            } else {
              ## TODO: what to do if this clause is executed and !all(!TFna3) ?
              co <- capture.output(mod <- multinom(form, data[TFna, ]))
            }
          }
          if (imp_var) {
            if (imp_var %in% colnames(data)) {
              data[, paste(lhsV, "_", imp_suffix, sep = "")] <- as.logical(data[, paste(lhsV, "_", imp_suffix, sep = "")])
              warning(paste("The following TRUE/FALSE imputation status variables will be updated:",
                      paste(lhsV, "_", imp_suffix, sep = "")))
            } else {
              data$NEWIMPTFVARIABLE <- is.na(lhs_vector)
              colnames(data)[ncol(data)] <- paste(lhsV, "_", imp_suffix, sep = "")
            }
          }
          TFna1 <- is.na(lhs_vector)
          TFna3 <- TFna1 & TFna2
          if (all(!TFna3)) {
            #Check if there are missings in this LHS variable where theR RHS variables are "not missing"
            cat(paste0("No missings in ", lhsV, " with valid values in the predictor variables.\n"))
          } else {
            tmp <- data[TFna3, ]
            tmp[, lhsV] <- 1
            if (nLev > 2) {
              if (mod_cat) {
                pre <- predict(mod, newdata = tmp)
              }else{
                pre <- predict(mod, newdata = tmp, type = "probs")
                pre <- levels(data[, lhsV])[apply(pre, 1, function(x) sample(1:length(x), 1, prob = x))]
              }
            } else if (nLev == 2) {
              if (mod_cat) {
                pre <- predict(mod, newdata = tmp, type = "response")
                pre <- levels(lhs_vector)[as.numeric(pre>.5)+1]
              } else {
                pre <- predict(mod, newdata = tmp, type = "response")
                pre <- levels(lhs_vector)[sapply(pre, function(x) sample(1:2, 1, prob = c(1 - x, x)))]
              }
            } else {
              pre <- predict(mod, newdata = tmp)
            }
            if (sum(TFna1) > sum(TFna3))
              cat(paste("There still missing values in variable", lhsV, ". Probably due to missing values in the regressors.\n"))
            data[TFna3, lhsV] <- pre
          }
        }
      }else{
        TFna1 <- is.na(lhs_vector)
        TFna3 <- TFna1 & TFna2
        tmp <- data[TFna3, ]
        tmp[, lhsV] <- 1
        if(robust)
          mod <- glmrob(form, data = data, family = family)
        else
          mod <- glm(form, data = data, family = family)
        pre <- predict(mod, newdata = tmp, type = "response")
        data[TFna3, lhsV] <- pre
      }
    }
  }
  invisible(data)
}