File: ImputeMethods.R

package info (click to toggle)
r-cran-mlr 2.19.2%2Bdfsg-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 8,264 kB
sloc: ansic: 65; sh: 13; makefile: 5
file content (335 lines) | stat: -rw-r--r-- 11,439 bytes
parent folder | download | duplicates (3)
# FIXME: From a design point of view, using the task here as input argument
# might be more extensible for us later and
# convenient for the user.
# For us, we can use polymorphism, and for the user
# the task contains a lot of helpful info.
# But I do not want to revamp the structure now

#' Create a custom imputation method.
#'
#' This is a constructor to create your own imputation methods.
#' @param learn (`function(data, target, col, ...)`)\cr
#'   Function to learn and extract information on column `col`
#'   out of data frame `data`. Argument `target` specifies
#'   the target column of the learning task.
#'   The function has to return a named list of values.
#' @param impute (`function(data, target, col, ...)`)\cr
#'   Function to impute missing values in `col` using information
#'   returned by `learn` on the same column.
#'   All list elements of the return values o `learn`
#'   are passed to this function into `...`.
#' @param args ([list])\cr
#'   Named list of arguments to pass to `learn` via `...`.
#' @family impute
#' @export
makeImputeMethod = function(learn, impute, args = list()) {
  assertFunction(learn, args = c("data", "target", "col"))
  assertFunction(impute, args = c("data", "target", "col"))
  assertList(args, names = "named")
  setClasses(list(learn = learn, impute = impute, args = args), "ImputeMethod")
}

# helper function to impute missings of a col to const val
simpleImpute = function(data, target, col, const) {
  if (is.na(const)) {
    stopf("Error imputing column '%s'. Maybe all input data was missing?", col)
  }
  x = data[[col]]

  # cast logicals to factor if required (#1522)
  if (is.logical(x) && !is.logical(const)) {
    x = as.factor(x)
  }
  if (is.factor(x) && const %nin% levels(x)) {
    levels(x) = c(levels(x), as.character(const))
  }
  replace(x, is.na(x), const)
}


#' Built-in imputation methods.
#'
#' The built-ins are:
#' \itemize{
#'   \item `imputeConstant(const)` for imputation using a constant value,
#'   \item `imputeMedian()` for imputation using the median,
#'   \item `imputeMode()` for imputation using the mode,
#'   \item `imputeMin(multiplier)` for imputing constant values shifted below the minimum
#'     using `min(x) - multiplier * diff(range(x))`,
#'   \item `imputeMax(multiplier)` for imputing constant values shifted above the maximum
#'     using `max(x) + multiplier * diff(range(x))`,
#'   \item `imputeNormal(mean, sd)` for imputation using normally
#'     distributed random values. Mean and standard deviation will be calculated
#'     from the data if not provided.
#'   \item `imputeHist(breaks, use.mids)` for imputation using random values
#'     with probabilities calculated using `table` or `hist`.
#'   \item `imputeLearner(learner, features = NULL)` for imputations using the response
#'     of a classification or regression learner.
#' }
#' @name imputations
#' @rdname imputations
#' @family impute
NULL

#' @export
#' @param const (any)\cr
#'  Constant valued use for imputation.
#' @rdname imputations
imputeConstant = function(const) {
  assertVector(const, len = 1L, any.missing = FALSE)
  makeImputeMethod(
    learn = function(data, target, col, const) const,
    impute = simpleImpute,
    args = list(const = const)
  )
}

#' @export
#' @rdname imputations
imputeMedian = function() {
  makeImputeMethod(
    learn = function(data, target, col) median(data[[col]], na.rm = TRUE),
    impute = simpleImpute
  )
}

#' @export
#' @rdname imputations
imputeMean = function() {
  makeImputeMethod(
    learn = function(data, target, col) mean(data[[col]], na.rm = TRUE),
    impute = simpleImpute
  )
}

#' @export
#' @rdname imputations
imputeMode = function() {
  makeImputeMethod(
    learn = function(data, target, col) computeMode(data[[col]], na.rm = TRUE),
    impute = simpleImpute
  )
}

#' @export
#' @param multiplier (`numeric(1)`)\cr
#'   Value that stored minimum or maximum is multiplied with when imputation is done.
#' @rdname imputations
imputeMin = function(multiplier = 1) {
  assertNumber(multiplier)
  makeImputeMethod(
    learn = function(data, target, col, multiplier) {
      r = range(data[[col]], na.rm = TRUE)
      r[1L] - multiplier * diff(r)
    },
    impute = simpleImpute,
    args = list(multiplier = multiplier)
  )
}

#' @export
#' @rdname imputations
imputeMax = function(multiplier = 1) {
  assertNumber(multiplier)
  makeImputeMethod(
    learn = function(data, target, col, multiplier) {
      r = range(data[[col]], na.rm = TRUE)
      r[2L] + multiplier * diff(r)
    },
    impute = simpleImpute,
    args = list(multiplier = multiplier)
  )
}

#' @export
#' @param min (`numeric(1)`)\cr
#'   Lower bound for uniform distribution.
#'   If NA (default), it will be estimated from the data.
#' @param max (`numeric(1)`)\cr
#'   Upper bound for uniform distribution.
#'   If NA (default), it will be estimated from the data.
#' @rdname imputations
imputeUniform = function(min = NA_real_, max = NA_real_) {
  assertNumber(min, na.ok = TRUE)
  assertNumber(max, na.ok = TRUE)
  makeImputeMethod(
    learn = function(data, target, col, min, max) {
      if (is.na(min)) {
        min = min(data[[col]], na.rm = TRUE)
        if (is.na(min)) {
          stop("All values are missing. Unable to calculate minimum.")
        }
      }
      if (is.na(max)) {
        max = max(data[[col]], na.rm = TRUE)
        if (is.na(max)) {
          stop("All values are missing. Unable to calculate maximum.")
        }
      }
      list(min = min, max = max)
    },
    impute = function(data, target, col, min, max) {
      x = data[[col]]
      ind = is.na(x)
      replace(x, ind, runif(sum(ind), min = min, max = max))
    },
    args = list(min = min, max = max)
  )
}

#' @export
#' @param mu (`numeric(1)`)\cr
#'   Mean of normal distribution. If missing it will be estimated from the data.
#' @param sd (`numeric(1)`)\cr
#'   Standard deviation of normal distribution. If missing it will be estimated from the data.
#' @rdname imputations
imputeNormal = function(mu = NA_real_, sd = NA_real_) {
  assertNumber(mu, na.ok = TRUE)
  assertNumber(sd, na.ok = TRUE)

  makeImputeMethod(
    learn = function(data, target, col, mu, sd) {
      if (is.na(mu)) {
        mu = mean(data[[col]], na.rm = TRUE)
        if (is.na(mu)) {
          stop("All values missing. Unable to calculate mean.")
        }
      }
      if (is.na(sd)) {
        sd = sd(data[[col]], na.rm = TRUE)
        if (is.na(sd)) {
          stop("All values missing. Unable to calculate sd.")
        }
      }
      list(mu = mu, sd = sd)
    },
    impute = function(data, target, col, mu, sd) {
      x = data[[col]]
      ind = is.na(x)
      replace(x, ind, rnorm(sum(ind), mean = mu, sd = sd))
    },
    args = list(mu = mu, sd = sd)
  )
}

#' @export
#' @param breaks (`numeric(1)`)\cr
#'  Number of breaks to use in [graphics::hist]. If missing,
#'  defaults to auto-detection via \dQuote{Sturges}.
#' @param use.mids (`logical(1)`)\cr
#'  If `x` is numeric and a histogram is used, impute with bin mids (default)
#'  or instead draw uniformly distributed samples within bin range.
#' @rdname imputations
imputeHist = function(breaks, use.mids = TRUE) {
  if (missing(breaks)) {
    breaks = "Sturges"
  }
  if (!identical(breaks, "Sturges")) {
    breaks = asCount(breaks)
  }
  assertFlag(use.mids)

  makeImputeMethod(

    learn = function(data, target, col, breaks, use.mids) {
      x = data[[col]]
      if (all(is.na(x))) {
        stop("All values missing. Unable to impute with Hist.")
      }
      if (is.numeric(x)) {
        tmp = hist(x, breaks = breaks, plot = FALSE)
        if (use.mids) {
          return(list(counts = tmp$counts, values = tmp$mids))
        } else {
          return(list(counts = tmp$counts, breaks = tmp$breaks))
        }
      } else { # factor or logical feature
        tmp = table(x, useNA = "no")
        values = names(tmp)
        if (is.logical(x)) {
          values = as.logical(x)
        }
        return(list(counts = as.integer(tmp), values = values))
      }
    },

    impute = function(data, target, col, counts, values, breaks) {
      x = data[[col]]
      ind = which(is.na(x))
      if (missing(values)) {
        w = sample(seq_along(counts), length(ind), replace = TRUE, prob = counts)
        values = runif(length(ind), min = head(breaks, -1L)[w], max = tail(breaks, -1L)[w])
      } else {
        values = sample(values, length(ind), replace = TRUE, prob = counts)
      }
      replace(x, ind, values)
    },
    args = list(breaks = breaks, use.mids = use.mids)
  )
}

#' @param learner ([Learner] | `character(1)`)\cr
#'  Supervised learner. Its predictions will be used for imputations.
#'  If you pass a string the learner will be created via [makeLearner].
#'  Note that the target column is not available for this operation.
#' @param features ([character])\cr
#'  Features to use in `learner` for prediction.
#'  Default is `NULL` which uses all available features except the target column
#'  of the original task.
#' @rdname imputations
#' @export
imputeLearner = function(learner, features = NULL) {
  learner = checkLearner(learner)
  if (!is.null(features)) {
    assertCharacter(features, any.missing = FALSE)
  }

  makeImputeMethod(
    learn = function(data, target, col, learner, features) {

      constructor = getTaskConstructorForLearner(learner)
      if (is.null(features)) {
        features = setdiff(names(data), target)
      } else {
        not.ok = which(features %nin% names(data))
        if (length(not.ok)) {
          stopf("Features for imputation not found in data: '%s'", collapse(features[not.ok]))
        }
        not.ok = which.first(target %in% features)
        if (length(not.ok)) {
          stopf("Target column used as feature for imputation: '%s'", target[not.ok])
        }
        if (col %nin% features) {
          features = c(col, features)
        }
      }
      # features used for imputation might have NAs, but the learner might not support that
      # we need an extra check, otherwise this might not get noticed by checkLearnerBeforeTrain because
      # we remove observations with NAs in column col before generating the task
      impute.feats = setdiff(features, col)
      if (anyMissing(data[impute.feats]) && !hasLearnerProperties(learner, "missings")) {
        has.na = vlapply(data[impute.feats], anyMissing)
        wrong.feats = clipString(collapse(colnames(data[impute.feats])[has.na], ", "), 50L)
        stopf("Feature(s) '%s' used for imputation has/have missing values, but learner '%s' does not support that!", wrong.feats, learner$id)
      }
      # remove all observations with missing values in column col (which is the target in the imputation task)
      ind = !is.na(data[[col]])
      task = constructor("impute", data = subset(data, subset = ind, select = features), target = col,
        check.data = TRUE, fixup.data = "quiet")
      list(model = train(learner, task), features = features)
    },

    impute = function(data, target, col, model, features) {
      x = data[[col]]
      ind = is.na(x)
      # if no NAs are present in data, we always return it unchanged
      if (all(!ind)) {
        return(x)
      }
      newdata = as.data.frame(data)[ind, features, drop = FALSE]
      p = predict(model, newdata = newdata)$data$response
      replace(x, ind, p)
    },
    args = list(learner = learner, features = features)
  )
}