1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
|
#' Winsorize data
#'
#' @details
#'
#' Winsorizing or winsorization is the transformation of statistics by limiting
#' extreme values in the statistical data to reduce the effect of possibly
#' spurious outliers. The distribution of many statistics can be heavily
#' influenced by outliers. A typical strategy is to set all outliers (values
#' beyond a certain threshold) to a specified percentile of the data; for
#' example, a `90%` winsorization would see all data below the 5th percentile set
#' to the 5th percentile, and data above the 95th percentile set to the 95th
#' percentile. Winsorized estimators are usually more robust to outliers than
#' their more standard forms.
#'
#' @return
#'
#' A data frame with winsorized columns or a winsorized vector.
#'
#' @param data data frame or vector.
#' @param threshold The amount of winsorization, depends on the value of `method`:
#' - For `method = "percentile"`: the amount to winsorize from *each* tail.
#' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the *mean*/*median* (see `robust`)
#' - For `method = "raw"`: a vector of length 2 with the lower and upper bound for winsorization.
#' @param verbose Toggle warnings.
#' @param method One of "percentile" (default), "zscore", or "raw".
#' @param robust Logical, if TRUE, winsorizing through the "zscore" method is
#' done via the median and the median absolute deviation (MAD); if FALSE, via
#' the mean and the standard deviation.
#' @param ... Currently not used.
#'
#' @examples
#' hist(iris$Sepal.Length, main = "Original data")
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 0.2),
#' xlim = c(4, 8), main = "Percentile Winsorization"
#' )
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"),
#' xlim = c(4, 8), main = "Mean (+/- SD) Winsorization"
#' )
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE),
#' xlim = c(4, 8), main = "Median (+/- MAD) Winsorization"
#' )
#'
#' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"),
#' xlim = c(4, 8), main = "Raw Thresholds"
#' )
#'
#' # Also works on a data frame:
#' winsorize(iris, threshold = 0.2)
#'
#' @inherit data_rename seealso
#' @export
winsorize <- function(data, ...) {
UseMethod("winsorize")
}
#' @export
winsorize.factor <- function(data, ...) {
data
}
#' @export
winsorize.character <- winsorize.factor
#' @export
winsorize.logical <- winsorize.factor
#' @export
winsorize.data.frame <- function(data,
threshold = 0.2,
method = "percentile",
robust = FALSE,
verbose = TRUE,
...) {
data[] <- lapply(
data,
winsorize,
threshold = threshold,
method = method,
robust = robust,
verbose = verbose
)
data
}
#' @rdname winsorize
#' @export
winsorize.numeric <- function(data,
threshold = 0.2,
method = "percentile",
robust = FALSE,
verbose = TRUE,
...) {
method <- match.arg(method, choices = c("percentile", "zscore", "raw"))
if (method == "raw") {
if (length(threshold) != 2L) {
if (isTRUE(verbose)) {
insight::format_warning(
"`threshold` must be of length 2 for lower and upper bound.",
"Did not winsorize data."
)
}
return(data)
}
}
if (method == "percentile") {
if (threshold < 0 || threshold > 0.5) {
if (isTRUE(verbose)) {
insight::format_warning(
"`threshold` for winsorization must be a scalar between 0 and 0.5.",
"Did not winsorize data."
)
}
return(data)
}
y <- sort(data)
n <- length(data)
ibot <- floor(threshold * n) + 1
itop <- length(data) - ibot + 1
threshold <- c(y[ibot], y[itop])
}
if (method == "zscore") {
if (threshold <= 0) {
if (isTRUE(verbose)) {
insight::format_warning(
"'threshold' for winsorization must be a scalar greater than 0. Did not winsorize data."
)
}
return(data)
}
if (isTRUE(robust)) {
centeral <- stats::median(data, na.rm = TRUE)
deviation <- stats::mad(data, center = centeral, na.rm = TRUE)
} else {
centeral <- mean(data, na.rm = TRUE)
deviation <- stats::sd(data, na.rm = TRUE)
}
threshold <- centeral + c(-1, 1) * deviation * threshold
}
data[data < threshold[1]] <- threshold[1]
data[data > threshold[2]] <- threshold[2]
return(data)
}
|