File: winsorize.R

package info (click to toggle)
r-cran-datawizard 1.0.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,300 kB
  • sloc: sh: 13; makefile: 2
file content (146 lines) | stat: -rw-r--r-- 4,572 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#' Winsorize data
#'
#' @details
#'
#' Winsorizing or winsorization is the transformation of statistics by limiting
#' extreme values in the statistical data to reduce the effect of possibly
#' spurious outliers. The distribution of many statistics can be heavily
#' influenced by outliers. A typical strategy is to set all outliers (values
#' beyond a certain threshold) to a specified percentile of the data; for
#' example, a `90%` winsorization would see all data below the 5th percentile set
#' to the 5th percentile, and data above the 95th percentile set to the 95th
#' percentile. Winsorized estimators are usually more robust to outliers than
#' their more standard forms.
#'
#' @return
#'
#' A data frame with winsorized columns or a winsorized vector.
#'
#' @param data data frame or vector.
#' @param threshold The amount of winsorization, depends on the value of `method`:
#' - For `method = "percentile"`: the amount to winsorize from *each* tail.
#'   The value of `threshold` must be between 0 and 0.5 and of length 1.
#' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the
#'   *mean*/*median* (see `robust`). The value of `threshold` must be greater
#'   than 0 and of length 1.
#' - For `method = "raw"`: a vector of length 2 with the lower and upper bound
#'   for winsorization.
#' @param method One of "percentile" (default), "zscore", or "raw".
#' @param robust Logical, if TRUE, winsorizing through the "zscore" method is
#'   done via the median and the median absolute deviation (MAD); if FALSE, via
#'   the mean and the standard deviation.
#' @param ... Currently not used.
#' @param verbose Not used anymore since `datawizard` 0.6.6.
#'
#' @examples
#' hist(iris$Sepal.Length, main = "Original data")
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 0.2),
#'   xlim = c(4, 8), main = "Percentile Winsorization"
#' )
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"),
#'   xlim = c(4, 8), main = "Mean (+/- SD) Winsorization"
#' )
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE),
#'   xlim = c(4, 8), main = "Median (+/- MAD) Winsorization"
#' )
#'
#' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"),
#'   xlim = c(4, 8), main = "Raw Thresholds"
#' )
#'
#' # Also works on a data frame:
#' winsorize(iris, threshold = 0.2)
#'
#' @inherit data_rename seealso
#' @export
winsorize <- function(data, ...) {
  UseMethod("winsorize")
}


#' @export
winsorize.factor <- function(data, ...) {
  data
}

#' @export
winsorize.character <- winsorize.factor

#' @export
winsorize.logical <- winsorize.factor

#' @export
winsorize.data.frame <- function(data,
                                 threshold = 0.2,
                                 method = "percentile",
                                 robust = FALSE,
                                 verbose = TRUE,
                                 ...) {
  data[] <- lapply(
    data,
    winsorize,
    threshold = threshold,
    method = method,
    robust = robust,
    verbose = verbose
  )
  data
}

#' @rdname winsorize
#' @export
winsorize.numeric <- function(data,
                              threshold = 0.2,
                              method = "percentile",
                              robust = FALSE,
                              verbose = TRUE,
                              ...) {
  method <- match.arg(method, choices = c("percentile", "zscore", "raw"))

  if (method == "raw" && length(threshold) != 2L) {
    insight::format_error(
      "`threshold` must be of length 2 for lower and upper bound."
    )
  }

  if (method == "percentile") {
    if (threshold < 0 || threshold > 0.5) {
      insight::format_error(
        "`threshold` for winsorization must be a scalar between 0 and 0.5."
      )
    }

    y <- sort(data)
    n <- length(data)
    ibot <- floor(threshold * n) + 1
    itop <- length(data) - ibot + 1

    threshold <- c(y[ibot], y[itop])
  }

  if (method == "zscore") {
    if (threshold <= 0) {
      insight::format_error(
        "'threshold' for winsorization must be a scalar greater than 0."
      )
    }

    if (isTRUE(robust)) {
      centeral <- stats::median(data, na.rm = TRUE)
      deviation <- stats::mad(data, center = centeral, na.rm = TRUE)
    } else {
      centeral <- mean(data, na.rm = TRUE)
      deviation <- stats::sd(data, na.rm = TRUE)
    }

    threshold <- centeral + c(-1, 1) * deviation * threshold
  }


  data[data < threshold[1]] <- threshold[1]
  data[data > threshold[2]] <- threshold[2]
  return(data)
}