File: winsorize.R

package info (click to toggle)
r-cran-datawizard 0.6.5%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,736 kB
  • sloc: sh: 13; makefile: 2
file content (155 lines) | stat: -rw-r--r-- 4,702 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#' Winsorize data
#'
#' @details
#'
#' Winsorizing or winsorization is the transformation of statistics by limiting
#' extreme values in the statistical data to reduce the effect of possibly
#' spurious outliers. The distribution of many statistics can be heavily
#' influenced by outliers. A typical strategy is to set all outliers (values
#' beyond a certain threshold) to a specified percentile of the data; for
#' example, a `90%` winsorization would see all data below the 5th percentile set
#' to the 5th percentile, and data above the 95th percentile set to the 95th
#' percentile. Winsorized estimators are usually more robust to outliers than
#' their more standard forms.
#'
#' @return
#'
#' A data frame with winsorized columns or a winsorized vector.
#'
#' @param data data frame or vector.
#' @param threshold The amount of winsorization, depends on the value of `method`:
#' - For `method = "percentile"`: the amount to winsorize from *each* tail.
#' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the *mean*/*median* (see `robust`)
#' - For `method = "raw"`: a vector of length 2 with the lower and upper bound for winsorization.
#' @param verbose Toggle warnings.
#' @param method One of "percentile" (default), "zscore", or "raw".
#' @param robust Logical, if TRUE, winsorizing through the "zscore" method is
#'   done via the median and the median absolute deviation (MAD); if FALSE, via
#'   the mean and the standard deviation.
#' @param ... Currently not used.
#'
#' @examples
#' hist(iris$Sepal.Length, main = "Original data")
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 0.2),
#'   xlim = c(4, 8), main = "Percentile Winsorization"
#' )
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"),
#'   xlim = c(4, 8), main = "Mean (+/- SD) Winsorization"
#' )
#'
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE),
#'   xlim = c(4, 8), main = "Median (+/- MAD) Winsorization"
#' )
#'
#' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"),
#'   xlim = c(4, 8), main = "Raw Thresholds"
#' )
#'
#' # Also works on a data frame:
#' winsorize(iris, threshold = 0.2)
#'
#' @inherit data_rename seealso
#' @export
winsorize <- function(data, ...) {
  UseMethod("winsorize")
}


#' @export
winsorize.factor <- function(data, ...) {
  data
}

#' @export
winsorize.character <- winsorize.factor

#' @export
winsorize.logical <- winsorize.factor

#' @export
winsorize.data.frame <- function(data,
                                 threshold = 0.2,
                                 method = "percentile",
                                 robust = FALSE,
                                 verbose = TRUE,
                                 ...) {
  data[] <- lapply(
    data,
    winsorize,
    threshold = threshold,
    method = method,
    robust = robust,
    verbose = verbose
  )
  data
}

#' @rdname winsorize
#' @export
winsorize.numeric <- function(data,
                              threshold = 0.2,
                              method = "percentile",
                              robust = FALSE,
                              verbose = TRUE,
                              ...) {
  method <- match.arg(method, choices = c("percentile", "zscore", "raw"))

  if (method == "raw") {
    if (length(threshold) != 2L) {
      if (isTRUE(verbose)) {
        insight::format_warning(
          "`threshold` must be of length 2 for lower and upper bound.",
          "Did not winsorize data."
        )
      }
      return(data)
    }
  }

  if (method == "percentile") {
    if (threshold < 0 || threshold > 0.5) {
      if (isTRUE(verbose)) {
        insight::format_warning(
          "`threshold` for winsorization must be a scalar between 0 and 0.5.",
          "Did not winsorize data."
        )
      }
      return(data)
    }

    y <- sort(data)
    n <- length(data)
    ibot <- floor(threshold * n) + 1
    itop <- length(data) - ibot + 1

    threshold <- c(y[ibot], y[itop])
  }

  if (method == "zscore") {
    if (threshold <= 0) {
      if (isTRUE(verbose)) {
        insight::format_warning(
          "'threshold' for winsorization must be a scalar greater than 0. Did not winsorize data."
        )
      }
      return(data)
    }

    if (isTRUE(robust)) {
      centeral <- stats::median(data, na.rm = TRUE)
      deviation <- stats::mad(data, center = centeral, na.rm = TRUE)
    } else {
      centeral <- mean(data, na.rm = TRUE)
      deviation <- stats::sd(data, na.rm = TRUE)
    }

    threshold <- centeral + c(-1, 1) * deviation * threshold
  }


  data[data < threshold[1]] <- threshold[1]
  data[data > threshold[2]] <- threshold[2]
  return(data)
}