File: quantcut.R

package info (click to toggle)
gtools 3.9.5-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 748 kB
  • sloc: ansic: 190; makefile: 2
file content (141 lines) | stat: -rw-r--r-- 4,161 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#' Create a Factor Variable Using the Quantiles of a Continuous Variable
#'
#' Create a factor variable using the quantiles of a continuous variable.
#'
#'
#' This function uses \code{\link{quantile}} to obtain the specified quantiles
#' of \code{x}, then calls \code{\link{cut}} to create a factor variable using
#' the intervals specified by these quantiles.
#'
#' It properly handles cases where more than one quantile obtains the same
#' value, as in the second example below.  Note that in this case, there will
#' be fewer generated factor levels than the specified number of quantile
#' intervals.
#'
#' @param x Continuous variable.
#' @param q Either a integer number of equally spaced quantile groups to
#' create, or a vector of quantiles used for creating groups. Defaults to
#' \code{q=4} which is equivalent to \code{q=seq(0, 1, by=0.25)}.  See
#' \code{\link{quantile}} for details.
#' @param na.rm Boolean indicating whether missing values should be removed
#' when computing quantiles.  Defaults to TRUE.
#' @param \dots Optional arguments passed to \code{\link{cut}}.
#' @return Factor variable with one level for each quantile interval.
#' @author Gregory R. Warnes \email{greg@@warnes.net}
#' @seealso \code{\link{cut}}, \code{\link{quantile}}
#' @keywords manip
#' @examples
#'
#'
#' ## create example data
#' # testonly{
#' set.seed(1234)
#' # }
#' x <- rnorm(1000)
#'
#' ## cut into quartiles
#' quartiles <- quantcut(x)
#' table(quartiles)
#'
#' ## cut into deciles
#' deciles.1 <- quantcut(x, 10)
#' table(deciles.1)
#' # or equivalently
#' deciles.2 <- quantcut(x, seq(0, 1, by = 0.1))
#'
#' # testonly{
#' stopifnot(identical(deciles.1, deciles.2))
#' # }
#'
#' ## show handling of 'tied' quantiles.
#' x <- round(x) # discretize to create ties
#' stem(x) # display the ties
#' deciles <- quantcut(x, 10)
#'
#' table(deciles) # note that there are only 5 groups (not 10)
#' # due to duplicates
#' @importFrom stats quantile
#' @export
quantcut <- function(x, q = 4, na.rm = TRUE, ...) {
  if (length(q) == 1) {
    q <- seq(0, 1, length.out = q + 1)
  }

  quant <- quantile(x, q, na.rm = na.rm)
  dups <- duplicated(quant)
  if (any(dups)) {
    flag <- x %in% unique(quant[dups])
    retval <- ifelse(flag,
      paste("[",
        as.character(x),
        "]",
        sep = ""
      ),
      NA
    )
    uniqs <- unique(quant)

    # move cut points over a bit...
    reposition <- function(cut) {
      flag <- x >= cut
      if (sum(flag, na.rm = na.rm) == 0) {
        return(cut)
      } else {
        return(min(x[flag], na.rm = na.rm))
      }
    }

    newquant <- sapply(uniqs, reposition)
    retval[!flag] <- as.character(cut(x[!flag],
      breaks = newquant,
      include.lowest = TRUE, ...
    ))

    levs <- unique(retval[order(x)]) # ensure factor levels are
    # properly ordered
    retval <- factor(retval, levels = levs)

    ## determine open/closed interval ends
    mkpairs <- function(x) { # make table of lower, upper
      sapply(
        x,
        function(y) if (length(y) == 2) y[c(2, 2)] else y[2:3]
      )
    }
    pairs <- mkpairs(strsplit(levs, "[^0-9+\\.\\-]+"))
    rownames(pairs) <- c("lower.bound", "upper.bound")
    colnames(pairs) <- levs

    closed.lower <- rep(F, ncol(pairs)) # default lower is open
    closed.upper <- rep(T, ncol(pairs)) # default upper is closed
    closed.lower[1] <- TRUE # lowest interval is always closed

    for (i in 2:ncol(pairs)) { # open lower interval if above singlet
      if (pairs[1, i] == pairs[1, i - 1] && pairs[1, i] == pairs[2, i - 1]) {
        closed.lower[i] <- FALSE
      }
    }

    for (i in 1:(ncol(pairs) - 1)) { # open upper inteval if below singlet
      if (pairs[2, i] == pairs[1, i + 1] && pairs[2, i] == pairs[2, i + 1]) {
        closed.upper[i] <- FALSE
      }
    }

    levs <- ifelse(pairs[1, ] == pairs[2, ],
      pairs[1, ],
      paste(ifelse(closed.lower, "[", "("),
        pairs[1, ],
        ",",
        pairs[2, ],
        ifelse(closed.upper, "]", ")"),
        sep = ""
      )
    )
    levels(retval) <- levs
  }
  else {
    retval <- cut(x, quant, include.lowest = TRUE, ...)
  }
  return(retval)
}