1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302
|
#' Summarise y values at unique/binned x
#'
#' `stat_summary()` operates on unique `x` or `y`; `stat_summary_bin()`
#' operates on binned `x` or `y`. They are more flexible versions of
#' [stat_bin()]: instead of just counting, they can compute any
#' aggregate.
#'
#' @eval rd_orientation()
#'
#' @eval rd_aesthetics("stat", "summary")
#' @seealso [geom_errorbar()], [geom_pointrange()],
#' [geom_linerange()], [geom_crossbar()] for geoms to
#' display summarised data
#' @inheritParams stat_identity
#' @section Summary functions:
#' You can either supply summary functions individually (`fun`,
#' `fun.max`, `fun.min`), or as a single function (`fun.data`):
#'
#' \describe{
#' \item{fun.data}{Complete summary function. Should take numeric vector as
#' input and return data frame as output}
#' \item{fun.min}{min summary function (should take numeric vector and
#' return single number)}
#' \item{fun}{main summary function (should take numeric vector and return
#' single number)}
#' \item{fun.max}{max summary function (should take numeric vector and
#' return single number)}
#' }
#'
#' A simple vector function is easiest to work with as you can return a single
#' number, but is somewhat less flexible. If your summary function computes
#' multiple values at once (e.g. min and max), use `fun.data`.
#'
#' `fun.data` will receive data as if it was oriented along the x-axis and
#' should return a data.frame that corresponds to that orientation. The layer
#' will take care of flipping the input and output if it is oriented along the
#' y-axis.
#'
#' If no aggregation functions are supplied, will default to
#' [mean_se()].
#'
#' @param fun.data A function that is given the complete data and should
#' return a data frame with variables `ymin`, `y`, and `ymax`.
#' @param fun.min,fun,fun.max Alternatively, supply three individual
#' functions that are each passed a vector of values and should return a
#' single number.
#' @param fun.ymin,fun.y,fun.ymax `r lifecycle::badge("deprecated")` Use the
#' versions specified above instead.
#' @param fun.args Optional additional arguments passed on to the functions.
#' @export
#' @examples
#' d <- ggplot(mtcars, aes(cyl, mpg)) + geom_point()
#' d + stat_summary(fun.data = "mean_cl_boot", colour = "red", linewidth = 2, size = 3)
#'
#' # Orientation follows the discrete axis
#' ggplot(mtcars, aes(mpg, factor(cyl))) +
#' geom_point() +
#' stat_summary(fun.data = "mean_cl_boot", colour = "red", linewidth = 2, size = 3)
#'
#' # You can supply individual functions to summarise the value at
#' # each x:
#' d + stat_summary(fun = "median", colour = "red", size = 2, geom = "point")
#' d + stat_summary(fun = "mean", colour = "red", size = 2, geom = "point")
#' d + aes(colour = factor(vs)) + stat_summary(fun = mean, geom="line")
#'
#' d + stat_summary(fun = mean, fun.min = min, fun.max = max, colour = "red")
#'
#' d <- ggplot(diamonds, aes(cut))
#' d + geom_bar()
#' d + stat_summary(aes(y = price), fun = "mean", geom = "bar")
#'
#' # Orientation of stat_summary_bin is ambiguous and must be specified directly
#' ggplot(diamonds, aes(carat, price)) +
#' stat_summary_bin(fun = "mean", geom = "bar", orientation = 'y')
#'
#' \donttest{
#' # Don't use ylim to zoom into a summary plot - this throws the
#' # data away
#' p <- ggplot(mtcars, aes(cyl, mpg)) +
#' stat_summary(fun = "mean", geom = "point")
#' p
#' p + ylim(15, 30)
#' # Instead use coord_cartesian
#' p + coord_cartesian(ylim = c(15, 30))
#'
#' # A set of useful summary functions is provided from the Hmisc package:
#' stat_sum_df <- function(fun, geom="crossbar", ...) {
#' stat_summary(fun.data = fun, colour = "red", geom = geom, width = 0.2, ...)
#' }
#' d <- ggplot(mtcars, aes(cyl, mpg)) + geom_point()
#' # The crossbar geom needs grouping to be specified when used with
#' # a continuous x axis.
#' d + stat_sum_df("mean_cl_boot", mapping = aes(group = cyl))
#' d + stat_sum_df("mean_sdl", mapping = aes(group = cyl))
#' d + stat_sum_df("mean_sdl", fun.args = list(mult = 1), mapping = aes(group = cyl))
#' d + stat_sum_df("median_hilow", mapping = aes(group = cyl))
#'
#' # An example with highly skewed distributions:
#' if (require("ggplot2movies")) {
#' set.seed(596)
#' mov <- movies[sample(nrow(movies), 1000), ]
#' m2 <-
#' ggplot(mov, aes(x = factor(round(rating)), y = votes)) +
#' geom_point()
#' m2 <-
#' m2 +
#' stat_summary(
#' fun.data = "mean_cl_boot",
#' geom = "crossbar",
#' colour = "red", width = 0.3
#' ) +
#' xlab("rating")
#' m2
#' # Notice how the overplotting skews off visual perception of the mean
#' # supplementing the raw data with summary statistics is _very_ important
#'
#' # Next, we'll look at votes on a log scale.
#'
#' # Transforming the scale means the data are transformed
#' # first, after which statistics are computed:
#' m2 + scale_y_log10()
#' # Transforming the coordinate system occurs after the
#' # statistic has been computed. This means we're calculating the summary on the raw data
#' # and stretching the geoms onto the log scale. Compare the widths of the
#' # standard errors.
#' m2 + coord_trans(y="log10")
#' }
#' }
stat_summary <- function(mapping = NULL, data = NULL,
geom = "pointrange", position = "identity",
...,
fun.data = NULL,
fun = NULL,
fun.max = NULL,
fun.min = NULL,
fun.args = list(),
na.rm = FALSE,
orientation = NA,
show.legend = NA,
inherit.aes = TRUE,
fun.y = deprecated(),
fun.ymin = deprecated(),
fun.ymax = deprecated()) {
if (lifecycle::is_present(fun.y)) {
deprecate_warn0("3.3.0", "stat_summary(fun.y)", "stat_summary(fun)")
fun = fun %||% fun.y
}
if (lifecycle::is_present(fun.ymin)) {
deprecate_warn0("3.3.0", "stat_summary(fun.ymin)", "stat_summary(fun.min)")
fun.min = fun.min %||% fun.ymin
}
if (lifecycle::is_present(fun.ymax)) {
deprecate_warn0("3.3.0", "stat_summary(fun.ymax)", "stat_summary(fun.max)")
fun.max = fun.max %||% fun.ymax
}
layer(
data = data,
mapping = mapping,
stat = StatSummary,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list2(
fun.data = fun.data,
fun = fun,
fun.max = fun.max,
fun.min = fun.min,
fun.args = fun.args,
na.rm = na.rm,
orientation = orientation,
...
)
)
}
#' @rdname ggplot2-ggproto
#' @format NULL
#' @usage NULL
#' @export
StatSummary <- ggproto("StatSummary", Stat,
required_aes = c("x", "y"),
extra_params = c("na.rm", "orientation"),
setup_params = function(data, params) {
params$flipped_aes <- has_flipped_aes(data, params)
params
},
compute_panel = function(data, scales, fun.data = NULL, fun = NULL,
fun.max = NULL, fun.min = NULL, fun.args = list(),
na.rm = FALSE, flipped_aes = FALSE) {
data <- flip_data(data, flipped_aes)
fun <- make_summary_fun(fun.data, fun, fun.max, fun.min, fun.args)
summarised <- summarise_by_x(data, fun)
summarised$flipped_aes <- flipped_aes
flip_data(summarised, flipped_aes)
}
)
# Summarise a data.frame by parts
# Summarise a data frame by unique value of x
#
# This function is used by [stat_summary()] to break a
# data.frame into pieces, summarise each piece, and join the pieces
# back together, retaining original columns unaffected by the summary.
#
# @param [data.frame()] to summarise
# @param vector to summarise by
# @param summary function (must take and return a data.frame)
# @param other arguments passed on to summary function
# @keyword internal
summarise_by_x <- function(data, summary, ...) {
summary <- dapply(data, c("group", "x"), summary, ...)
unique <- dapply(data, c("group", "x"), uniquecols)
unique$y <- NULL
merge(summary, unique, by = c("x", "group"), sort = FALSE)
}
#' A selection of summary functions from Hmisc
#'
#' @description
#' These are wrappers around functions from \pkg{Hmisc} designed to make them
#' easier to use with [stat_summary()]. See the Hmisc documentation
#' for more details:
#'
#' - [Hmisc::smean.cl.boot()]
#' - [Hmisc::smean.cl.normal()]
#' - [Hmisc::smean.sdl()]
#' - [Hmisc::smedian.hilow()]
#'
#' @param x a numeric vector
#' @param ... other arguments passed on to the respective Hmisc function.
#' @return A data frame with columns `y`, `ymin`, and `ymax`.
#' @name hmisc
#' @examples
#' if (requireNamespace("Hmisc", quietly = TRUE)) {
#' set.seed(1)
#' x <- rnorm(100)
#' mean_cl_boot(x)
#' mean_cl_normal(x)
#' mean_sdl(x)
#' median_hilow(x)
#' }
NULL
wrap_hmisc <- function(fun) {
function(x, ...) {
check_installed("Hmisc")
fun <- getExportedValue("Hmisc", fun)
result <- fun(x = x, ...)
rename(
data_frame0(!!!as.list(result)),
c(Median = "y", Mean = "y", Lower = "ymin", Upper = "ymax")
)
}
}
#' @export
#' @rdname hmisc
mean_cl_boot <- wrap_hmisc("smean.cl.boot")
#' @export
#' @rdname hmisc
mean_cl_normal <- wrap_hmisc("smean.cl.normal")
#' @export
#' @rdname hmisc
mean_sdl <- wrap_hmisc("smean.sdl")
#' @export
#' @rdname hmisc
median_hilow <- wrap_hmisc("smedian.hilow")
#' Calculate mean and standard error of the mean
#'
#' For use with [stat_summary()]
#'
#' @param x numeric vector.
#' @param mult number of multiples of standard error.
#' @return A data frame with three columns:
#' \describe{
#' \item{`y`}{ The mean.}
#' \item{`ymin`}{ The mean minus the multiples of the standard error.}
#' \item{`ymax`}{ The mean plus the multiples of the standard error.}
#' }
#' @export
#' @examples
#' set.seed(1)
#' x <- rnorm(100)
#' mean_se(x)
mean_se <- function(x, mult = 1) {
x <- stats::na.omit(x)
se <- mult * sqrt(stats::var(x) / length(x))
mean <- mean(x)
data_frame0(
y = mean,
ymin = mean - se,
ymax = mean + se,
.size = 1
)
}
|