1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
|
#' @title Summarize data
#' @name data_summary
#'
#' @description This function can be used to compute summary statistics for a
#' data frame or a matrix.
#'
#' @param x A (grouped) data frame.
#' @param by Optional character string, indicating the names of one or more
#' variables in the data frame. If supplied, the data will be split by these
#' variables and summary statistics will be computed for each group.
#' @param remove_na Logical. If `TRUE`, missing values are omitted from the
#' grouping variable. If `FALSE` (default), missing values are included as a
#' level in the grouping variable.
#' @param ... One or more named expressions that define the new variable name
#' and the function to compute the summary statistic. Example:
#' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided
#' as a character string, e.g. `"mean_sepal_width = mean(Sepal.Width)"`. The
#' summary function `n()` can be used to count the number of observations.
#'
#' @return A data frame with the requested summary statistics.
#'
#' @examples
#' data(iris)
#' data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
#' data_summary(
#' iris,
#' MW = mean(Sepal.Width),
#' SD = sd(Sepal.Width),
#' by = "Species"
#' )
#'
#' # same as
#' d <- data_group(iris, "Species")
#' data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
#'
#' # multiple groups
#' data(mtcars)
#' data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear"))
#'
#' # expressions can also be supplied as character strings
#' data_summary(mtcars, "MW = mean(mpg)", "SD = sd(mpg)", by = c("am", "gear"))
#'
#' # count observations within groups
#' data_summary(mtcars, observations = n(), by = c("am", "gear"))
#'
#' # first and last observations of "mpg" within groups
#' data_summary(
#' mtcars,
#' first = mpg[1],
#' last = mpg[length(mpg)],
#' by = c("am", "gear")
#' )
#' @export
data_summary <- function(x, ...) {
UseMethod("data_summary")
}
#' @export
data_summary.matrix <- function(x, ..., by = NULL, remove_na = FALSE) {
data_summary(as.data.frame(x), ..., by = by, remove_na = remove_na)
}
#' @export
data_summary.default <- function(x, ...) {
insight::format_error("`data_summary()` only works for (grouped) data frames and matrices.")
}
#' @rdname data_summary
#' @export
data_summary.data.frame <- function(x, ..., by = NULL, remove_na = FALSE) {
dots <- eval(substitute(alist(...)))
# do we have any expression at all?
if (length(dots) == 0) {
insight::format_error("No expressions for calculating summary statistics provided.")
}
if (is.null(by)) {
# when we have no grouping, just compute a one-row summary
summarise <- .process_datasummary_dots(dots, x)
out <- data.frame(summarise)
colnames(out) <- vapply(summarise, names, character(1))
} else {
# sanity check - is "by" a character string?
if (!is.character(by)) {
insight::format_error("Argument `by` must be a character string indicating the name of variables in the data.")
}
# is "by" in the data?
if (!all(by %in% colnames(x))) {
by_not_found <- by[!by %in% colnames(x)]
insight::format_error(
paste0(
"Variable",
ifelse(length(by_not_found) > 1, "s ", " "),
text_concatenate(by_not_found, enclose = "\""),
" not found in the data."
),
.misspelled_string(colnames(x), by_not_found, "Possibly misspelled?")
)
}
# split data, add NA levels, if requested
l <- lapply(x[by], function(i) {
if (remove_na || !anyNA(i)) {
i
} else {
addNA(i)
}
})
split_data <- split(x, l, drop = TRUE)
out <- lapply(split_data, function(s) {
# no data for combination? Return NULL
if (nrow(s) == 0) {
return(NULL)
}
# summarize data
summarise <- .process_datasummary_dots(dots, s)
# coerce to data frame
summarised_data <- data.frame(summarise)
# bind grouping-variables and values
summarised_data <- cbind(s[1, by], summarised_data)
# make sure we have proper column names
colnames(summarised_data) <- c(by, unlist(lapply(summarise, names)))
summarised_data
})
out <- do.call(rbind, out)
}
# sort data
out <- data_arrange(out, select = by)
# data attributes
class(out) <- c("dw_data_summary", "data.frame")
rownames(out) <- NULL
out
}
#' @export
data_summary.grouped_df <- function(x, ..., by = NULL, remove_na = FALSE) {
# extract group variables
grps <- attr(x, "groups", exact = TRUE)
group_variables <- data_remove(grps, ".rows")
# if "by" is not supplied, use group variables
if (is.null(by)) {
by <- colnames(group_variables)
}
# remove information specific to grouped df's
attr(x, "groups") <- NULL
class(x) <- "data.frame"
data_summary(x, ..., by = by, remove_na = remove_na)
}
# helper -----------------------------------------------------------------------
.process_datasummary_dots <- function(dots, data) {
out <- NULL
if (length(dots)) {
# we check for character vector of expressions, in which case
# "dots" should be unnamed
if (is.null(names(dots))) {
# if we have multiple strings, concatenate them to a character vector
# and put it into a list...
if (length(dots) > 1) {
if (all(vapply(dots, is.character, logical(1)))) {
dots <- list(unlist(dots))
} else {
insight::format_error("You cannot mix string and literal representation of expressions.")
}
}
# expression is given as character string, e.g.
# a <- "mean_sepwid = mean(Sepal.Width)"
# data_summary(iris, a, by = "Species")
# or as character vector, e.g.
# data_summary(iris, c("var_a = mean(Sepal.Width)", "var_b = sd(Sepal.Width)"))
character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL)
# do we have a character vector? Then we can proceed
if (is.character(character_symbol)) {
dots <- lapply(character_symbol, function(s) {
# turn value from character vector into expression
str2lang(.dynEval(s))
})
names(dots) <- vapply(dots, function(n) insight::safe_deparse(n[[2]]), character(1))
}
}
out <- lapply(seq_along(dots), function(i) {
new_variable <- .get_new_dots_variable(dots, i, data)
if (inherits(new_variable, c("bayestestR_ci", "bayestestR_eti"))) {
stats::setNames(new_variable, c("CI", "CI_low", "CI_high"))
} else {
stats::setNames(new_variable, names(dots)[i])
}
})
}
# check for correct length of output - must be a single value!
# Exception: bayestestR::ci()
wrong_length <- !sapply(out, inherits, what = c("bayestestR_ci", "bayestestR_eti")) & lengths(out) != 1 # nolint
if (any(wrong_length)) {
insight::format_error(
paste0(
"Each expression must return a single value. Following expression",
ifelse(sum(wrong_length) > 1, "s", " "),
" returned more than one value: ",
text_concatenate(vapply(dots[wrong_length], insight::safe_deparse, character(1)), enclose = "\"")
)
)
}
out
}
# methods ----------------------------------------------------------------------
#' @export
print.dw_data_summary <- function(x, ...) {
if (nrow(x) == 0) {
cat("No matches found.\n")
} else {
if (all(c("CI", "CI_low", "CI_high") %in% colnames(x))) {
ci <- insight::format_table(x[c("CI", "CI_low", "CI_high")], ...)
x$CI <- x$CI_low <- x$CI_high <- NULL
x <- cbind(x, ci)
}
cat(insight::export_table(x, missing = "<NA>", ...))
}
}
|