File: data_summary.R

package info (click to toggle)
r-cran-datawizard 1.0.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,300 kB
  • sloc: sh: 13; makefile: 2
file content (230 lines) | stat: -rw-r--r-- 7,697 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#' @title Summarize data
#' @name data_summary
#'
#' @description This function can be used to compute summary statistics for a
#' data frame or a matrix.
#'
#' @param x A (grouped) data frame.
#' @param by Optional character string, indicating the names of one or more
#' variables in the data frame. If supplied, the data will be split by these
#' variables and summary statistics will be computed for each group.
#' @param remove_na Logical. If `TRUE`, missing values are omitted from the
#' grouping variable. If `FALSE` (default), missing values are included as a
#' level in the grouping variable.
#' @param ... One or more named expressions that define the new variable name
#' and the function to compute the summary statistic. Example:
#' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided
#' as a character string, e.g. `"mean_sepal_width = mean(Sepal.Width)"`. The
#' summary function `n()` can be used to count the number of observations.
#'
#' @return A data frame with the requested summary statistics.
#'
#' @examples
#' data(iris)
#' data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
#' data_summary(
#'   iris,
#'   MW = mean(Sepal.Width),
#'   SD = sd(Sepal.Width),
#'   by = "Species"
#' )
#'
#' # same as
#' d <- data_group(iris, "Species")
#' data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
#'
#' # multiple groups
#' data(mtcars)
#' data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear"))
#'
#' # expressions can also be supplied as character strings
#' data_summary(mtcars, "MW = mean(mpg)", "SD = sd(mpg)", by = c("am", "gear"))
#'
#' # count observations within groups
#' data_summary(mtcars, observations = n(), by = c("am", "gear"))
#'
#' # first and last observations of "mpg" within groups
#' data_summary(
#'   mtcars,
#'   first = mpg[1],
#'   last = mpg[length(mpg)],
#'   by = c("am", "gear")
#' )
#' @export
data_summary <- function(x, ...) {
  UseMethod("data_summary")
}


#' @export
data_summary.matrix <- function(x, ..., by = NULL, remove_na = FALSE) {
  data_summary(as.data.frame(x), ..., by = by, remove_na = remove_na)
}


#' @export
data_summary.default <- function(x, ...) {
  insight::format_error("`data_summary()` only works for (grouped) data frames and matrices.")
}


#' @rdname data_summary
#' @export
data_summary.data.frame <- function(x, ..., by = NULL, remove_na = FALSE) {
  dots <- eval(substitute(alist(...)))

  # do we have any expression at all?
  if (length(dots) == 0) {
    insight::format_error("No expressions for calculating summary statistics provided.")
  }

  if (is.null(by)) {
    # when we have no grouping, just compute a one-row summary
    summarise <- .process_datasummary_dots(dots, x)
    out <- data.frame(summarise)
    colnames(out) <- vapply(summarise, names, character(1))
  } else {
    # sanity check - is "by" a character string?
    if (!is.character(by)) {
      insight::format_error("Argument `by` must be a character string indicating the name of variables in the data.")
    }
    # is "by" in the data?
    if (!all(by %in% colnames(x))) {
      by_not_found <- by[!by %in% colnames(x)]
      insight::format_error(
        paste0(
          "Variable",
          ifelse(length(by_not_found) > 1, "s ", " "),
          text_concatenate(by_not_found, enclose = "\""),
          " not found in the data."
        ),
        .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?")
      )
    }
    # split data, add NA levels, if requested
    l <- lapply(x[by], function(i) {
      if (remove_na || !anyNA(i)) {
        i
      } else {
        addNA(i)
      }
    })
    split_data <- split(x, l, drop = TRUE)
    out <- lapply(split_data, function(s) {
      # no data for combination? Return NULL
      if (nrow(s) == 0) {
        return(NULL)
      }
      # summarize data
      summarise <- .process_datasummary_dots(dots, s)
      # coerce to data frame
      summarised_data <- data.frame(summarise)
      # bind grouping-variables and values
      summarised_data <- cbind(s[1, by], summarised_data)
      # make sure we have proper column names
      colnames(summarised_data) <- c(by, unlist(lapply(summarise, names)))
      summarised_data
    })
    out <- do.call(rbind, out)
  }
  # sort data
  out <- data_arrange(out, select = by)
  # data attributes
  class(out) <- c("dw_data_summary", "data.frame")
  rownames(out) <- NULL
  out
}


#' @export
data_summary.grouped_df <- function(x, ..., by = NULL, remove_na = FALSE) {
  # extract group variables
  grps <- attr(x, "groups", exact = TRUE)
  group_variables <- data_remove(grps, ".rows")
  # if "by" is not supplied, use group variables
  if (is.null(by)) {
    by <- colnames(group_variables)
  }
  # remove information specific to grouped df's
  attr(x, "groups") <- NULL
  class(x) <- "data.frame"
  data_summary(x, ..., by = by, remove_na = remove_na)
}


# helper -----------------------------------------------------------------------

.process_datasummary_dots <- function(dots, data) {
  out <- NULL
  if (length(dots)) {
    # we check for character vector of expressions, in which case
    # "dots" should be unnamed
    if (is.null(names(dots))) {
      # if we have multiple strings, concatenate them to a character vector
      # and put it into a list...
      if (length(dots) > 1) {
        if (all(vapply(dots, is.character, logical(1)))) {
          dots <- list(unlist(dots))
        } else {
          insight::format_error("You cannot mix string and literal representation of expressions.")
        }
      }
      # expression is given as character string, e.g.
      # a <- "mean_sepwid = mean(Sepal.Width)"
      # data_summary(iris, a, by = "Species")
      # or as character vector, e.g.
      # data_summary(iris, c("var_a = mean(Sepal.Width)", "var_b = sd(Sepal.Width)"))
      character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL)
      # do we have a character vector? Then we can proceed
      if (is.character(character_symbol)) {
        dots <- lapply(character_symbol, function(s) {
          # turn value from character vector into expression
          str2lang(.dynEval(s))
        })
        names(dots) <- vapply(dots, function(n) insight::safe_deparse(n[[2]]), character(1))
      }
    }

    out <- lapply(seq_along(dots), function(i) {
      new_variable <- .get_new_dots_variable(dots, i, data)
      if (inherits(new_variable, c("bayestestR_ci", "bayestestR_eti"))) {
        stats::setNames(new_variable, c("CI", "CI_low", "CI_high"))
      } else {
        stats::setNames(new_variable, names(dots)[i])
      }
    })
  }

  # check for correct length of output - must be a single value!
  # Exception: bayestestR::ci()
  wrong_length <- !sapply(out, inherits, what = c("bayestestR_ci", "bayestestR_eti")) & lengths(out) != 1 # nolint
  if (any(wrong_length)) {
    insight::format_error(
      paste0(
        "Each expression must return a single value. Following expression",
        ifelse(sum(wrong_length) > 1, "s", " "),
        " returned more than one value: ",
        text_concatenate(vapply(dots[wrong_length], insight::safe_deparse, character(1)), enclose = "\"")
      )
    )
  }

  out
}


# methods ----------------------------------------------------------------------

#' @export
print.dw_data_summary <- function(x, ...) {
  if (nrow(x) == 0) {
    cat("No matches found.\n")
  } else {
    if (all(c("CI", "CI_low", "CI_high") %in% colnames(x))) {
      ci <- insight::format_table(x[c("CI", "CI_low", "CI_high")], ...)
      x$CI <- x$CI_low <- x$CI_high <- NULL
      x <- cbind(x, ci)
    }
    cat(insight::export_table(x, missing = "<NA>", ...))
  }
}