1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
|
#' @title Row means or sums (optionally with minimum amount of valid values)
#' @name row_means
#' @description This function is similar to the SPSS `MEAN.n` or `SUM.n`
#' function and computes row means or row sums from a data frame or matrix if at
#' least `min_valid` values of a row are valid (and not `NA`).
#'
#' @param data A data frame with at least two columns, where row means or row
#' sums are applied.
#' @param min_valid Optional, a numeric value of length 1. May either be
#' - a numeric value that indicates the amount of valid values per row to
#' calculate the row mean or row sum;
#' - or a value between `0` and `1`, indicating a proportion of valid values per
#' row to calculate the row mean or row sum (see 'Details').
#' - `NULL` (default), in which all cases are considered.
#'
#' If a row's sum of valid values is less than `min_valid`, `NA` will be returned.
#' @param digits Numeric value indicating the number of decimal places to be
#' used for rounding mean values. Negative values are allowed (see 'Details').
#' By default, `digits = NULL` and no rounding is used.
#' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values
#' before calculating row means or row sums. Only applies if `min_valid` is not
#' specified.
#' @param verbose Toggle warnings.
#' @inheritParams extract_column_names
#'
#' @return A vector with row means (for `row_means()`) or row sums (for
#' `row_sums()`) for those rows with at least `n` valid values.
#'
#' @details Rounding to a negative number of `digits` means rounding to a power
#' of ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest
#' hundred. For `min_valid`, if not `NULL`, `min_valid` must be a numeric value
#' from `0` to `ncol(data)`. If a row in the data frame has at least `min_valid`
#' non-missing values, the row mean or row sum is returned. If `min_valid` is a
#' non-integer value from 0 to 1, `min_valid` is considered to indicate the
#' proportion of required non-missing values per row. E.g., if
#' `min_valid = 0.75`, a row must have at least `ncol(data) * min_valid`
#' non-missing values for the row mean or row sum to be calculated. See
#' 'Examples'.
#'
#' @examples
#' dat <- data.frame(
#' c1 = c(1, 2, NA, 4),
#' c2 = c(NA, 2, NA, 5),
#' c3 = c(NA, 4, NA, NA),
#' c4 = c(2, 3, 7, 8)
#' )
#'
#' # default, all means are shown, if no NA values are present
#' row_means(dat)
#'
#' # remove all NA before computing row means
#' row_means(dat, remove_na = TRUE)
#'
#' # needs at least 4 non-missing values per row
#' row_means(dat, min_valid = 4) # 1 valid return value
#' row_sums(dat, min_valid = 4) # 1 valid return value
#'
#' # needs at least 3 non-missing values per row
#' row_means(dat, min_valid = 3) # 2 valid return values
#'
#' # needs at least 2 non-missing values per row
#' row_means(dat, min_valid = 2)
#'
#' # needs at least 1 non-missing value per row, for two selected variables
#' row_means(dat, select = c("c1", "c3"), min_valid = 1)
#'
#' # needs at least 50% of non-missing values per row
#' row_means(dat, min_valid = 0.5) # 3 valid return values
#' row_sums(dat, min_valid = 0.5)
#'
#' # needs at least 75% of non-missing values per row
#' row_means(dat, min_valid = 0.75) # 2 valid return values
#'
#' @export
row_means <- function(data,
select = NULL,
exclude = NULL,
min_valid = NULL,
digits = NULL,
ignore_case = FALSE,
regex = FALSE,
remove_na = FALSE,
verbose = TRUE) {
# evaluate arguments
select <- .select_nse(select,
data,
exclude,
ignore_case = ignore_case,
regex = regex,
verbose = verbose
)
# prepare data, sanity checks
data <- .prepare_row_data(data, select, min_valid, verbose)
# calculate row means
.row_sums_or_means(data, min_valid, digits, remove_na, fun = "mean")
}
#' @rdname row_means
#' @export
row_sums <- function(data,
select = NULL,
exclude = NULL,
min_valid = NULL,
digits = NULL,
ignore_case = FALSE,
regex = FALSE,
remove_na = FALSE,
verbose = TRUE) {
# evaluate arguments
select <- .select_nse(select,
data,
exclude,
ignore_case = ignore_case,
regex = regex,
verbose = verbose
)
# prepare data, sanity checks
data <- .prepare_row_data(data, select, min_valid, verbose)
# calculate row sums
.row_sums_or_means(data, min_valid, digits, remove_na, fun = "sum")
}
# helper ------------------------
# calculate row means or sums
.row_sums_or_means <- function(data, min_valid, digits, remove_na, fun) {
if (is.null(min_valid)) {
# calculate row means or sums for complete data
out <- switch(fun,
mean = rowMeans(data, na.rm = remove_na),
rowSums(data, na.rm = remove_na)
)
} else {
# is 'min_valid' indicating a proportion?
decimals <- min_valid %% 1
if (decimals != 0) {
min_valid <- round(ncol(data) * decimals)
}
# min_valid may not be larger as df's amount of columns
if (ncol(data) < min_valid) {
insight::format_error("`min_valid` must be smaller or equal to number of columns in data frame.")
}
# row means or sums
to_na <- rowSums(is.na(data)) > ncol(data) - min_valid
out <- switch(fun,
mean = rowMeans(data, na.rm = TRUE),
rowSums(data, na.rm = TRUE)
)
out[to_na] <- NA
}
# round, if requested
if (!is.null(digits) && !all(is.na(digits))) {
out <- round(out, digits = digits)
}
out
}
# check that data is in shape for row means or row sums
.prepare_row_data <- function(data, select, min_valid, verbose) {
if (is.null(select) || length(select) == 0) {
insight::format_error("No columns selected.")
}
data <- .coerce_to_dataframe(data[select])
# n must be a numeric, non-missing value
if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) {
insight::format_error("`min_valid` must be a numeric value of length 1.")
}
# make sure we only have numeric values
numeric_columns <- vapply(data, is.numeric, TRUE)
if (!all(numeric_columns)) {
if (verbose) {
insight::format_alert("Only numeric columns are considered for calculation.")
}
data <- data[numeric_columns]
}
# check if we have a data framme with at least two columns
if (ncol(data) < 2) {
insight::format_error("`data` must be a data frame with at least two numeric columns.")
}
data
}
|