File: row_means.R

package info (click to toggle)
r-cran-datawizard 1.0.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,300 kB
  • sloc: sh: 13; makefile: 2
file content (197 lines) | stat: -rw-r--r-- 6,699 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#' @title Row means or sums (optionally with minimum amount of valid values)
#' @name row_means
#' @description This function is similar to the SPSS `MEAN.n` or `SUM.n`
#' function and computes row means or row sums from a data frame or matrix if at
#' least `min_valid` values of a row are valid (and not `NA`).
#'
#' @param data A data frame with at least two columns, where row means or row
#' sums are applied.
#' @param min_valid Optional, a numeric value of length 1. May either be
#' - a numeric value that indicates the amount of valid values per row to
#'   calculate the row mean or row sum;
#' - or a value between `0` and `1`, indicating a proportion of valid values per
#'   row to calculate the row mean or row sum (see 'Details').
#' - `NULL` (default), in which all cases are considered.
#'
#' If a row's sum of valid values is less than `min_valid`, `NA` will be returned.
#' @param digits Numeric value indicating the number of decimal places to be
#' used for rounding mean values. Negative values are allowed (see 'Details').
#' By default, `digits = NULL` and no rounding is used.
#' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values
#' before calculating row means or row sums. Only applies if `min_valid` is not
#' specified.
#' @param verbose Toggle warnings.
#' @inheritParams extract_column_names
#'
#' @return A vector with row means (for `row_means()`) or row sums (for
#' `row_sums()`) for those rows with at least `n` valid values.
#'
#' @details Rounding to a negative number of `digits` means rounding to a power
#' of ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest
#' hundred. For `min_valid`, if not `NULL`, `min_valid` must be a numeric value
#' from `0` to `ncol(data)`. If a row in the data frame has at least `min_valid`
#' non-missing values, the row mean or row sum is returned. If `min_valid` is a
#' non-integer value from 0 to 1, `min_valid` is considered to indicate the
#' proportion of required non-missing values per row. E.g., if
#' `min_valid = 0.75`, a row must have at least `ncol(data) * min_valid`
#' non-missing values for the row mean or row sum to be calculated. See
#' 'Examples'.
#'
#' @examples
#' dat <- data.frame(
#'   c1 = c(1, 2, NA, 4),
#'   c2 = c(NA, 2, NA, 5),
#'   c3 = c(NA, 4, NA, NA),
#'   c4 = c(2, 3, 7, 8)
#' )
#'
#' # default, all means are shown, if no NA values are present
#' row_means(dat)
#'
#' # remove all NA before computing row means
#' row_means(dat, remove_na = TRUE)
#'
#' # needs at least 4 non-missing values per row
#' row_means(dat, min_valid = 4) # 1 valid return value
#' row_sums(dat, min_valid = 4) # 1 valid return value
#'
#' # needs at least 3 non-missing values per row
#' row_means(dat, min_valid = 3) # 2 valid return values
#'
#' # needs at least 2 non-missing values per row
#' row_means(dat, min_valid = 2)
#'
#' # needs at least 1 non-missing value per row, for two selected variables
#' row_means(dat, select = c("c1", "c3"), min_valid = 1)
#'
#' # needs at least 50% of non-missing values per row
#' row_means(dat, min_valid = 0.5) # 3 valid return values
#' row_sums(dat, min_valid = 0.5)
#'
#' # needs at least 75% of non-missing values per row
#' row_means(dat, min_valid = 0.75) # 2 valid return values
#'
#' @export
row_means <- function(data,
                      select = NULL,
                      exclude = NULL,
                      min_valid = NULL,
                      digits = NULL,
                      ignore_case = FALSE,
                      regex = FALSE,
                      remove_na = FALSE,
                      verbose = TRUE) {
  # evaluate arguments
  select <- .select_nse(select,
    data,
    exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )

  # prepare data, sanity checks
  data <- .prepare_row_data(data, select, min_valid, verbose)

  # calculate row means
  .row_sums_or_means(data, min_valid, digits, remove_na, fun = "mean")
}


#' @rdname row_means
#' @export
row_sums <- function(data,
                     select = NULL,
                     exclude = NULL,
                     min_valid = NULL,
                     digits = NULL,
                     ignore_case = FALSE,
                     regex = FALSE,
                     remove_na = FALSE,
                     verbose = TRUE) {
  # evaluate arguments
  select <- .select_nse(select,
    data,
    exclude,
    ignore_case = ignore_case,
    regex = regex,
    verbose = verbose
  )

  # prepare data, sanity checks
  data <- .prepare_row_data(data, select, min_valid, verbose)

  # calculate row sums
  .row_sums_or_means(data, min_valid, digits, remove_na, fun = "sum")
}


# helper ------------------------

# calculate row means or sums
.row_sums_or_means <- function(data, min_valid, digits, remove_na, fun) {
  if (is.null(min_valid)) {
    # calculate row means or sums for complete data
    out <- switch(fun,
      mean = rowMeans(data, na.rm = remove_na),
      rowSums(data, na.rm = remove_na)
    )
  } else {
    # is 'min_valid' indicating a proportion?
    decimals <- min_valid %% 1
    if (decimals != 0) {
      min_valid <- round(ncol(data) * decimals)
    }

    # min_valid may not be larger as df's amount of columns
    if (ncol(data) < min_valid) {
      insight::format_error("`min_valid` must be smaller or equal to number of columns in data frame.")
    }

    # row means or sums
    to_na <- rowSums(is.na(data)) > ncol(data) - min_valid
    out <- switch(fun,
      mean = rowMeans(data, na.rm = TRUE),
      rowSums(data, na.rm = TRUE)
    )
    out[to_na] <- NA
  }

  # round, if requested
  if (!is.null(digits) && !all(is.na(digits))) {
    out <- round(out, digits = digits)
  }

  out
}


# check that data is in shape for row means or row sums
.prepare_row_data <- function(data, select, min_valid, verbose) {
  if (is.null(select) || length(select) == 0) {
    insight::format_error("No columns selected.")
  }

  data <- .coerce_to_dataframe(data[select])

  # n must be a numeric, non-missing value
  if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) {
    insight::format_error("`min_valid` must be a numeric value of length 1.")
  }

  # make sure we only have numeric values
  numeric_columns <- vapply(data, is.numeric, TRUE)
  if (!all(numeric_columns)) {
    if (verbose) {
      insight::format_alert("Only numeric columns are considered for calculation.")
    }
    data <- data[numeric_columns]
  }

  # check if we have a data framme with at least two columns
  if (ncol(data) < 2) {
    insight::format_error("`data` must be a data frame with at least two numeric columns.")
  }

  data
}