File: data_xtabulate.R

package info (click to toggle)
r-cran-datawizard 1.0.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,300 kB
  • sloc: sh: 13; makefile: 2
file content (471 lines) | stat: -rw-r--r-- 15,898 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
# helper to compute crosstables --------------

.crosstable <- function(x,
                        by,
                        weights = NULL,
                        remove_na = FALSE,
                        proportions = NULL,
                        obj_name = NULL,
                        group_variable = NULL) {
  if (!is.null(proportions)) {
    proportions <- match.arg(proportions, c("row", "column", "full"))
  }
  # frequency table
  if (is.null(weights)) {
    # we have a `.default` and a `.data.frame` method for `data_tabulate()`.
    # since this is the default, `x` can be an object which cannot be used
    # with `table()`, that's why we add `tryCatch()` here. Below we give an
    # informative error message for non-supported objects.
    if (remove_na) {
      x_table <- tryCatch(table(x, by), error = function(e) NULL)
    } else {
      x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL)
    }
  } else if (remove_na) {
    # weighted frequency table, excluding NA
    x_table <- tryCatch(
      stats::xtabs(
        weights ~ x + by,
        data = data.frame(weights = weights, x = x, by = by),
        na.action = stats::na.omit,
        addNA = FALSE
      ),
      error = function(e) NULL
    )
  } else {
    # weighted frequency table, including NA
    x_table <- tryCatch(
      stats::xtabs(
        weights ~ x + by,
        data = data.frame(weights = weights, x = addNA(x), by = addNA(by)),
        na.action = stats::na.pass,
        addNA = TRUE
      ),
      error = function(e) NULL
    )
  }

  if (is.null(x_table)) {
    insight::format_warning(paste0("Can't compute cross tables for objects of class `", class(x)[1], "`."))
    return(NULL)
  }

  out <- as.data.frame(stats::ftable(x_table))
  colnames(out) <- c("Value", "by", "N")
  total_n <- sum(out$N, na.rm = TRUE)

  # we want to round N for weighted frequencies
  if (!is.null(weights)) {
    out$N <- round(out$N)
    total_n <- round(total_n)
  }

  out <- data_to_wide(out, values_from = "N", names_from = "by")

  # use variable name as column name
  if (!is.null(obj_name)) {
    colnames(out)[1] <- obj_name
  }

  # for grouped data frames, add info about grouping variables
  if (!is.null(group_variable)) {
    var_info <- toString(lapply(colnames(group_variable), function(i) {
      sprintf("%s (%s)", i, group_variable[[i]])
    }))
    out <- cbind(out[1], data.frame(Group = var_info, stringsAsFactors = FALSE), out[-1])
  }

  attr(out, "total_n") <- total_n
  attr(out, "weights") <- weights
  attr(out, "proportions") <- proportions
  attr(out, "varname") <- obj_name

  class(out) <- c("datawizard_crosstab", "data.frame")

  out
}


# methods ---------------------


#' @export
format.datawizard_crosstab <- function(x,
                                       format = "text",
                                       digits = 1,
                                       big_mark = NULL,
                                       include_total_row = TRUE,
                                       ...) {
  # convert to character manually, else, for large numbers,
  # format_table() returns scientific notation
  x <- as.data.frame(x)

  # find numeric columns, only for these we need row/column sums
  numeric_columns <- vapply(x, is.numeric, logical(1))

  # compute total N for rows and columns
  total_n <- attributes(x)$total_n
  total_column <- rowSums(x[numeric_columns], na.rm = TRUE)
  total_row <- c(colSums(x[numeric_columns], na.rm = TRUE), total_n)

  # proportions?
  props <- attributes(x)$proportions

  if (!is.null(props)) {
    # we copy x to tmp, because when we create strings with "sprintf()", the
    # variable is coerced to character, and in subsequent iterations of the loop,
    # mathemathical operations are not possible anymore
    tmp <- x
    if (identical(props, "row")) {
      for (i in seq_len(nrow(x))) {
        row_sum <- sum(x[i, numeric_columns], na.rm = TRUE)
        if (row_sum == 0) {
          row_sum_string <- "(0%)"
        } else {
          row_sum_string <- sprintf("(%.*f%%)", digits, 100 * x[i, numeric_columns] / row_sum)
        }
        tmp[i, numeric_columns] <- paste(format(x[i, numeric_columns]), format(row_sum_string, justify = "right"))
      }
    } else if (identical(props, "column")) {
      for (i in seq_len(ncol(x))[numeric_columns]) {
        col_sum <- sum(x[, i], na.rm = TRUE)
        if (col_sum == 0) {
          col_sum_string <- "(0%)"
        } else {
          col_sum_string <- sprintf("(%.*f%%)", digits, 100 * x[, i] / col_sum)
        }
        tmp[, i] <- paste(format(x[, i]), format(col_sum_string, justify = "right"))
      }
    } else if (identical(props, "full")) {
      for (i in seq_len(ncol(x))[numeric_columns]) {
        tmp[, i] <- paste(
          format(x[, i]),
          format(sprintf("(%.*f%%)", digits, 100 * x[, i] / total_n), justify = "right")
        )
      }
    }
    # copy back final result
    x <- tmp
  }

  x[] <- lapply(x, as.character)

  # format data frame
  ftab <- insight::format_table(x, ...)
  # replace empty cells with NA
  ftab[] <- lapply(ftab, function(i) {
    i[i == ""] <- ifelse(identical(format, "text"), "<NA>", "(NA)") # nolint
    i
  })
  # Remove ".00" from numbers
  ftab$Total <- gsub("\\.00$", "", as.character(total_column))

  # add final total row to each sub-table. For multiple, collapsed table
  # (i.e. when length of `by` > 1), we don't want multiple total rows in the
  # table, so we would set include_total_row = FALSE for objects of class
  # `datawizard_crosstabs` (note plural s!)
  if (include_total_row) {
    # for text format, insert "empty row" before last total row
    if (identical(format, "text") || identical(format, "markdown")) {
      empty_row <- as.data.frame(t(data.frame(
        rep("", ncol(ftab)),
        c("Total", as.character(total_row)),
        stringsAsFactors = FALSE
      )))
    } else {
      empty_row <- as.data.frame(t(data.frame(
        c("Total", as.character(total_row)),
        stringsAsFactors = FALSE
      )))
    }
    colnames(empty_row) <- colnames(ftab)
    ftab <- rbind(ftab, empty_row)
    ftab[nrow(ftab), ] <- gsub("\\.00$", "", ftab[nrow(ftab), ])
  }

  # insert big marks?
  ftab$Total <- .add_commas_in_numbers(ftab$Total, big_mark)
  ftab[nrow(ftab), ] <- .add_commas_in_numbers(ftab[nrow(ftab), ], big_mark)

  # also format NA column name
  colnames(ftab)[colnames(ftab) == "NA"] <- ifelse(identical(format, "text"), "<NA>", "(NA)")

  ftab
}


# print, datawizard_crosstab ---------------------


#' @export
print.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
  .print_text_table(x, big_mark, format = "text", ...)
  invisible(x)
}


#' @export
print_md.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
  .print_text_table(x, big_mark, format = "markdown", ...)
}


#' @export
print_html.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
  .print_text_table(x, big_mark, format = "html", ...)
}


.print_text_table <- function(x, big_mark = NULL, format = "text", ...) {
  # grouped data? if yes, add information on grouping factor
  if (is.null(x[["Group"]])) {
    caption <- NULL
  } else {
    caption <- paste0("Grouped by ", x[["Group"]][1])
    x$Group <- NULL
  }

  # prepare table arguments
  fun_args <- list(
    format(x, big_mark = big_mark, format = format, ...),
    caption = caption,
    format = format
  )
  if (format != "html") {
    fun_args$cross <- "+"
    fun_args$empty_line <- "-"
  }
  if (format == "text") {
    fun_args$missing <- "<NA>"
  } else {
    fun_args$missing <- "(NA)"
  }
  out <- do.call(insight::export_table, c(fun_args, list(...)))

  # print table
  if (identical(format, "text")) {
    cat(out)
  } else {
    out
  }
}


# print, datawizard_crosstabs ---------------------


#' @export
print.datawizard_crosstabs <- function(x, big_mark = NULL, ...) {
  .print_text_tables(x, big_mark, format = "text", ...)
  invisible(x)
}


#' @export
print_md.datawizard_crosstabs <- function(x, big_mark = NULL, ...) {
  .print_text_tables(x, big_mark, format = "markdown", ...)
}


#' @export
print_html.datawizard_crosstabs <- function(x, big_mark = NULL, ...) {
  .print_text_tables(x, big_mark, format = "html", ...)
}


.print_text_tables <- function(x, big_mark = NULL, format = "text", ...) {
  if (length(x) == 1) {
    .print_text_table(x[[1]], big_mark = big_mark, format = format, ...)
  } else {
    x <- lapply(x, function(i) {
      # grouped data? if yes, add information on grouping factor
      if (!is.null(i[["Group"]])) {
        i$groups <- paste0("Grouped by ", i[["Group"]][1])
        i$Group <- NULL
      }
      # if we don't have the gt-grouping variable "groups" yet, we use it now
      # for grouping. Else, we use a new column named "Variable", to avoid
      # overwriting the groups-variable from grouped data frames
      if (is.null(i$groups) && identical(format, "html")) {
        grp_variable <- "groups"
      } else {
        grp_variable <- "Variable"
      }
      # first variable differs for each data frame, so we harmonize it here
      i[[grp_variable]] <- colnames(i)[1]
      colnames(i)[1] <- "Value"
      # move column to first position
      i <- data_relocate(i, select = grp_variable, before = 1)
      # format data frame
      format(i, format = format, big_mark = big_mark, include_total_row = FALSE, ...)
    })
    # now bind, but we need to check for equal number of columns
    if (all(lengths(x) == max(length(x)))) {
      out <- do.call(rbind, x)
    } else {
      # if not all tables have identical columns, we can use "data_merge()",
      # which safely row-binds all data frames. However, the column order can be
      # messed up, so we save column order here and restore it later
      col_order <- colnames(x[[which.max(lengths(x))]])
      out <- data_merge(x, join = "bind")[col_order]
    }

    # split tables for grouped data frames
    if (!is.null(out$groups)) {
      out <- split(out, out$groups)
      out <- lapply(out, function(subtable) {
        # for text and markdown, if we split tables, we remove the "groups"
        # variable. we need to keep it for HTML tables.
        if (!identical(format, "html")) {
          attr(subtable, "table_caption") <- c(unique(subtable$groups), "blue")
          subtable$groups <- NULL
        }
        # remove duplicated names
        for (grpvars in c("Variable", "Group")) {
          if (!is.null(subtable[[grpvars]])) {
            subtable[[grpvars]][duplicated(subtable[[grpvars]])] <- ""
          }
        }
        subtable
      })
      # no splitting of grouped data frames into list for HTML format,
      # because splitting is done by the `by` argument later
      if (identical(format, "html")) {
        out <- do.call(rbind, out)
      }
    }

    # prepare table arguments
    fun_args <- list(
      out,
      format = format,
      by = "groups"
    )
    if (format != "html") {
      fun_args$cross <- "+"
      fun_args$empty_line <- "-"
    }
    if (format == "text") {
      fun_args$missing <- "<NA>"
    } else {
      fun_args$missing <- "(NA)"
    }
    out <- do.call(insight::export_table, c(fun_args, list(...)))

    # print table
    if (identical(format, "text")) {
      cat(out)
    } else {
      out
    }
  }
}


# helper ---------------------


.validate_by <- function(by, x) {
  if (!is.null(by)) {
    if (is.character(by)) {
      # If "by" is a character string, must be of length 1
      if (length(by) > 1) {
        insight::format_error(
          "If `by` is a string indicating a variable name, `by` must be of length 1.",
          "You may use `data_group()` to group by multiple variables, then call `data_tabulate()`."
        )
      }
      # if "by" is a character, "x" must be a data frame
      if (!is.data.frame(x)) {
        insight::format_error("If `by` is a string indicating a variable name, `x` must be a data frame.")
      }
      # is "by" a column in "x"?
      if (!by %in% colnames(x)) {
        insight::format_error(sprintf(
          "The variable specified in `by` was not found in `x`. %s",
          .misspelled_string(names(x), by, "Possibly misspelled?")
        ))
      }
      by <- x[[by]]
    }
    # is "by" of same length as "x"?
    if (is.data.frame(x) && length(by) != nrow(x)) {
      insight::format_error("Length of `by` must be equal to number of rows in `x`.") # nolint
    }
    if (!is.data.frame(x) && length(by) != length(x)) {
      insight::format_error("Length of `by` must be equal to length of `x`.") # nolint
    }
    if (!is.factor(by)) {
      # coerce "by" to factor, including labels
      by <- to_factor(by, labels_to_levels = TRUE, verbose = FALSE)
    }
  }

  by
}


.validate_table_weights <- function(weights, x, weights_expression = NULL) {
  # exception: for vectors, if weighting variable not found, "weights" is NULL.
  # to check this, we further need to check whether a weights expression was
  # provided, e.g. "weights = iris$not_found" - all this is only relevant when
  # weights is NULL
  if (is.null(weights)) {
    # possibly misspelled weights-variables for default-method ----------------
    # -------------------------------------------------------------------------

    # do we have any value for weights_expression?
    if (!is.null(weights_expression) &&
      # due to deparse() and substitute, NULL becomes "NULL" - we need to check for this
      !identical(weights_expression, "NULL") &&
      # we should only run into this problem, when a variable from a data frame
      # is used in the data_tabulate() method for vectors - thus, we need to check
      # whether the weights_expression contains a "$" - `iris$not_found` is "NULL"
      # we need this check, because the default-method of data_tabulate() is called
      # from the data.frame method, where `weights = weights`, and then,
      # deparse(substitute(weights)) is "weights" (not "NULL" or "iris$not_found"),
      # leading to an error when actually all is OK (if "weights" is NULL)
      # Example:
      #> efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
      # Here, efc$wweight is NULL
      #> data_tabulate(efc$c172code, weights = efc$wweight)
      # Here, wweight errors anyway, because object "wweight" is not found
      #> data_tabulate(efc$c172code, weights = wweight)
      grepl("$", weights_expression, fixed = TRUE)) {
      insight::format_error("The variable specified in `weights` was not found. Possibly misspelled?")
    }
  } else {
    # possibly misspecified weights-variables for data.frame-method -----------
    # -------------------------------------------------------------------------

    if (is.character(weights)) {
      # If "weights" is a character string, must be of length 1
      if (length(weights) > 1) {
        insight::format_error(
          "If `weights` is a string indicating a variable name, `weights` must be of length 1."
        )
      }
      # if "weights" is a character, "x" must be a data frame
      if (!is.data.frame(x)) {
        insight::format_error("If `weights` is a string indicating a variable name, `x` must be a data frame.") # nolint
      }
      # is "by" a column in "x"?
      if (!weights %in% colnames(x)) {
        insight::format_error(sprintf(
          "The variable specified in `weights` was not found in `x`. %s",
          .misspelled_string(names(x), weights, "Possibly misspelled?")
        ))
      }
      weights <- x[[weights]]
    }
    # is "by" of same length as "x"?
    if (is.data.frame(x) && length(weights) != nrow(x)) {
      insight::format_error("Length of `weights` must be equal to number of rows in `x`.") # nolint
    }
    if (!is.data.frame(x) && length(weights) != length(x)) {
      insight::format_error("Length of `weights` must be equal to length of `x`.") # nolint
    }
  }

  weights
}