File: data_modify.R

package info (click to toggle)
r-cran-datawizard 1.0.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,300 kB
  • sloc: sh: 13; makefile: 2
file content (406 lines) | stat: -rw-r--r-- 15,449 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
#' Create new variables in a data frame
#'
#' Create new variables or modify existing variables in a data frame. Unlike `base::transform()`, `data_modify()`
#' can be used on grouped data frames, and newly created variables can be directly
#' used.
#'
#' @param data A data frame
#' @param ... One or more expressions that define the new variable name and the
#' values or recoding of those new variables. These expressions can be one of:
#' - A sequence of named, literal expressions, where the left-hand side refers
#'   to the name of the new variable, while the right-hand side represent the
#'   values of the new variable. Example: `Sepal.Width = center(Sepal.Width)`.
#' - A sequence of string values, representing expressions.
#' - A variable that contains a string representation of the expression. Example:
#'   ```r
#'   a <- "2 * Sepal.Width"
#'   data_modify(iris, a)
#'   ```
#' - A character vector of expressions. Example:
#'   `c("SW_double = 2 * Sepal.Width", "SW_fraction = SW_double / 10")`. This
#'   type of expression cannot be mixed with other expressions, i.e. if a
#'   character vector is provided, you may not add further elements to `...`.
#' - Using `NULL` as right-hand side removes a variable from the data frame.
#'   Example: `Petal.Width = NULL`.
#' - For data frames (including grouped ones), the function `n()` can be used to count the
#'   number of observations and thereby, for instance, create index values by
#'   using `id = 1:n()` or `id = 3:(n()+2)` and similar.
#'
#' Note that newly created variables can be used in subsequent expressions,
#' including `.at` or `.if`. See also 'Examples'.
#'
#' @param .at A character vector of variable names that should be modified. This
#' argument is used in combination with the `.modify` argument. Note that only one
#' of `.at` or `.if` can be provided, but not both at the same time. Newly created
#' variables in `...` can also be selected, see 'Examples'.
#' @param .if A function that returns `TRUE` for columns in the data frame where
#' `.if` applies. This argument is used in combination with the `.modify` argument.
#' Note that only one of `.at` or `.if` can be provided, but not both at the same
#' time. Newly created variables in `...` can also be selected, see 'Examples'.
#' @param .modify A function that modifies the variables defined in `.at` or `.if`.
#' This argument is used in combination with either the `.at` or the `.if` argument.
#' Note that the modified variable (i.e. the result from `.modify`) must be either
#' of length 1 or of same length as the input variable.
#'
#' @note `data_modify()` can also be used inside functions. However, it is
#' recommended to pass the recode-expression as character vector or list of
#' characters.
#'
#' @examples
#' data(efc)
#' new_efc <- data_modify(
#'   efc,
#'   c12hour_c = center(c12hour),
#'   c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
#'   c12hour_z2 = standardize(c12hour)
#' )
#' head(new_efc)
#'
#' # using strings instead of literal expressions
#' new_efc <- data_modify(
#'   efc,
#'   "c12hour_c = center(c12hour)",
#'   "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)",
#'   "c12hour_z2 = standardize(c12hour)"
#' )
#' head(new_efc)
#'
#' # using character strings, provided as variable
#' stand <- "c12hour_c / sd(c12hour, na.rm = TRUE)"
#' new_efc <- data_modify(
#'   efc,
#'   c12hour_c = center(c12hour),
#'   c12hour_z = stand
#' )
#' head(new_efc)
#'
#' # providing expressions as character vector
#' new_exp <- c(
#'   "c12hour_c = center(c12hour)",
#'   "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)"
#' )
#' new_efc <- data_modify(efc, new_exp)
#' head(new_efc)
#'
#' # attributes - in this case, value and variable labels - are preserved
#' str(new_efc)
#'
#' # overwrite existing variable, remove old variable
#' out <- data_modify(iris, Petal.Length = 1 / Sepal.Length, Sepal.Length = NULL)
#' head(out)
#'
#' # works on grouped data
#' grouped_efc <- data_group(efc, "c172code")
#' new_efc <- data_modify(
#'   grouped_efc,
#'   c12hour_c = center(c12hour),
#'   c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
#'   c12hour_z2 = standardize(c12hour),
#'   id = 1:n()
#' )
#' head(new_efc)
#'
#' # works from inside functions
#' foo <- function(data, z) {
#'   head(data_modify(data, z))
#' }
#' foo(iris, "var_a = Sepal.Width / 10")
#'
#' new_exp <- c("SW_double = 2 * Sepal.Width", "SW_fraction = SW_double / 10")
#' foo(iris, new_exp)
#'
#' # modify at specific positions or if condition is met
#' d <- iris[1:5, ]
#' data_modify(d, .at = "Species", .modify = as.numeric)
#' data_modify(d, .if = is.factor, .modify = as.numeric)
#'
#' # can be combined with dots
#' data_modify(d, new_length = Petal.Length * 2, .at = "Species", .modify = as.numeric)
#'
#' # new variables used in `.at` or `.if`
#' data_modify(
#'   d,
#'   new_length = Petal.Length * 2,
#'   .at = c("Petal.Length", "new_length"),
#'   .modify = round
#' )
#'
#' # combine "extract_column_names()" and ".at" argument
#' out <- data_modify(
#'   d,
#'   .at = extract_column_names(d, select = starts_with("Sepal")),
#'   .modify = as.factor
#' )
#' # "Sepal.Length" and "Sepal.Width" are now factors
#' str(out)
#'
#' @export
data_modify <- function(data, ...) {
  UseMethod("data_modify")
}

#' @export
data_modify.default <- function(data, ...) {
  insight::format_error("`data` must be a data frame.")
}

#' @rdname data_modify
#' @export
data_modify.data.frame <- function(data, ..., .if = NULL, .at = NULL, .modify = NULL) {
  dots <- eval(substitute(alist(...)))

  # error for data frames with no rows...
  if (nrow(data) == 0) {
    insight::format_error("`data` is an empty data frame. `data_modify()` only works for data frames with at least one row.") # nolint
  }

  # check if we have dots, or only at/modify ----

  if (length(dots)) {
    # we check for character vector of expressions, in which case
    # "dots" should be unnamed
    if (is.null(names(dots))) {
      # if we have multiple strings, concatenate them to a character vector
      # and put it into a list...
      if (length(dots) > 1) {
        if (all(vapply(dots, is.character, logical(1)))) {
          dots <- list(unlist(dots))
        } else {
          insight::format_error("You cannot mix string and literal representation of expressions.")
        }
      }
      # expression is given as character string, e.g.
      # a <- "double_SepWidth = 2 * Sepal.Width"
      # data_modify(iris, a)
      # or as character vector, e.g.
      # data_modify(iris, c("var_a = Sepal.Width / 10", "var_b = Sepal.Width * 10"))
      character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL)
      # do we have a character vector? Then we can proceed
      if (is.character(character_symbol)) {
        dots <- lapply(character_symbol, function(s) {
          # turn value from character vector into expression
          str2lang(.dynEval(s))
        })
        names(dots) <- vapply(dots, function(n) insight::safe_deparse(n[[2]]), character(1))
      }
    }

    for (i in seq_along(dots)) {
      # create new variable
      new_variable <- .get_new_dots_variable(dots, i, data)
      # give informative error when new variable doesn't match number of rows
      if (!is.null(new_variable) && length(new_variable) != nrow(data) && (nrow(data) %% length(new_variable)) != 0) {
        insight::format_error(
          "New variable has not the same length as the other variables in the data frame and cannot be recycled."
        )
      }
      data[[names(dots)[i]]] <- new_variable
    }
  }

  # check if we have at/modify ----
  data <- .modify_at(data, .at, .if, .modify)

  data
}

#' @export
data_modify.grouped_df <- function(data, ..., .if = NULL, .at = NULL, .modify = NULL) {
  # we need to evaluate dots here, and pass them with "do.call" to
  # the data.frame method later...
  dots <- match.call(expand.dots = FALSE)[["..."]]

  # error for data frames with no rows...
  if (nrow(data) == 0) {
    insight::format_error("`data` is an empty data frame. `data_modify()` only works for data frames with at least one row.") # nolint
  }

  grps <- attr(data, "groups", exact = TRUE)
  grps <- grps[[".rows"]]
  attr_data <- attributes(data)

  # remove conflicting class attributes
  class_attr <- class(data)
  data <- as.data.frame(data)

  # we check for character vector of expressions, in which case
  # "dots" should be unnamed
  if (is.null(names(dots))) {
    # if we have multiple strings, concatenate them to a character vector
    # and put it into a list...
    if (length(dots) > 1) {
      if (all(vapply(dots, is.character, logical(1)))) {
        dots <- list(unlist(dots))
      } else {
        insight::format_error("You cannot mix string and literal representation of expressions.")
      }
    }
    # expression is given as character string, e.g.
    # a <- "double_SepWidth = 2 * Sepal.Width"
    # data_modify(iris, a)
    # or as character vector, e.g.
    # data_modify(iris, c("var_a = Sepal.Width / 10", "var_b = Sepal.Width * 10"))
    character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL)
    # do we have a character vector? Then we can proceed
    if (is.character(character_symbol)) {
      dots <- lapply(character_symbol, function(s) {
        # turn value from character vector into expression
        str2lang(.dynEval(s))
      })
      names(dots) <- vapply(dots, function(n) insight::safe_deparse(n[[2]]), character(1))
    }
  }

  # create new variables as dummys, do for-loop works
  for (i in names(dots)) {
    # don't overwrite / fill existing variables with NA,
    # e.g. if we have "data_modify(iris, Sepal.Length = normalize(Sepal.Length))"
    # normalize() won't work when we fill with NA
    if (!i %in% colnames(data)) {
      data[[i]] <- NA
    }
  }

  # create new variables per group
  for (rows in grps) {
    data[rows, ] <- data_modify.data.frame(data[rows, ], ...)
  }

  # check if we have at/modify ----
  data <- .modify_at(data, .at, .if, .modify)

  # set back attributes and class
  data <- .replace_attrs(data, attr_data)
  class(data) <- class_attr
  data
}


# helper -------------

.modify_at <- function(data, .at, .if, .modify) {
  # check if ".at" or ".if" is defined, but not ".modify"
  if (is.null(.modify)) {
    if (!is.null(.at) || !is.null(.if)) {
      insight::format_error("You need to specify `.modify` when using `.at` or `.if`.")
    }
    return(data)
  }
  # make sure "modify" is a function
  if (!is.function(.modify)) {
    insight::format_error("`.modify` must be a function.")
  }
  # make sure either .at or .if is defined, not both
  if (!is.null(.at) && !is.null(.if)) {
    insight::format_error("You cannot use both `.at` and `.if` at the same time.")
  }
  # make sure at least one of .at or .if is defined
  if (is.null(.at) && is.null(.if)) {
    insight::format_error("You need to specify either `.at` or `.if`.")
  }

  column_names <- colnames(data)

  # if we have ".if" defined, specify ".at"
  if (!is.null(.if)) {
    .at <- column_names[vapply(data, .if, logical(1))]
  }
  # check for valid defined column names
  if (!all(.at %in% column_names)) {
    not_found <- .at[!.at %in% column_names]
    insight::format_error(
      paste0(
        "Variable",
        ifelse(length(not_found) > 1, "s ", " "),
        text_concatenate(not_found, enclose = "\""),
        ifelse(length(not_found) > 1, " were", " was"),
        " not found in the dataset."
      ),
      .misspelled_string(column_names, not_found, "Possibly misspelled or not yet defined?")
    )
  }
  for (i in .at) {
    result <- tryCatch(.modify(data[[i]]), warning = function(e) e, error = function(e) e)
    if (inherits(result, c("error", "warning"))) {
      insight::format_error(
        paste0("Error in modifying variable \"", i, "\": ", result$message),
        "Please check if you correctly specified the `.modify` function."
      )
    } else {
      data[[i]] <- result
    }
  }

  data
}

.get_new_dots_variable <- function(dots, i, data) {
  # iterate expressions for new variables
  symbol <- dots[[i]]

  # expression is given as character string in a variable, but named, e.g.
  # a <- "2 * Sepal.Width"
  # data_modify(iris, double_SepWidth = a)
  # we reconstruct the symbol as if it were provided as literal expression.
  # However, we need to check that we don't have a character vector,
  # like: data_modify(iris, new_var = "a")
  # this one should be recycled instead.
  if (!is.character(symbol)) {
    eval_symbol <- .dynEval(symbol, ifnotfound = NULL)
    if (is.character(eval_symbol)) {
      symbol <- try(str2lang(paste0(names(dots)[i], " = ", eval_symbol)), silent = TRUE)
      # we may have the edge-case of having a function that returns a character
      # vector, like "new_var = sample(letters[1:3])". In this case, "eval_symbol"
      # is of type character, but no symbol, thus str2lang() above creates a
      # wrong pattern. We then take "eval_symbol" as character input.
      if (inherits(symbol, "try-error")) {
        symbol <- str2lang(paste0(
          names(dots)[i],
          " = c(", paste0("\"", eval_symbol, "\"", collapse = ","), ")"
        ))
      }
    }
  }

  # finally, we can evaluate expression and get values for new variables
  symbol_string <- insight::safe_deparse(symbol)
  if (!is.null(symbol_string) && all(symbol_string == "n()")) {
    # "special" functions - using "n()" just returns number of rows
    new_variable <- nrow(data)
  } else if (!is.null(symbol_string) && length(symbol_string) == 1 && grepl("\\bn\\(\\)", symbol_string)) {
    # "special" functions, like "1:n()" or similar - but not "1:fun()"
    symbol_string <- str2lang(gsub("n()", "nrow(data)", symbol_string, fixed = TRUE))
    new_variable <- try(with(data, eval(symbol_string)), silent = TRUE)
  } else {
    # default evaluation of expression
    new_variable <- try(with(data, eval(symbol)), silent = TRUE)
  }

  # successful, or any errors, like misspelled variable name?
  if (inherits(new_variable, "try-error")) {
    # in which step did error happen?
    step_number <- switch(as.character(i),
      "1" = "the first expression",
      "2" = "the second expression",
      "3" = "the third expression",
      paste("expression", i)
    )
    step_msg <- paste0("There was an error in ", step_number, ".")
    # try to find out which variable was the cause for the error
    error_msg <- attributes(new_variable)$condition$message
    if (grepl("object '(.*)' not found", error_msg)) {
      error_var <- gsub("object '(.*)' not found", "\\1", error_msg)
      insight::format_error(
        paste0(step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment."),
        .misspelled_string(colnames(data), error_var, "Possibly misspelled or not yet defined?")
      )
    } else {
      insight::format_error(paste0(
        step_msg, " ", insight::format_capitalize(error_msg),
        ". Possibly misspelled or not yet defined?"
      ))
    }
  }

  new_variable
}