File: data_rename.R

package info (click to toggle)
r-cran-datawizard 1.0.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,300 kB
  • sloc: sh: 13; makefile: 2
file content (304 lines) | stat: -rw-r--r-- 11,082 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
#' @title Rename columns and variable names
#' @name data_rename
#'
#' @description Safe and intuitive functions to rename variables or rows in
#'   data frames. `data_rename()` will rename column names, i.e. it facilitates
#'   renaming variables. `data_rename_rows()` is a convenient shortcut
#'   to add or rename row names of a data frame, but unlike `row.names()`, its
#'   input and output is a data frame, thus, integrating smoothly into a
#'   possible pipe-workflow.
#'
#' @inheritParams extract_column_names
#' @param data A data frame.
#' @param replacement Character vector. Can be one of the following:
#'   - A character vector that indicates the new names of the columns selected
#'     in `select`. `select` and `replacement` must be of the same length.
#'   - A string (i.e. character vector of length 1) with a "glue" styled
#'     pattern. Currently supported tokens are:
#'     - `{col}` which will be replaced by the column name, i.e. the
#'       corresponding value in `select`.
#'     - `{n}` will be replaced by the number of the variable that is replaced.
#'     - `{letter}` will be replaced by alphabetical letters in sequential
#'       order.
#'       If more than 26 letters are required, letters are repeated, but have
#'       sequential numeric indices (e.g., `a1` to `z1`, followed by `a2` to
#'       `z2`).
#'     - Finally, the name of a user-defined object that is available in the
#'       environment can be used. Note that the object's name is not allowed to
#'       be one of the pre-defined tokens, `"col"`, `"n"` and `"letter"`.
#'
#'     An example for the use of tokens is...
#'     ```r
#'     data_rename(
#'       mtcars,
#'       select = c("am", "vs"),
#'       replacement = "new_name_from_{col}"
#'     )
#'     ```
#'     ... which would return new column names `new_name_from_am` and
#'     `new_name_from_vs`. See 'Examples'.
#'
#' If `select` is a named vector, `replacement` is ignored.
#' @param rows Vector of row names.
#' @param safe Deprecated. Passing unknown column names now always errors.
#' @param pattern Deprecated. Use `select` instead.
#' @param ... Other arguments passed to or from other functions.
#'
#' @details
#' `select` can also be a named character vector. In this case, the names are
#' used to rename the columns in the output data frame. If you have a named
#' list, use `unlist()` to convert it to a named vector. See 'Examples'.
#'
#' @return A modified data frame.
#'
#' @examples
#' # Rename columns
#' head(data_rename(iris, "Sepal.Length", "length"))
#'
#' # Use named vector to rename
#' head(data_rename(iris, c(length = "Sepal.Length", width = "Sepal.Width")))
#'
#' # Change all
#' head(data_rename(iris, replacement = paste0("Var", 1:5)))
#'
#' # Use glue-styled patterns
#' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "formerly_{col}"))
#' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "{col}_is_column_{n}"))
#' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "new_{letter}"))
#'
#' # User-defined glue-styled patterns from objects in environment
#' x <- c("hi", "there", "!")
#' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "col_{x}"))
#' @seealso
#' - Add a prefix or suffix to column names: [data_addprefix()], [data_addsuffix()]
#' - Functions to reorder or remove columns: [data_reorder()], [data_relocate()],
#'   [data_remove()]
#' - Functions to reshape, pivot or rotate data frames: [data_to_long()],
#'   [data_to_wide()], [data_rotate()]
#' - Functions to recode data: [rescale()], [reverse()], [categorize()],
#'   [recode_values()], [slide()]
#' - Functions to standardize, normalize, rank-transform: [center()], [standardize()],
#'   [normalize()], [ranktransform()], [winsorize()]
#' - Split and merge data frames: [data_partition()], [data_merge()]
#' - Functions to find or select columns: [data_select()], [extract_column_names()]
#' - Functions to filter rows: [data_match()], [data_filter()]
#'
#' @export
data_rename <- function(data,
                        select = NULL,
                        replacement = NULL,
                        safe = TRUE,
                        verbose = TRUE,
                        pattern = NULL,
                        ...) {
  # check for valid input
  if (!is.data.frame(data)) {
    insight::format_error("Argument `data` must be a data frame.")
  }
  # If the user does data_rename(iris, pattern = "Sepal.Length", "length"),
  # then "length" is matched to select by position while it's the replacement
  # => do the switch manually
  if (!is.null(pattern)) {
    .is_deprecated("pattern", "select")
    if (!is.null(select)) {
      replacement <- select
    }
    select <- pattern
  }
  if (isFALSE(safe)) {
    insight::format_warning("In `data_rename()`, argument `safe` is no longer used and will be removed in a future release.") # nolint
  }

  # change all names if no pattern specified
  select <- .select_nse(
    select,
    data,
    exclude = NULL,
    ignore_case = NULL,
    regex = NULL,
    allow_rename = TRUE,
    verbose = verbose,
    ifnotfound = "error"
  )

  # Forbid partially named "select",
  # Ex: if select = c("foo" = "Species", "Sepal.Length") then the 2nd name and
  # 2nd value are "Sepal.Length"
  if (!is.null(names(select)) && any(names(select) == select)) {
    insight::format_error("When `select` is a named vector, all elements must be named.")
  }

  # check if `select` has names, and if so, use as "replacement"
  if (!is.null(names(select))) {
    replacement <- names(select)
  }

  # coerce to character
  replacement <- as.character(replacement)

  # check if `replacement` has no empty strings and no NA values
  invalid_replacement <- is.na(replacement) | !nzchar(replacement)
  if (any(invalid_replacement)) {
    if (is.null(names(select))) {
      # when user did not match `select` with `replacement`
      msg <- c(
        "`replacement` is not allowed to have `NA` or empty strings.",
        sprintf(
          "Following values in `select` have no match in `replacement`: %s",
          toString(select[invalid_replacement])
        )
      )
    } else {
      # when user did not name all elements of `select`
      msg <- c(
        "Either name all elements of `select` or use `replacement`.",
        sprintf(
          "Following values in `select` were not named: %s",
          toString(select[invalid_replacement])
        )
      )
    }
    insight::format_error(msg)
  }

  # if duplicated names in replacement, append ".2", ".3", etc. to duplicates
  # ex: c("foo", "foo") -> c("foo", "foo.2")
  if (anyDuplicated(replacement) > 0L) {
    dup <- as.data.frame(table(replacement))
    dup <- dup[dup$Freq > 1, ]
    for (i in dup$replacement) {
      to_replace <- which(replacement == i)[-1]
      new_replacement <- paste0(i, ".", 1 + seq_along(to_replace))
      replacement[to_replace] <- new_replacement
    }
  }

  # check if we have "glue" styled replacement-string
  glue_style <- length(replacement) == 1 && grepl("{", replacement, fixed = TRUE)

  if (length(replacement) > length(select)) {
    insight::format_error("There are more names in `replacement` than in `select`.")
  } else if (length(replacement) < length(select) && !glue_style) {
    insight::format_error("There are more names in `select` than in `replacement`")
  }

  # if we have glue-styled replacement-string, create replacement select now
  if (glue_style) {
    replacement <- .glue_replacement(select, replacement)
  }

  for (i in seq_along(select)) {
    if (!is.na(replacement[i])) {
      data <- .data_rename(data, select[i], replacement[i], safe, verbose)
    }
  }

  data
}

#' @keywords internal
.data_rename <- function(data, pattern, replacement, safe = TRUE, verbose = TRUE) {
  if (!pattern %in% names(data)) {
    if (isTRUE(safe)) {
      # only give message when verbose is TRUE
      if (verbose) {
        insight::format_alert(paste0("Variable `", pattern, "` is not in your data frame :/"))
      }
      # if not safe, always error, no matter what verbose is
    } else {
      insight::format_error(paste0("Variable `", pattern, "` is not in your data frame :/"))
    }
  }

  names(data) <- replace(names(data), names(data) == pattern, replacement)

  data
}


.glue_replacement <- function(pattern, replacement) {
  # this function replaces "glue" tokens into their related
  # real names/values. Currently, following tokens are accepted:
  # - {col}: replacement is the name of the column (indicated in "pattern")
  # - {letter}: replacement is lower-case alphabetically letter, in sequential order
  # - {n}: replacement is the number of the variable out of n, that should be renamed
  out <- rep_len("", length(pattern))

  # for alphabetical letters, we prepare a string if we have more than
  # 26 columns to rename
  if (length(out) > 26) {
    long_letters <- paste0(
      rep.int(letters[1:26], times = ceiling(length(out) / 26)),
      rep(1:ceiling(length(out) / 26), each = 26)
    )
  } else {
    long_letters <- letters[1:26]
  }
  long_letters <- long_letters[seq_len(length(out))]

  for (i in seq_along(out)) {
    # prepare pattern
    column_name <- pattern[i]
    out[i] <- replacement
    # replace first pre-defined token
    out[i] <- gsub(
      "(.*)(\\{col\\})(.*)",
      replacement = paste0("\\1", column_name, "\\3"),
      x = out[i]
    )
    # replace second pre-defined token
    out[i] <- gsub(
      "(.*)(\\{n\\})(.*)",
      replacement = paste0("\\1", i, "\\3"),
      x = out[i]
    )
    # replace third pre-defined token
    out[i] <- gsub(
      "(.*)(\\{letter\\})(.*)",
      replacement = paste0("\\1", long_letters[i], "\\3"),
      x = out[i]
    )
    # extract all non-standard tokens
    matches <- unlist(
      regmatches(out[i], gregexpr("\\{([^}]*)\\}", out[i])),
      use.names = FALSE
    )
    # do we have any additional tokens, i.e. variable names from the environment?
    # users can also specify variable names, where the
    if (length(matches)) {
      # if so, iterate all tokens
      for (token in matches) {
        # evaluate token-object from the environment
        values <- .dynEval(
          str2lang(gsub("\\{(.*)\\}", "\\1", token)),
          ifnotfound = insight::format_error(paste0(
            "The object `", token, "` was not found. Please check if it really exists."
          ))
        )
        # check for correct length
        if (length(values) != length(pattern)) {
          insight::format_error(paste0(
            "The number of values provided in `", token, "` (", length(values),
            " values) do not match the number of columns to rename (",
            length(pattern), " columns)."
          ))
        }
        # replace token with values from the object
        if (length(values)) {
          out[i] <- gsub(token, values[i], out[i], fixed = TRUE)
        }
      }
    }
  }
  out
}


# Row.names ----------------------------------------------------------------

#' @rdname data_rename
#' @export
data_rename_rows <- function(data, rows = NULL) {
  row.names(data) <- rows
  data
}