File: data_separate.R

package info (click to toggle)
r-cran-datawizard 1.0.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,300 kB
  • sloc: sh: 13; makefile: 2
file content (395 lines) | stat: -rw-r--r-- 14,045 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
#' @title Separate single variable into multiple variables
#' @name data_separate
#'
#' @description
#' Separates a single variable into multiple new variables.
#'
#' @param data A data frame.
#' @param new_columns The names of the new columns, as character vector. If
#' more than one variable was selected (in `select`), the new names are prefixed
#' with the name of the original column. `new_columns` can also be a list of
#' (named) character vectors when multiple variables should be separated. See
#' 'Examples'.
#' @param separator Separator between columns. Can be a character vector, which
#' is then treated as regular expression, or a numeric vector that indicates at
#' which positions the string values will be split.
#' @param append Logical, if `FALSE` (default), removes original columns that
#' were separated. If `TRUE`, all columns are preserved and the new columns are
#' appended to the data frame.
#' @param guess_columns If `new_columns` is not given, the required number of
#' new columns is guessed based on the results of value splitting. For example,
#' if a variable is split into three new columns, this will be considered as
#' the required number of new columns, and columns are named `"split_1"`,
#' `"split_2"` and `"split_3"`. When values from a variable are split into
#' different amount of new columns, the `guess_column` can be either `"mode"`
#' (number of new columns is based on the most common number of splits), `"min"`
#' or `"max"` to use the minimum resp. maximum number of possible splits as
#' required number of columns.
#' @param fill How to deal with values that return fewer new columns after
#' splitting? Can be `"left"` (fill missing columns from the left with `NA`),
#' `"right"` (fill missing columns from the right with `NA`) or `"value_left"`
#' or `"value_right"` to fill missing columns from left or right with the
#' left-most or right-most values.
#' @param extra How to deal with values that return too many new columns after
#' splitting? Can be `"drop_left"` or `"drop_right"` to drop the left-most or
#' right-most values, or `"merge_left"` or `"merge_right"` to merge the left-
#' or right-most value together, and keeping all remaining values as is.
#' @param merge_multiple Logical, if `TRUE` and more than one variable is selected
#' for separating, new columns can be merged. Value pairs of all split variables
#' are merged.
#' @param merge_separator Separator string when `merge_multiple = TRUE`. Defines
#' the string that is used to merge values together.
#' @param convert_na Logical, if `TRUE`, character `"NA"` values are converted
#' into real `NA` values.
#' @param ... Currently not used.
#' @inheritParams extract_column_names
#'
#' @seealso [`data_unite()`]
#'
#' @return A data frame with the newly created variable(s), or - when `append = TRUE` -
#' `data` including new variables.
#'
#' @examples
#' # simple case
#' d <- data.frame(
#'   x = c("1.a.6", "2.b.7", "3.c.8"),
#'   stringsAsFactors = FALSE
#' )
#' d
#' data_separate(d, new_columns = c("a", "b", "c"))
#'
#' # guess number of columns
#' d <- data.frame(
#'   x = c("1.a.6", NA, "2.b.6.7", "3.c", "x.y.z"),
#'   stringsAsFactors = FALSE
#' )
#' d
#' data_separate(d, guess_columns = "mode")
#'
#' data_separate(d, guess_columns = "max")
#'
#' # drop left-most column
#' data_separate(d, guess_columns = "mode", extra = "drop_left")
#'
#' # merge right-most column
#' data_separate(d, guess_columns = "mode", extra = "merge_right")
#'
#' # fill columns with fewer values with left-most values
#' data_separate(d, guess_columns = "mode", fill = "value_left")
#'
#' # fill and merge
#' data_separate(
#'   d,
#'   guess_columns = "mode",
#'   fill = "value_left",
#'   extra = "merge_right"
#' )
#'
#' # multiple columns to split
#' d <- data.frame(
#'   x = c("1.a.6", "2.b.7", "3.c.8"),
#'   y = c("x.y.z", "10.11.12", "m.n.o"),
#'   stringsAsFactors = FALSE
#' )
#' d
#' # split two columns, default column names
#' data_separate(d, guess_columns = "mode")
#'
#' # split into new named columns, repeating column names
#' data_separate(d, new_columns = c("a", "b", "c"))
#'
#' # split selected variable new columns
#' data_separate(d, select = "y", new_columns = c("a", "b", "c"))
#'
#' # merge multiple split columns
#' data_separate(
#'   d,
#'   new_columns = c("a", "b", "c"),
#'   merge_multiple = TRUE
#' )
#'
#' # merge multiple split columns
#' data_separate(
#'   d,
#'   new_columns = c("a", "b", "c"),
#'   merge_multiple = TRUE,
#'   merge_separator = "-"
#' )
#'
#' # separate multiple columns, give proper column names
#' d_sep <- data.frame(
#'   x = c("1.a.6", "2.b.7.d", "3.c.8", "5.j"),
#'   y = c("m.n.99.22", "77.f.g.34", "44.9", NA),
#'   stringsAsFactors = FALSE
#' )
#'
#' data_separate(
#'   d_sep,
#'   select = c("x", "y"),
#'   new_columns = list(
#'     x = c("A", "B", "C"), # separate "x" into three columns
#'     y = c("EE", "FF", "GG", "HH") # separate "y" into four columns
#'   ),
#'   verbose = FALSE
#' )
#' @export
data_separate <- function(data,
                          select = NULL,
                          new_columns = NULL,
                          separator = "[^[:alnum:]]+",
                          guess_columns = NULL,
                          merge_multiple = FALSE,
                          merge_separator = "",
                          fill = "right",
                          extra = "drop_right",
                          convert_na = TRUE,
                          exclude = NULL,
                          append = FALSE,
                          ignore_case = FALSE,
                          verbose = TRUE,
                          regex = FALSE,
                          ...) {
  # we need at least one explicit choice for either `new_columns` or `guess_columns`
  if (is.null(new_columns) && is.null(guess_columns)) {
    insight::format_error("Cannot separate values. Either `new_columns` or `guess_columns` must be provided.")
  }
  # in case user did not provide names of new columns, we can try
  # to guess number of columns per variable
  guess_columns <- match.arg(guess_columns, choices = c("min", "max", "mode"))

  # make sure we have valid options for fill and extra
  fill <- match.arg(fill, choices = c("left", "right", "value_left", "value_right"))
  extra <- match.arg(extra, choices = c("drop_left", "drop_right", "merge_left", "merge_right"))

  # evaluate select/exclude, may be select-helpers
  select <- .select_nse(select,
    data,
    exclude,
    ignore_case,
    regex = regex,
    verbose = verbose
  )

  # make new_columns as list, this works with single and multiple columns
  if (!is.null(new_columns) && !is.list(new_columns)) {
    new_columns <- rep(list(new_columns), times = length(select))
    # if we have multiple columns that were separated, we avoid duplicated
    # column names of created variables by appending name of original column
    # however, we don't have duplicated column names when we merge them together
    # so don't create new column names when "merge_multiple" is FALSE.
    make_unique_colnames <- length(select) > 1 && !merge_multiple
  } else {
    # we don't want to create own unique column names when user explicitly
    # provided column names as a list, i.e. column names for each separated
    # variable
    make_unique_colnames <- FALSE
  }

  # make sure list of new column names is named
  if (!is.null(new_columns) && is.null(names(new_columns))) {
    names(new_columns) <- select
  }

  # iterate columns that should be split
  split_data <- lapply(select, function(sep_column) {
    # do we have known number of columns?
    if (is.null(new_columns)) {
      n_columns <- NULL
    } else {
      n_columns <- length(new_columns[[sep_column]])
    }

    # make sure we have a character that we can split
    x <- data[[sep_column]]
    if (!is.character(x)) {
      x <- as.character(x)
    }

    # separate column into multiple strings
    if (is.numeric(separator)) {
      maxlen <- max(nchar(x), na.rm = TRUE)
      starts <- c(0, separator)
      ends <- c(separator - 1, maxlen)
      separated_columns <- lapply(seq_along(starts), function(i) {
        substr(x, starts[i], ends[i])
      })
      separated_columns <- as.data.frame(
        do.call(rbind, separated_columns),
        stringsAsFactors = FALSE
      )
    } else {
      separated_columns <- strsplit(x, separator, perl = TRUE)
    }

    # how many new columns do we need?
    if (is.null(n_columns)) {
      # lengths of all split strings
      l <- lengths(separated_columns)
      # but without NA values
      l <- l[!vapply(l, function(i) all(is.na(i)), TRUE)]
      # define number of new columns, based on user-choice
      n_cols <- switch(guess_columns,
        min = min(l, na.rm = TRUE),
        max = max(l, na.rm = TRUE),
        mode = distribution_mode(l),
      )
      # tell user
      if (verbose && insight::n_unique(l) != 1 && !is.numeric(separator)) {
        insight::format_alert(paste0(
          "Column `", sep_column, "` had different number of values after splitting. Variable was split into ",
          n_cols, " column", ifelse(n_cols > 1, "s", ""), "."
        ))
      }
    } else {
      # else, if we know number of columns, use that number
      n_cols <- n_columns
    }

    # main task here - fill or drop values for all columns
    separated_columns <- tryCatch(
      .fix_separated_columns(separated_columns, fill, extra, n_cols, sep_column, verbose),
      error = function(e) NULL
    )

    # catch error
    if (is.null(separated_columns)) {
      insight::format_error(
        "Something went wrong. Probably the number of provided column names did not match number of newly created columns?" # nolint
      )
    }

    # bind separated columns into data frame and set column names
    out <- as.data.frame(do.call(rbind, separated_columns))

    # if no column names provided, use standard names
    if (is.null(new_columns[[sep_column]])) {
      new_column_names <- paste0(sep_column, "_", seq_along(out))
    } else if (make_unique_colnames) {
      # if we have multiple columns that were separated, we avoid duplicated
      # column names of created variables by appending name of original column
      new_column_names <- paste0(sep_column, "_", new_columns[[sep_column]])
    } else {
      new_column_names <- new_columns[[sep_column]]
    }

    colnames(out) <- new_column_names
    out
  })

  # any split performed?
  if (all(lengths(split_data) == 1)) {
    if (verbose) {
      insight::format_alert("Separator probably not found. No values were split. Returning original data.")
    }
    return(data)
  }

  # final preparation, bind or merge columns, make unique columm names
  if (isTRUE(merge_multiple) && length(split_data) > 1) {
    # we merge all split columns, which are currently saved as list
    # of data frames, together into one data frame
    for (i in 2:length(split_data)) {
      for (j in seq_along(split_data[[1]])) {
        split_data[[1]][[j]] <- gsub(" ", "",
          paste(
            split_data[[1]][[j]],
            split_data[[i]][[j]],
            sep = merge_separator
          ),
          fixed = TRUE
        )
      }
    }
    split_data <- split_data[[1]]
  } else {
    # bind all columns
    split_data <- do.call(cbind, split_data)
  }

  # convert "NA" strings into real NA?
  if (convert_na) {
    split_data[] <- lapply(split_data, function(i) {
      i[i == "NA"] <- NA_character_
      i
    })
  }

  data <- cbind(data, split_data)
  if (!isTRUE(append)) {
    data[select] <- NULL
  }

  # fin
  data
}


#' @keywords internal
.fix_separated_columns <- function(separated_columns, fill, extra, n_cols, sep_column, verbose = TRUE) {
  warn_extra <- warn_fill <- FALSE
  for (sc in seq_along(separated_columns)) {
    i <- separated_columns[[sc]]
    # determine number of values in separated column
    n_values <- length(i)
    if (all(is.na(i))) {
      # we have NA values - so fill everything with NA
      out <- rep(NA_character_, times = n_cols)
    } else if (n_values > n_cols) {
      # we have more values than required - drop extra columns
      out <- switch(extra,
        drop_left = i[(n_values - n_cols + 1):n_values],
        drop_right = i[1:n_cols],
        merge_left = {
          tmp <- paste(i[1:(n_values - n_cols + 1)], collapse = " ")
          c(tmp, i[(n_values - n_cols + 2):n_values])
        },
        {
          tmp <- i[1:(n_cols - 1)]
          c(tmp, paste(i[n_cols:n_values], collapse = " "))
        }
      )
      warn_extra <- TRUE
    } else if (n_values < n_cols) {
      # we have fewer values than required - fill columns
      out <- switch(fill,
        left = c(rep(NA_character_, times = n_cols - n_values), i),
        right = c(i, rep(NA_character_, times = n_cols - n_values)),
        value_left = c(rep(i[1], times = n_cols - n_values), i),
        c(i, rep(i[length(i)], times = n_cols - n_values))
      )
      warn_fill <- TRUE
    } else {
      out <- i
    }
    separated_columns[[sc]] <- out
  }

  if (verbose) {
    if (warn_extra) {
      insight::format_alert(paste0(
        "`", sep_column, "`",
        " returned more columns than expected after splitting. ",
        switch(extra,
          drop_left = "Left-most columns have been dropped.",
          drop_right = "Right-most columns have been dropped.",
          merge_left = "Left-most columns have been merged together.",
          merge_right = "Right-most columns have been merged together."
        )
      ))
    }
    if (warn_fill) {
      insight::format_alert(paste0(
        "`", sep_column, "`",
        "returned fewer columns than expected after splitting. ",
        switch(fill,
          left = "Left-most columns were filled with `NA`.",
          right = "Right-most columns were filled with `NA`.",
          value_left = "Left-most columns were filled with first value.",
          value_right = "Right-most columns were filled with last value."
        )
      ))
    }
  }

  separated_columns
}