File: data_merge.R

package info (click to toggle)
r-cran-datawizard 1.0.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,300 kB
  • sloc: sh: 13; makefile: 2
file content (387 lines) | stat: -rw-r--r-- 13,433 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
#' @title Merge (join) two data frames, or a list of data frames
#' @name data_merge
#'
#' @description
#' Merge (join) two data frames, or a list of data frames. However, unlike
#' base R's `merge()`, `data_merge()` offers a few more methods to join data
#' frames, and it does not drop data frame nor column attributes.
#'
#' @param x,y A data frame to merge. `x` may also be a list of data frames
#'   that will be merged. Note that the list-method has no `y` argument.
#' @param join Character vector, indicating the method of joining the data frames.
#'   Can be `"full"`, `"left"` (default), `"right"`, `"inner"`, `"anti"`, `"semi"`
#'   or `"bind"`. See details below.
#' @param by Specifications of the columns used for merging.
#' @param id Optional name for ID column that will be created to indicate the
#'   source data frames for appended rows. Only applies if `join = "bind"`.
#' @param verbose Toggle warnings.
#' @param ... Not used.
#'
#' @return
#' A merged data frame.
#'
#' @section Merging data frames:
#'
#' Merging data frames is performed by adding rows (cases), columns
#' (variables) or both from the source data frame (`y`) to the target
#' data frame (`x`). This usually requires one or more variables which
#' are included in both data frames and that are used for merging, typically
#' indicated with the `by` argument. When `by` contains a variable present
#' in both data frames, cases are matched and filtered by identical values
#' of `by` in `x` and `y`.
#'
#' @section Left- and right-joins:
#'
#' Left- and right joins usually don't add new rows (cases), but only new
#' columns (variables) for existing cases in `x`. For `join = "left"` or
#' `join = "right"` to work, `by` *must* indicate one or more columns that
#' are included in both data frames. For `join = "left"`, if `by` is an
#' identifier variable, which is included in both `x` and `y`, all variables
#' from `y` are copied to `x`, but only those cases from `y` that have
#' matching values in their identifier variable in `x` (i.e. all cases
#' in `x` that are also found in `y` get the related values from the new
#' columns in `y`). If there is no match between identifiers in `x` and `y`,
#' the copied variable from `y` will get a `NA` value for this particular
#' case. Other variables that occur both in `x` and `y`, but are not used
#' as identifiers (with `by`), will be renamed to avoid multiple identical
#' variable names. Cases in `y` where values from the identifier have no
#' match in `x`'s identifier are removed. `join = "right"` works in
#' a similar way as `join = "left"`, just that only cases from `x` that
#' have matching values in their identifier variable in `y` are chosen.
#'
#' In base R, these are equivalent to `merge(x, y, all.x = TRUE)` and
#' `merge(x, y, all.y = TRUE)`.
#'
#' @section Full joins:
#'
#' Full joins copy all cases from `y` to `x`. For matching cases in both
#' data frames, values for new variables are copied from `y` to `x`. For
#' cases in `y` not present in `x`, these will be added as new rows to `x`.
#' Thus, full joins not only add new columns (variables), but also might
#' add new rows (cases).
#'
#' In base R, this is equivalent to `merge(x, y, all = TRUE)`.
#'
#' @section Inner joins:
#'
#' Inner joins merge two data frames, however, only those rows (cases) are
#' kept that are present in both data frames. Thus, inner joins usually
#' add new columns (variables), but also remove rows (cases) that only
#' occur in one data frame.
#'
#' In base R, this is equivalent to `merge(x, y)`.
#'
#' @section Binds:
#'
#' `join = "bind"` row-binds the complete second data frame `y` to `x`.
#' Unlike simple `rbind()`, which requires the same columns for both data
#' frames, `join = "bind"` will bind shared columns from `y` to `x`, and
#' add new columns from `y` to `x`.
#'
#' @examples
#'
#' x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3)
#' y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 2:4)
#'
#' x
#' y
#'
#' # "by" will default to all shared columns, i.e. "c" and "id". new columns
#' # "d" and "e" will be copied from "y" to "x", but there are only two cases
#' # in "x" that have the same values for "c" and "id" in "y". only those cases
#' # have values in the copied columns, the other case gets "NA".
#' data_merge(x, y, join = "left")
#'
#' # we change the id-value here
#' x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3)
#' y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 3:5)
#'
#' x
#' y
#'
#' # no cases in "y" have the same matching "c" and "id" as in "x", thus
#' # copied variables from "y" to "x" copy no values, all get NA.
#' data_merge(x, y, join = "left")
#'
#' # one case in "y" has a match in "id" with "x", thus values for this
#' # case from the remaining variables in "y" are copied to "x", all other
#' # values (cases) in those remaining variables get NA
#' data_merge(x, y, join = "left", by = "id")
#'
#' data(mtcars)
#' x <- mtcars[1:5, 1:3]
#' y <- mtcars[28:32, 4:6]
#'
#' # add ID common column
#' x$id <- 1:5
#' y$id <- 3:7
#'
#' # left-join, add new variables and copy values from y to x,
#' # where "id" values match
#' data_merge(x, y)
#'
#' # right-join, add new variables and copy values from x to y,
#' # where "id" values match
#' data_merge(x, y, join = "right")
#'
#' # full-join
#' data_merge(x, y, join = "full")
#'
#'
#' data(mtcars)
#' x <- mtcars[1:5, 1:3]
#' y <- mtcars[28:32, c(1, 4:5)]
#'
#' # add ID common column
#' x$id <- 1:5
#' y$id <- 3:7
#'
#' # left-join, no matching rows (because columns "id" and "disp" are used)
#' # new variables get all NA values
#' data_merge(x, y)
#'
#' # one common value in "mpg", so one row from y is copied to x
#' data_merge(x, y, by = "mpg")
#'
#' # only keep rows with matching values in by-column
#' data_merge(x, y, join = "semi", by = "mpg")
#'
#' # only keep rows with non-matching values in by-column
#' data_merge(x, y, join = "anti", by = "mpg")
#'
#' # merge list of data frames. can be of different rows
#' x <- mtcars[1:5, 1:3]
#' y <- mtcars[28:31, 3:5]
#' z <- mtcars[11:18, c(1, 3:4, 6:8)]
#' x$id <- 1:5
#' y$id <- 4:7
#' z$id <- 3:10
#' data_merge(list(x, y, z), join = "bind", by = "id", id = "source")
#' @inherit data_rename seealso
#' @export
data_merge <- function(x, ...) {
  UseMethod("data_merge")
}

#' @rdname data_merge
#' @export
data_join <- data_merge

#' @rdname data_merge
#' @export
data_merge.data.frame <- function(x, y, join = "left", by = NULL, id = NULL, verbose = TRUE, ...) {
  class_x <- class(x)

  # save variable attributes
  attr_x_vars <- lapply(x, attributes)
  attr_y_vars <- lapply(y, attributes)
  attr_vars <- c(attr_x_vars, attr_y_vars[names(attr_y_vars)[!names(attr_y_vars) %in% names(attr_x_vars)]])


  # check join-argument ----------------------

  join <- match.arg(join, choices = c("full", "left", "right", "inner", "semi", "anti", "bind"))


  # check id-argument ----------------------

  all_columns <- union(colnames(x), colnames(y))

  if (join == "bind" && !is.null(id) && id %in% all_columns) {
    # ensure unique ID
    id <- make.unique(c(all_columns, id), sep = "_")[length(all_columns) + 1]
    # and also tell user...
    if (isTRUE(verbose)) {
      insight::format_warning(
        sprintf("Value of `id` already exists as column name. ID column was renamed to `%s`.", id)
      )
    }
  }

  if (!is.null(id) && join == "bind") {
    x[[id]] <- 1
    y[[id]] <- 2
  }


  # check merge columns ("by"-argument) ----------------------

  if (join != "bind") {
    # we need a value for "by". If not provided, use all shared column names
    if (is.null(by)) {
      by <- intersect(colnames(x), colnames(y))
    }

    # If not all column names specified in "by" are present, yield warning
    # and use all shared column names
    if (!all(by %in% colnames(x)) || !all(by %in% colnames(y))) {
      missing_in_x <- setdiff(by, colnames(x))
      missing_in_y <- setdiff(by, colnames(y))
      stop_message <- c(
        "Not all columns specified in `by` were found in the data frames.",
        if (length(missing_in_x) > 0L) {
          paste0("Following columns are in `by` but absent in `x`: ", text_concatenate(missing_in_x))
        },
        if (length(missing_in_y) > 0L) {
          paste0("Following columns are in `by` but absent in `y`: ", text_concatenate(missing_in_y))
        }
      )
      if (isTRUE(verbose)) {
        insight::format_error(stop_message)
      }
    }

    # if still both data frames have no common columns, do a full join
    if (!length(by)) {
      if (isTRUE(verbose)) {
        insight::format_warning(
          "Found no matching columns in the data frames. Fully merging both data frames now.",
          "Note that this can lead to unintended results, because rows in `x` and `y` are possibly duplicated.",
          "You probably want to use `data_merge(x, y, join = \"bind\")` instead."
        )
      }
      by <- NULL
      join <- "full"
    }
  }


  # check valid combination of "join" and "by" -----------------------

  if (join %in% c("anti", "semi") && (is.null(by) || length(by) != 1)) {
    insight::format_error(
      sprintf(
        "For `join = \"%s\"`, `by` needs to be a name of only one variable that is present in both data frames.",
        join
      )
    )
  }


  # merge --------------------

  # for later sorting
  if (join != "bind") {
    if (nrow(x) > 0L) {
      x$.data_merge_id_x <- seq_len(nrow(x))
    }
    if (nrow(y) > 0L) {
      y$.data_merge_id_y <- (seq_len(nrow(y))) + nrow(x)
    }
  }
  all_columns <- union(colnames(x), colnames(y))

  out <- switch(join,
    full = merge(x, y, all = TRUE, sort = FALSE, by = by),
    left = merge(x, y, all.x = TRUE, sort = FALSE, by = by),
    right = merge(x, y, all.y = TRUE, sort = FALSE, by = by),
    inner = merge(x, y, sort = FALSE, by = by),
    semi = x[x[[by]] %in% y[[by]], , drop = FALSE],
    anti = x[!x[[by]] %in% y[[by]], , drop = FALSE],
    bind = .bind_data_frames(x, y)
  )


  # sort rows, add attributes, and return results -------------------------

  if (".data_merge_id_x" %in% colnames(out)) {
    # for full joins, we have no complete sorting id, but NAs for each
    # data frame. we now "merge" the two sorting IDs from each data frame.
    if (anyNA(out$.data_merge_id_x) && ".data_merge_id_y" %in% colnames(out)) {
      out$.data_merge_id_x[is.na(out$.data_merge_id_x)] <- out$.data_merge_id_y[is.na(out$.data_merge_id_x)]
    }
    out <- out[order(out$.data_merge_id_x), ]
    out$.data_merge_id_x <- NULL
    out$.data_merge_id_y <- NULL
  }

  # try to restore original column order as good as possible. Therefore, we
  # first take all column names of the original input data frames, then
  # we add all new columns, like duplicated from merging (name.x and name.y,
  # if "name" was in both data frames, but not used in "by"), and then do a
  # final check that all column names are present in "out" (e.g., "name" would)
  # no longer be there if we have "name.x" and "name.y").

  all_columns <- c(all_columns, setdiff(colnames(out), all_columns))
  all_columns <- all_columns[all_columns %in% colnames(out)]
  out <- out[all_columns]

  # add back attributes
  out <- .replace_attrs(out, attributes(y))
  out <- .replace_attrs(out, attributes(x))

  for (i in colnames(out)) {
    if (is.list(attr_vars[[i]])) {
      if (is.list(attributes(out[[i]]))) {
        attributes(out[[i]]) <- utils::modifyList(attr_vars[[i]], attributes(out[[i]]))
      } else {
        attributes(out[[i]]) <- attr_vars[[i]]
      }
    }
  }

  class(out) <- unique(c(class_x, "data.frame"))
  out
}


#' @rdname data_merge
#' @export
data_merge.list <- function(x, join = "left", by = NULL, id = NULL, verbose = TRUE, ...) {
  out <- x[[1]]
  df_id <- rep(1, times = nrow(out))

  for (i in 2:length(x)) {
    out <- data_merge(out, x[[i]], join = join, by = by, id = NULL, verbose = verbose, ...)
    df_id <- c(df_id, rep(i, times = nrow(x[[i]])))
  }

  # we need separate handling for list of data frames and id-variable here
  if (!is.null(id) && join == "bind") {
    if (id %in% colnames(out)) {
      # ensure unique ID
      id <- make.unique(c(colnames(out), id), sep = "_")[length(colnames(out)) + 1]
      # and also tell user...
      if (isTRUE(verbose)) {
        insight::format_warning(
          sprintf("Value of `id` already exists as column name. ID column was renamed to `%s`.", id)
        )
      }
    }
    out[[id]] <- df_id
  }

  out
}


.bind_data_frames <- function(x, y) {
  # merge and sort. "rbind()" is faster than "merge()" if all columns present
  if (all(colnames(x) %in% colnames(y)) && ncol(x) == ncol(y)) {
    # we may have different column order
    out <- rbind(x, y[match(colnames(x), colnames(y))])
  } else {
    # add ID for merging
    if (nrow(x) > 0L) {
      x$.data_merge_row <- seq_len(nrow(x))
    }
    if (nrow(y) > 0L) {
      y$.data_merge_row <- (nrow(x) + 1):(nrow(x) + nrow(y))
    }
    merge_by <- intersect(colnames(x), colnames(y))
    out <- merge(x, y, all = TRUE, sort = FALSE, by = merge_by)
  }

  # for empty df's, merge() may return an empty character vector
  # make sure it's a data frame object.
  if (!is.data.frame(out)) {
    out <- as.data.frame(out)
  }

  if (".data_merge_row" %in% colnames(out)) {
    out <- out[order(out$.data_merge_row), ]
  }

  out$.data_merge_row <- NULL
  out
}