1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387
|
#' @title Merge (join) two data frames, or a list of data frames
#' @name data_merge
#'
#' @description
#' Merge (join) two data frames, or a list of data frames. However, unlike
#' base R's `merge()`, `data_merge()` offers a few more methods to join data
#' frames, and it does not drop data frame nor column attributes.
#'
#' @param x,y A data frame to merge. `x` may also be a list of data frames
#' that will be merged. Note that the list-method has no `y` argument.
#' @param join Character vector, indicating the method of joining the data frames.
#' Can be `"full"`, `"left"` (default), `"right"`, `"inner"`, `"anti"`, `"semi"`
#' or `"bind"`. See details below.
#' @param by Specifications of the columns used for merging.
#' @param id Optional name for ID column that will be created to indicate the
#' source data frames for appended rows. Only applies if `join = "bind"`.
#' @param verbose Toggle warnings.
#' @param ... Not used.
#'
#' @return
#' A merged data frame.
#'
#' @section Merging data frames:
#'
#' Merging data frames is performed by adding rows (cases), columns
#' (variables) or both from the source data frame (`y`) to the target
#' data frame (`x`). This usually requires one or more variables which
#' are included in both data frames and that are used for merging, typically
#' indicated with the `by` argument. When `by` contains a variable present
#' in both data frames, cases are matched and filtered by identical values
#' of `by` in `x` and `y`.
#'
#' @section Left- and right-joins:
#'
#' Left- and right joins usually don't add new rows (cases), but only new
#' columns (variables) for existing cases in `x`. For `join = "left"` or
#' `join = "right"` to work, `by` *must* indicate one or more columns that
#' are included in both data frames. For `join = "left"`, if `by` is an
#' identifier variable, which is included in both `x` and `y`, all variables
#' from `y` are copied to `x`, but only those cases from `y` that have
#' matching values in their identifier variable in `x` (i.e. all cases
#' in `x` that are also found in `y` get the related values from the new
#' columns in `y`). If there is no match between identifiers in `x` and `y`,
#' the copied variable from `y` will get a `NA` value for this particular
#' case. Other variables that occur both in `x` and `y`, but are not used
#' as identifiers (with `by`), will be renamed to avoid multiple identical
#' variable names. Cases in `y` where values from the identifier have no
#' match in `x`'s identifier are removed. `join = "right"` works in
#' a similar way as `join = "left"`, just that only cases from `x` that
#' have matching values in their identifier variable in `y` are chosen.
#'
#' In base R, these are equivalent to `merge(x, y, all.x = TRUE)` and
#' `merge(x, y, all.y = TRUE)`.
#'
#' @section Full joins:
#'
#' Full joins copy all cases from `y` to `x`. For matching cases in both
#' data frames, values for new variables are copied from `y` to `x`. For
#' cases in `y` not present in `x`, these will be added as new rows to `x`.
#' Thus, full joins not only add new columns (variables), but also might
#' add new rows (cases).
#'
#' In base R, this is equivalent to `merge(x, y, all = TRUE)`.
#'
#' @section Inner joins:
#'
#' Inner joins merge two data frames, however, only those rows (cases) are
#' kept that are present in both data frames. Thus, inner joins usually
#' add new columns (variables), but also remove rows (cases) that only
#' occur in one data frame.
#'
#' In base R, this is equivalent to `merge(x, y)`.
#'
#' @section Binds:
#'
#' `join = "bind"` row-binds the complete second data frame `y` to `x`.
#' Unlike simple `rbind()`, which requires the same columns for both data
#' frames, `join = "bind"` will bind shared columns from `y` to `x`, and
#' add new columns from `y` to `x`.
#'
#' @examples
#'
#' x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3)
#' y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 2:4)
#'
#' x
#' y
#'
#' # "by" will default to all shared columns, i.e. "c" and "id". new columns
#' # "d" and "e" will be copied from "y" to "x", but there are only two cases
#' # in "x" that have the same values for "c" and "id" in "y". only those cases
#' # have values in the copied columns, the other case gets "NA".
#' data_merge(x, y, join = "left")
#'
#' # we change the id-value here
#' x <- data.frame(a = 1:3, b = c("a", "b", "c"), c = 5:7, id = 1:3)
#' y <- data.frame(c = 6:8, d = c("f", "g", "h"), e = 100:102, id = 3:5)
#'
#' x
#' y
#'
#' # no cases in "y" have the same matching "c" and "id" as in "x", thus
#' # copied variables from "y" to "x" copy no values, all get NA.
#' data_merge(x, y, join = "left")
#'
#' # one case in "y" has a match in "id" with "x", thus values for this
#' # case from the remaining variables in "y" are copied to "x", all other
#' # values (cases) in those remaining variables get NA
#' data_merge(x, y, join = "left", by = "id")
#'
#' data(mtcars)
#' x <- mtcars[1:5, 1:3]
#' y <- mtcars[28:32, 4:6]
#'
#' # add ID common column
#' x$id <- 1:5
#' y$id <- 3:7
#'
#' # left-join, add new variables and copy values from y to x,
#' # where "id" values match
#' data_merge(x, y)
#'
#' # right-join, add new variables and copy values from x to y,
#' # where "id" values match
#' data_merge(x, y, join = "right")
#'
#' # full-join
#' data_merge(x, y, join = "full")
#'
#'
#' data(mtcars)
#' x <- mtcars[1:5, 1:3]
#' y <- mtcars[28:32, c(1, 4:5)]
#'
#' # add ID common column
#' x$id <- 1:5
#' y$id <- 3:7
#'
#' # left-join, no matching rows (because columns "id" and "disp" are used)
#' # new variables get all NA values
#' data_merge(x, y)
#'
#' # one common value in "mpg", so one row from y is copied to x
#' data_merge(x, y, by = "mpg")
#'
#' # only keep rows with matching values in by-column
#' data_merge(x, y, join = "semi", by = "mpg")
#'
#' # only keep rows with non-matching values in by-column
#' data_merge(x, y, join = "anti", by = "mpg")
#'
#' # merge list of data frames. can be of different rows
#' x <- mtcars[1:5, 1:3]
#' y <- mtcars[28:31, 3:5]
#' z <- mtcars[11:18, c(1, 3:4, 6:8)]
#' x$id <- 1:5
#' y$id <- 4:7
#' z$id <- 3:10
#' data_merge(list(x, y, z), join = "bind", by = "id", id = "source")
#' @inherit data_rename seealso
#' @export
data_merge <- function(x, ...) {
UseMethod("data_merge")
}
#' @rdname data_merge
#' @export
data_join <- data_merge
#' @rdname data_merge
#' @export
data_merge.data.frame <- function(x, y, join = "left", by = NULL, id = NULL, verbose = TRUE, ...) {
class_x <- class(x)
# save variable attributes
attr_x_vars <- lapply(x, attributes)
attr_y_vars <- lapply(y, attributes)
attr_vars <- c(attr_x_vars, attr_y_vars[names(attr_y_vars)[!names(attr_y_vars) %in% names(attr_x_vars)]])
# check join-argument ----------------------
join <- match.arg(join, choices = c("full", "left", "right", "inner", "semi", "anti", "bind"))
# check id-argument ----------------------
all_columns <- union(colnames(x), colnames(y))
if (join == "bind" && !is.null(id) && id %in% all_columns) {
# ensure unique ID
id <- make.unique(c(all_columns, id), sep = "_")[length(all_columns) + 1]
# and also tell user...
if (isTRUE(verbose)) {
insight::format_warning(
sprintf("Value of `id` already exists as column name. ID column was renamed to `%s`.", id)
)
}
}
if (!is.null(id) && join == "bind") {
x[[id]] <- 1
y[[id]] <- 2
}
# check merge columns ("by"-argument) ----------------------
if (join != "bind") {
# we need a value for "by". If not provided, use all shared column names
if (is.null(by)) {
by <- intersect(colnames(x), colnames(y))
}
# If not all column names specified in "by" are present, yield warning
# and use all shared column names
if (!all(by %in% colnames(x)) || !all(by %in% colnames(y))) {
missing_in_x <- setdiff(by, colnames(x))
missing_in_y <- setdiff(by, colnames(y))
stop_message <- c(
"Not all columns specified in `by` were found in the data frames.",
if (length(missing_in_x) > 0L) {
paste0("Following columns are in `by` but absent in `x`: ", text_concatenate(missing_in_x))
},
if (length(missing_in_y) > 0L) {
paste0("Following columns are in `by` but absent in `y`: ", text_concatenate(missing_in_y))
}
)
if (isTRUE(verbose)) {
insight::format_error(stop_message)
}
}
# if still both data frames have no common columns, do a full join
if (!length(by)) {
if (isTRUE(verbose)) {
insight::format_warning(
"Found no matching columns in the data frames. Fully merging both data frames now.",
"Note that this can lead to unintended results, because rows in `x` and `y` are possibly duplicated.",
"You probably want to use `data_merge(x, y, join = \"bind\")` instead."
)
}
by <- NULL
join <- "full"
}
}
# check valid combination of "join" and "by" -----------------------
if (join %in% c("anti", "semi") && (is.null(by) || length(by) != 1)) {
insight::format_error(
sprintf(
"For `join = \"%s\"`, `by` needs to be a name of only one variable that is present in both data frames.",
join
)
)
}
# merge --------------------
# for later sorting
if (join != "bind") {
if (nrow(x) > 0L) {
x$.data_merge_id_x <- seq_len(nrow(x))
}
if (nrow(y) > 0L) {
y$.data_merge_id_y <- (seq_len(nrow(y))) + nrow(x)
}
}
all_columns <- union(colnames(x), colnames(y))
out <- switch(join,
full = merge(x, y, all = TRUE, sort = FALSE, by = by),
left = merge(x, y, all.x = TRUE, sort = FALSE, by = by),
right = merge(x, y, all.y = TRUE, sort = FALSE, by = by),
inner = merge(x, y, sort = FALSE, by = by),
semi = x[x[[by]] %in% y[[by]], , drop = FALSE],
anti = x[!x[[by]] %in% y[[by]], , drop = FALSE],
bind = .bind_data_frames(x, y)
)
# sort rows, add attributes, and return results -------------------------
if (".data_merge_id_x" %in% colnames(out)) {
# for full joins, we have no complete sorting id, but NAs for each
# data frame. we now "merge" the two sorting IDs from each data frame.
if (anyNA(out$.data_merge_id_x) && ".data_merge_id_y" %in% colnames(out)) {
out$.data_merge_id_x[is.na(out$.data_merge_id_x)] <- out$.data_merge_id_y[is.na(out$.data_merge_id_x)]
}
out <- out[order(out$.data_merge_id_x), ]
out$.data_merge_id_x <- NULL
out$.data_merge_id_y <- NULL
}
# try to restore original column order as good as possible. Therefore, we
# first take all column names of the original input data frames, then
# we add all new columns, like duplicated from merging (name.x and name.y,
# if "name" was in both data frames, but not used in "by"), and then do a
# final check that all column names are present in "out" (e.g., "name" would)
# no longer be there if we have "name.x" and "name.y").
all_columns <- c(all_columns, setdiff(colnames(out), all_columns))
all_columns <- all_columns[all_columns %in% colnames(out)]
out <- out[all_columns]
# add back attributes
out <- .replace_attrs(out, attributes(y))
out <- .replace_attrs(out, attributes(x))
for (i in colnames(out)) {
if (is.list(attr_vars[[i]])) {
if (is.list(attributes(out[[i]]))) {
attributes(out[[i]]) <- utils::modifyList(attr_vars[[i]], attributes(out[[i]]))
} else {
attributes(out[[i]]) <- attr_vars[[i]]
}
}
}
class(out) <- unique(c(class_x, "data.frame"))
out
}
#' @rdname data_merge
#' @export
data_merge.list <- function(x, join = "left", by = NULL, id = NULL, verbose = TRUE, ...) {
out <- x[[1]]
df_id <- rep(1, times = nrow(out))
for (i in 2:length(x)) {
out <- data_merge(out, x[[i]], join = join, by = by, id = NULL, verbose = verbose, ...)
df_id <- c(df_id, rep(i, times = nrow(x[[i]])))
}
# we need separate handling for list of data frames and id-variable here
if (!is.null(id) && join == "bind") {
if (id %in% colnames(out)) {
# ensure unique ID
id <- make.unique(c(colnames(out), id), sep = "_")[length(colnames(out)) + 1]
# and also tell user...
if (isTRUE(verbose)) {
insight::format_warning(
sprintf("Value of `id` already exists as column name. ID column was renamed to `%s`.", id)
)
}
}
out[[id]] <- df_id
}
out
}
.bind_data_frames <- function(x, y) {
# merge and sort. "rbind()" is faster than "merge()" if all columns present
if (all(colnames(x) %in% colnames(y)) && ncol(x) == ncol(y)) {
# we may have different column order
out <- rbind(x, y[match(colnames(x), colnames(y))])
} else {
# add ID for merging
if (nrow(x) > 0L) {
x$.data_merge_row <- seq_len(nrow(x))
}
if (nrow(y) > 0L) {
y$.data_merge_row <- (nrow(x) + 1):(nrow(x) + nrow(y))
}
merge_by <- intersect(colnames(x), colnames(y))
out <- merge(x, y, all = TRUE, sort = FALSE, by = merge_by)
}
# for empty df's, merge() may return an empty character vector
# make sure it's a data frame object.
if (!is.data.frame(out)) {
out <- as.data.frame(out)
}
if (".data_merge_row" %in% colnames(out)) {
out <- out[order(out$.data_merge_row), ]
}
out$.data_merge_row <- NULL
out
}
|