1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
|
#' @title Inner, Left, Right, Outer, Semi and Anti Join for Data Tables
#' @name JoinTables
#'
#' @description
#' These helper functions perform join operations on data tables.
#' Most of them are basically one-liners.
#' See \url{https://rpubs.com/ronasta/join_data_tables} for a overview of join operations in
#' data table or alternatively \pkg{dplyr}'s vignette on two table verbs.
#'
#' @param x [\code{\link{data.frame}}]\cr
#' First data.frame to join.
#' @param y [\code{\link{data.frame}}]\cr
#' Second data.frame to join.
#' @param by [\code{character}]\cr
#' Column name(s) of variables used to match rows in \code{x} and \code{y}.
#' If not provided, a heuristic similar to the one described in the \pkg{dplyr} vignette is used:
#' \enumerate{
#' \item If \code{x} is keyed, the existing key will be used if \code{y} has the same column(s).
#' \item If \code{x} is not keyed, the intersect of common columns names is used if not empty.
#' \item Raise an exception.
#' }
#' You may pass a named character vector to merge on columns with different names in \code{x} and
#' \code{y}: \code{by = c("x.id" = "y.id")} will match \code{x}'s \dQuote{x.id} column with \code{y}\'s
#' \dQuote{y.id} column.
#' @return [\code{\link{data.table}}] with key identical to \code{by}.
#' @export
#' @examples
#' \dontshow{ batchtools:::example_push_temp(1) }
#' # Create two tables for demonstration
#' tmp = makeRegistry(file.dir = NA, make.default = FALSE)
#' batchMap(identity, x = 1:6, reg = tmp)
#' x = getJobPars(reg = tmp)
#' y = findJobs(x >= 2 & x <= 5, reg = tmp)
#' y$extra.col = head(letters, nrow(y))
#'
#' # Inner join: similar to intersect(): keep all columns of x and y with common matches
#' ijoin(x, y)
#'
#' # Left join: use all ids from x, keep all columns of x and y
#' ljoin(x, y)
#'
#' # Right join: use all ids from y, keep all columns of x and y
#' rjoin(x, y)
#'
#' # Outer join: similar to union(): keep all columns of x and y with matches in x or y
#' ojoin(x, y)
#'
#' # Semi join: filter x with matches in y
#' sjoin(x, y)
#'
#' # Anti join: filter x with matches not in y
#' ajoin(x, y)
#'
#' # Updating join: Replace values in x with values in y
#' ujoin(x, y)
ijoin = function(x, y, by = NULL) {
x = as.data.table(x)
y = as.data.table(y)
by = guessBy(x, y, by)
setKey(x[y, nomatch = 0L, on = by], by)
}
#' @rdname JoinTables
#' @export
ljoin = function(x, y, by = NULL) {
x = as.data.table(x)
y = as.data.table(y)
by = guessBy(x, y, by)
setKey(y[x, on = by], by)
}
#' @rdname JoinTables
#' @export
rjoin = function(x, y, by = NULL) {
x = as.data.table(x)
y = as.data.table(y)
by = guessBy(x, y, by)
setKey(x[y, on = by], by)
}
#' @rdname JoinTables
#' @export
ojoin = function(x, y, by = NULL) {
x = as.data.table(x)
y = as.data.table(y)
by = guessBy(x, y, by)
res = if (is.null(names(by)))
merge(x, y, all = TRUE, by = by)
else
merge(x, y, all = TRUE, by.x = names(by), by.y = by)
setKey(res, by)
}
#' @rdname JoinTables
#' @export
sjoin = function(x, y, by = NULL) {
x = as.data.table(x)
y = as.data.table(y)
by = guessBy(x, y, by)
w = unique(x[y, on = by, nomatch = 0L, which = TRUE, allow.cartesian = TRUE])
setKey(x[w], by)
}
#' @rdname JoinTables
#' @export
ajoin = function(x, y, by = NULL) {
x = as.data.table(x)
y = as.data.table(y)
by = guessBy(x, y, by)
setKey(x[!y, on = by], by)
}
#' @rdname JoinTables
#' @param all.y [logical(1)]\cr
#' Keep columns of \code{y} which are not in \code{x}?
#' @export
ujoin = function(x, y, all.y = FALSE, by = NULL) {
assertFlag(all.y)
x = if (is.data.table(x)) copy(x) else as.data.table(x)
y = as.data.table(y)
by = guessBy(x, y, by)
cn = chsetdiff(names(y), by)
if (!all.y)
cn = chintersect(names(x), cn)
if (length(cn) == 0L)
return(x)
expr = parse(text = stri_join("`:=`(", stri_flatten(sprintf("%1$s=i.%1$s", cn), ","), ")"))
setKey(x[y, eval(expr), on = by], by)
}
guessBy = function(x, y, by = NULL) {
assertDataFrame(x, min.cols = 1L)
assertDataFrame(y, min.cols = 1L)
if (is.null(by)) {
res = key(x)
if (!is.null(res) && all(res %chin% names(y)))
return(res)
res = chintersect(names(x), names(y))
if (length(res) > 0L)
return(res)
stop("Unable to guess columns to match on. Please specify them explicitly or set keys beforehand.")
} else {
if (is.null(names(by))) {
assertSubset(by, names(x))
} else {
assertSubset(names(by), names(x))
}
assertSubset(by, names(y))
return(by)
}
}
setKey = function(res, by) {
by = names(by) %??% unname(by)
if (!identical(key(res), by))
setkeyv(res, by)
res[]
}
|