File: Joins.R

package info (click to toggle)
r-cran-batchtools 0.9.15%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 1,416 kB
  • sloc: ansic: 172; sh: 156; makefile: 2
file content (169 lines) | stat: -rw-r--r-- 4,703 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#' @title Inner, Left, Right, Outer, Semi and Anti Join for Data Tables
#' @name JoinTables
#'
#' @description
#' These helper functions perform join operations on data tables.
#' Most of them are basically one-liners.
#' See \url{https://rpubs.com/ronasta/join_data_tables} for a overview of join operations in
#' data table or alternatively \pkg{dplyr}'s vignette on two table verbs.
#'
#' @param x [\code{\link{data.frame}}]\cr
#'   First data.frame to join.
#' @param y [\code{\link{data.frame}}]\cr
#'   Second data.frame to join.
#' @param by [\code{character}]\cr
#'   Column name(s) of variables used to match rows in \code{x} and \code{y}.
#'   If not provided, a heuristic similar to the one described in the \pkg{dplyr} vignette is used:
#'   \enumerate{
#'     \item If \code{x} is keyed, the existing key will be used if \code{y} has the same column(s).
#'     \item If \code{x} is not keyed, the intersect of common columns names is used if not empty.
#'     \item Raise an exception.
#'   }
#'   You may pass a named character vector to merge on columns with different names in \code{x} and
#'   \code{y}: \code{by = c("x.id" = "y.id")} will match \code{x}'s \dQuote{x.id} column with \code{y}\'s
#'   \dQuote{y.id} column.
#' @return [\code{\link{data.table}}] with key identical to \code{by}.
#' @export
#' @examples
#' \dontshow{ batchtools:::example_push_temp(1) }
#' # Create two tables for demonstration
#' tmp = makeRegistry(file.dir = NA, make.default = FALSE)
#' batchMap(identity, x = 1:6, reg = tmp)
#' x = getJobPars(reg = tmp)
#' y = findJobs(x >= 2 & x <= 5, reg = tmp)
#' y$extra.col = head(letters, nrow(y))
#'
#' # Inner join: similar to intersect(): keep all columns of x and y with common matches
#' ijoin(x, y)
#'
#' # Left join: use all ids from x, keep all columns of x and y
#' ljoin(x, y)
#'
#' # Right join: use all ids from y, keep all columns of x and y
#' rjoin(x, y)
#'
#' # Outer join: similar to union(): keep all columns of x and y with matches in x or y
#' ojoin(x, y)
#'
#' # Semi join: filter x with matches in y
#' sjoin(x, y)
#'
#' # Anti join: filter x with matches not in y
#' ajoin(x, y)
#'
#' # Updating join: Replace values in x with values in y
#' ujoin(x, y)
ijoin = function(x, y, by = NULL) {
  x = as.data.table(x)
  y = as.data.table(y)
  by = guessBy(x, y, by)

  setKey(x[y, nomatch = 0L, on = by], by)
}

#' @rdname JoinTables
#' @export
ljoin = function(x, y, by = NULL) {
  x = as.data.table(x)
  y = as.data.table(y)
  by = guessBy(x, y, by)

  setKey(y[x, on = by], by)
}

#' @rdname JoinTables
#' @export
rjoin = function(x, y, by = NULL) {
  x = as.data.table(x)
  y = as.data.table(y)
  by = guessBy(x, y, by)

  setKey(x[y, on = by], by)
}

#' @rdname JoinTables
#' @export
ojoin = function(x, y, by = NULL) {
  x = as.data.table(x)
  y = as.data.table(y)
  by = guessBy(x, y, by)

  res = if (is.null(names(by)))
    merge(x, y, all = TRUE, by = by)
  else
    merge(x, y, all = TRUE, by.x = names(by), by.y = by)

  setKey(res, by)
}

#' @rdname JoinTables
#' @export
sjoin = function(x, y, by = NULL) {
  x = as.data.table(x)
  y = as.data.table(y)
  by = guessBy(x, y, by)

  w = unique(x[y, on = by, nomatch = 0L, which = TRUE, allow.cartesian = TRUE])
  setKey(x[w], by)
}

#' @rdname JoinTables
#' @export
ajoin = function(x, y, by = NULL) {
  x = as.data.table(x)
  y = as.data.table(y)
  by = guessBy(x, y, by)

  setKey(x[!y, on = by], by)
}

#' @rdname JoinTables
#' @param all.y [logical(1)]\cr
#'   Keep columns of \code{y} which are not in \code{x}?
#' @export
ujoin = function(x, y, all.y = FALSE, by = NULL) {
  assertFlag(all.y)
  x = if (is.data.table(x)) copy(x) else as.data.table(x)
  y = as.data.table(y)
  by = guessBy(x, y, by)

  cn = chsetdiff(names(y), by)
  if (!all.y)
    cn = chintersect(names(x), cn)
  if (length(cn) == 0L)
    return(x)

  expr = parse(text = stri_join("`:=`(", stri_flatten(sprintf("%1$s=i.%1$s", cn), ","), ")"))
  setKey(x[y, eval(expr), on = by], by)
}

guessBy = function(x, y, by = NULL) {
  assertDataFrame(x, min.cols = 1L)
  assertDataFrame(y, min.cols = 1L)

  if (is.null(by)) {
    res = key(x)
    if (!is.null(res) && all(res %chin% names(y)))
      return(res)

    res = chintersect(names(x), names(y))
    if (length(res) > 0L)
      return(res)
    stop("Unable to guess columns to match on. Please specify them explicitly or set keys beforehand.")
  } else {
    if (is.null(names(by))) {
      assertSubset(by, names(x))
    } else {
      assertSubset(names(by), names(x))
    }
    assertSubset(by, names(y))
    return(by)
  }
}

setKey = function(res, by) {
  by = names(by) %??% unname(by)
  if (!identical(key(res), by))
    setkeyv(res, by)
  res[]
}