1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
|
#' Merge Multiple Data Frames or Data Tables
#'
#' Merges an arbitrarily large series of data frames or data tables containing common \code{id} variables. Information about number of observations and number of unique \code{id}s in individual and final merged datasets is printed. The first data frame/table has special meaning in that all of its observations are kept whether they match \code{id}s in other data frames or not. For all other data frames, by default non-matching observations are dropped. The first data frame is also the one against which counts of unique \code{id}s are compared. Sometimes \code{merge} drops variable attributes such as \code{labels} and \code{units}. These are restored by \code{Merge}.
#'
#' @param \dots two or more dataframes or data tables
#' @param id a formula containing all the identification variables such that the combination of these variables uniquely identifies subjects or records of interest. May be omitted for data tables; in that case the \code{key} function retrieves the id variables.
#' @param all set to \code{FALSE} to drop observations not found in second and later data frames (only applies if not using \code{data.table})
#' @param verbose set to \code{FALSE} to not print information about observations
#' @export
#' @examples
#' \dontrun{
#' a <- data.frame(sid=1:3, age=c(20,30,40))
#' b <- data.frame(sid=c(1,2,2), bp=c(120,130,140))
#' d <- data.frame(sid=c(1,3,4), wt=c(170,180,190))
#' all <- Merge(a, b, d, id = ~ sid)
#' # First file should be the master file and must
#' # contain all ids that ever occur. ids not in the master will
#' # not be merged from other datasets.
#' a <- data.table(a); setkey(a, sid)
#' # data.table also does not allow duplicates without allow.cartesian=TRUE
#' b <- data.table(sid=1:2, bp=c(120,130)); setkey(b, sid)
#' d <- data.table(d); setkey(d, sid)
#' all <- Merge(a, b, d)
#' }
Merge <- function(..., id=NULL, all=TRUE, verbose=TRUE) {
w <- list(...)
nams <- (as.character(sys.call())[-1])[1 : length(w)]
m <- length(nams)
## If argument is a function call, e.g., subset(mydata, age > 20)
## find name of first argument and omit any dollar sign prefix and []
for(i in 1 : m) {
x <- nams[i]
x <- gsub('subset\\(', '', x)
x <- gsub(',.*', '', x)
x <- gsub('\\[.*' , '', x)
nams[i] <- gsub('(.*)\\$(.*)', '\\2', x)
}
d1 <- w[[1]]
idt <- 'data.table' %in% class(d1)
if(idt && ! requireNamespace("data.table", quietly = TRUE))
stop("The 'data.table' package is required to operate on data tables.")
if(length(id)) id <- all.vars(id)
else {
if(! idt) stop('must specify id if not using data.tables')
id <- key(d1)
if(! length(id)) stop('id not given and first data table has no keys')
}
m <- length(w)
va <- n <- nu <- integer(m)
nin1 <- nnin1 <- rep(NA, m)
did <- if(idt) d1[, id, with=FALSE] else d1[id]
idc1 <- unique(as.character(interaction(did)))
id.union <- id.intersection <- idc1
## Unique variables, and their labels and units
uvar <- lab <- un <- character(0)
for(i in 1 : m) {
d <- w[[i]]
nd <- names(d)
if(any(id %nin% nd))
stop(paste('data frame', nams[i], 'does not contain id variables',
paste(id, collapse=', ')))
j <- nd %nin% uvar
uvar <- c(uvar, nd[j])
lab <- c(lab, sapply(d, label)[j])
un <- c(un, sapply(d, units)[j])
idt <- is.data.table(d)
M <- if(i == 1) d
else
merge(M, d, by=id, all.x=TRUE, all.y=all)
did <- if(idt) d[, id, with=FALSE] else d[id]
idc <- unique(as.character(interaction(did)))
di <- dim(d)
va[i] <- di[2]
n [i] <- di[1]
nu[i] <- length(unique(idc))
if(i > 1) {
nin1 [i] <- sum(idc %in% idc1)
nnin1[i] <- sum(idc %nin% idc1)
id.union <- union(id.union, idc)
id.intersection <- intersect(id.intersection, idc)
}
}
## Restore labels and units if needed
nm <- names(M)
names(lab) <- uvar
names(un ) <- uvar
anych <- FALSE
if(any(c(lab, un) != ''))
for(i in 1 : ncol(M)) {
x <- M[[i]]
ni <- nm[i]
changed <- FALSE
if(ni %nin% names(lab))
stop(paste('Unexpected variable:', ni))
if(lab[ni] != '' && ! length(attr(x, 'label'))) {
label(x) <- lab[ni]
changed <- TRUE
}
if(un[ni] != '' && ! length(attr(x, 'units'))) {
units(x) <- un[ni]
changed <- TRUE
}
if(changed) M[[i]] <- x
anych <- anych | changed
}
nams <- c(nams, 'Merged')
va <- c(va, ncol(M))
n <- c(n, nrow(M))
did <- if(is.data.table(M)) M[, id, with=FALSE] else M[id]
idc <- unique(as.character(interaction(did)))
nu <- c(nu, length(unique(idc)))
nin1 <- c(nin1, sum(idc %in% idc1))
nnin1 <- c(nnin1, sum(idc %nin% idc1))
info <- cbind(Vars=va, Obs=n, 'Unique IDs'=nu, 'IDs in #1'=nin1,
'IDs not in #1'=nnin1)
rownames(info) <- nams
if(verbose) {
print(info)
cat('\nNumber of unique IDs in any data frame :', length(id.union), '\n')
cat( 'Number of unique IDs in all data frames:', length(id.intersection),
'\n')
if(anych) cat('\nLabels or units restored\n')
}
attr(M, 'info') <- info
M
}
|