File: Merge.r

package info (click to toggle)
hmisc 5.2-5-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 4,044 kB
  • sloc: asm: 28,907; f90: 590; ansic: 415; xml: 160; fortran: 75; makefile: 2
file content (130 lines) | stat: -rw-r--r-- 5,282 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#' Merge Multiple Data Frames or Data Tables
#'
#' Merges an arbitrarily large series of data frames or data tables containing common \code{id} variables.  Information about number of observations and number of unique \code{id}s in individual and final merged datasets is printed.  The first data frame/table has special meaning in that all of its observations are kept whether they match \code{id}s in other data frames or not.  For all other data frames, by default non-matching observations are dropped.  The first data frame is also the one against which counts of unique \code{id}s are compared.  Sometimes \code{merge} drops variable attributes such as \code{labels} and \code{units}.  These are restored by \code{Merge}.
#'
#' @param \dots two or more dataframes or data tables
#' @param id a formula containing all the identification variables such that the combination of these variables uniquely identifies subjects or records of interest.  May be omitted for data tables; in that case the \code{key} function retrieves the id variables.
#' @param all set to \code{FALSE} to drop observations not found in second and later data frames (only applies if not using \code{data.table})
#' @param verbose set to \code{FALSE} to not print information about observations
#' @export
#' @examples
#' \dontrun{
#' a <- data.frame(sid=1:3, age=c(20,30,40))
#' b <- data.frame(sid=c(1,2,2), bp=c(120,130,140))
#' d <- data.frame(sid=c(1,3,4), wt=c(170,180,190))
#' all <- Merge(a, b, d, id = ~ sid)
#' # First file should be the master file and must
#' # contain all ids that ever occur.  ids not in the master will
#' # not be merged from other datasets.
#' a <- data.table(a); setkey(a, sid)
#' # data.table also does not allow duplicates without allow.cartesian=TRUE
#' b <- data.table(sid=1:2, bp=c(120,130)); setkey(b, sid)
#' d <- data.table(d); setkey(d, sid)
#' all <- Merge(a, b, d)
#' }

Merge <- function(..., id=NULL, all=TRUE, verbose=TRUE) {
  
  w <- list(...)
  nams <- (as.character(sys.call())[-1])[1 : length(w)]
  m <- length(nams)
  ## If argument is a function call, e.g., subset(mydata, age > 20)
  ## find name of first argument and omit any dollar sign prefix and []
  for(i in 1 : m) {
    x <-       nams[i]
    x <-       gsub('subset\\(',   '', x)
    x <-       gsub(',.*',         '', x)
    x <-       gsub('\\[.*'  ,     '', x)
    nams[i] <- gsub('(.*)\\$(.*)', '\\2', x)
  }
  d1   <- w[[1]]
  idt <- 'data.table' %in% class(d1)
  if(idt && ! requireNamespace("data.table", quietly = TRUE))
    stop("The 'data.table' package is required to operate on data tables.")

  if(length(id)) id <- all.vars(id)
  else {
    if(! idt) stop('must specify id if not using data.tables')
    id <- key(d1)
    if(! length(id)) stop('id not given and first data table has no keys')
    }
  m <- length(w)
  va <- n <- nu <- integer(m)
  nin1 <- nnin1 <- rep(NA, m)
  did <- if(idt) d1[, id, with=FALSE] else d1[id]
  idc1 <- unique(as.character(interaction(did)))
  id.union <- id.intersection <- idc1
  ## Unique variables, and their labels and units
  uvar <- lab <- un <- character(0)
  for(i in 1 : m) {
    d <- w[[i]]
    nd <- names(d)
    if(any(id %nin% nd))
      stop(paste('data frame', nams[i], 'does not contain id variables',
                 paste(id, collapse=', ')))
    j <- nd %nin% uvar
    uvar <- c(uvar, nd[j])
    lab  <- c(lab,  sapply(d, label)[j])
    un   <- c(un,   sapply(d, units)[j])
    idt  <- is.data.table(d)
    M <- if(i == 1) d
    else
      merge(M, d, by=id, all.x=TRUE, all.y=all)
    did   <- if(idt) d[, id, with=FALSE] else d[id]
    idc   <- unique(as.character(interaction(did)))
    di    <- dim(d)
    va[i] <- di[2]
    n [i] <- di[1]
    nu[i] <- length(unique(idc))
    if(i > 1) {
      nin1 [i] <- sum(idc %in%  idc1)
      nnin1[i] <- sum(idc %nin% idc1)
      id.union <- union(id.union, idc)
      id.intersection <- intersect(id.intersection, idc)
    }
  }
  ## Restore labels and units if needed
  nm <- names(M)
  names(lab) <- uvar
  names(un ) <- uvar
  anych <- FALSE
  if(any(c(lab, un) != ''))
    for(i in 1 : ncol(M)) {
      x  <- M[[i]]
      ni <- nm[i]
      changed <- FALSE
      if(ni %nin% names(lab))
        stop(paste('Unexpected variable:', ni))
      if(lab[ni] != '' && ! length(attr(x, 'label'))) {
        label(x) <- lab[ni]
        changed <- TRUE
      }
      if(un[ni] != '' && ! length(attr(x, 'units'))) {
        units(x) <- un[ni]
        changed <- TRUE
      }
      if(changed) M[[i]] <- x
      anych <- anych | changed
    }
  
  nams  <- c(nams, 'Merged')
  va    <- c(va, ncol(M))
  n     <- c(n, nrow(M))
  did   <- if(is.data.table(M)) M[, id, with=FALSE] else M[id]
  idc   <- unique(as.character(interaction(did)))
  nu    <- c(nu, length(unique(idc)))
  nin1  <- c(nin1,  sum(idc %in%  idc1))
  nnin1 <- c(nnin1, sum(idc %nin% idc1))
  info  <- cbind(Vars=va, Obs=n, 'Unique IDs'=nu, 'IDs in #1'=nin1,
                 'IDs not in #1'=nnin1)
  rownames(info) <- nams
  if(verbose) {
    print(info)
    cat('\nNumber of unique IDs in any data frame :', length(id.union), '\n')
    cat(  'Number of unique IDs in all data frames:', length(id.intersection),
        '\n')
    if(anych) cat('\nLabels or units restored\n')
  }
  attr(M, 'info') <- info
  M
}