1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
|
##' First draft of function to diagnose problems in merges and key variables
##'
##' This is a first effort. It works with 2 data frames and 1 key
##' variable in each. It does not work if the by parameter includes
##' more than one column name (but may work in future). The return is
##' a list which includes full copies of the rows from the data frames
##' in which trouble is observed.
##' @param x data frame
##' @param y data frame
##' @param by Commonly called the "key" variable. A column name to be
##' used for merging (common to both \code{x} and \code{y})
##' @param by.x Column name in \code{x} to be used for merging. If not
##' supplied, then \code{by.x} is assumed to be same as \code{by}.
##' @param by.y Column name in \code{y} to be used for merging. If not
##' supplied, then \code{by.y} is assumed to be same as \code{by}.
##' @param incomparables values in the key (by) variable that are
##' ignored for matching. We default to include these values as
##' incomparables: c(NULL, NA, NaN, Inf, "\\s+", ""). Note this is
##' a larger list of incomparables than assumed by R merge (which
##' assumes only NULL).
##' @return A list of data structures that are displayed for keys and
##' data sets. The return is \code{list(keysBad, keysDuped,
##' unmatched)}. \code{unmatched} is a list with 2 elements, the
##' unmatched cases from \code{x} and \code{y}.
##' @export
##' @author Paul Johnson
##' @examples
##' df1 <- data.frame(id = 1:7, x = rnorm(7))
##' df2 <- data.frame(id = c(2:6, 9:10), x = rnorm(7))
##' mc1 <- mergeCheck(df1, df2, by = "id")
##' ## Use mc1 objects mc1$keysBad, mc1$keysDuped, mc1$unmatched
##' df1 <- data.frame(id = c(1:3, NA, NaN, "", " "), x = rnorm(7))
##' df2 <- data.frame(id = c(2:6, 5:6), x = rnorm(7))
##' mergeCheck(df1, df2, by = "id")
##' df1 <- data.frame(idx = c(1:5, NA, NaN), x = rnorm(7))
##' df2 <- data.frame(idy = c(2:6, 9:10), x = rnorm(7))
##' mergeCheck(df1, df2, by.x = "idx", by.y = "idy")
mergeCheck <- function(x, y, by, by.x = by, by.y = by,
incomparables = c(NULL, NA, NaN, Inf, "\\s+", "")) {
## copied from R base merge function
fix.by <- function(by, df) {
if (is.null(by))
by <- numeric()
by <- as.vector(by)
nc <- ncol(df)
if (is.character(by)) {
poss <- c("row.names", names(df))
if (any(bad <- !charmatch(by, poss, 0L)))
stop(ngettext(sum(bad), "'by' must specify a uniquely valid column",
"'by' must specify uniquely valid columns"),
domain = NA)
by <- match(by, poss) - 1L
}
else if (is.numeric(by)) {
if (any(by < 0L) || any(by > nc))
stop("'by' must match numbers of columns")
}
else if (is.logical(by)) {
if (length(by) != nc)
stop("'by' must match number of columns")
by <- seq_along(by)[by]
}
else stop("'by' must specify one or more columns as numbers, names or logical")
if (any(bad <- is.na(by)))
stop(ngettext(sum(bad), "'by' must specify a uniquely valid column",
"'by' must specify uniquely valid columns"),
domain = NA)
by.integer <- unique(by)
colnames(df)[by.integer]
}
if(missing(by)){
if(missing(by.x) || missing(by.y)) {
MESSG <- "Must specify a column name in by or both by.x and by.y"
}
}
by.x <- fix.by(by.x, x)
by.y <- fix.by(by.y, y)
#get data frame name
label1 <- deparse(substitute(x))
label2 <- deparse(substitute(y))
dlist <- list(x, y)
names(dlist) <- c(label1, label2)
bylist <- list(by.x, by.y)
names(bylist) <- c(label1, label2)
keyEmpty <- function(df, keyName){
df2 <- df[ df[ , keyName] %in% incomparables, , drop = FALSE]
if(NROW(df2) > 0) return(df2) else return(NULL)
}
keyNotUnique <- function(df, keyName){
dupes <- df[df[ , keyName] %in% df[duplicated(df[ , keyName]), keyName], , drop = FALSE]
if(NROW(dupes) > 0) return(dupes) else return(NULL)
}
keysBad <- list()
for(i in names(dlist)) keysBad[[i]] <- keyEmpty(dlist[[i]], keyName = bylist[[i]])
keysDuped <- list()
for(i in names(dlist)) keysDuped[[i]] <- keyNotUnique(dlist[[i]], keyName = bylist[[i]])
unmatched <- list()
unmatched[[label1]] <- x[is.na(match(x[ , bylist[[label1]]], y[ , bylist[[label2]]])), , drop = FALSE]
unmatched[[label2]] <- y[is.na(match(y[ , bylist[[label2]]], x[ , bylist[[label1]]])), , drop = FALSE]
#create a list structure as a returned value
res <- list(keysBad = keysBad, keysDuped = keysDuped, unmatched = unmatched)
attr(res, "bylist") <- bylist
class(res) <- "keycheck"
## if any results are not NULL, yell at user
if(any(unlist(lapply(res, function(yyy) lapply(yyy, function(zz) length(zz) > 0))))){
MESSG <- "Merge difficulties detected\n\n"
cat(MESSG)
print(res)
}
invisible(res)
}
##' Print out the result of mergeCheck function.
##'
##' This is a placeholder for a more elaborate print method
##' to be prepared in the future. Please advise us what might
##' be most helpful.
##' @param x keycheck output from mergeCheck
##' @param ... Other arguments
##' @return None, side effect if print to screen
##' @method print keycheck
##' @export
##' @author Paul Johnson
print.keycheck <- function(x, ...){
if (length(x$keysBad) > 0){
MESSG <- paste("Unacceptable key values\n")
cat(MESSG)
lapply(names(x$keysBad), function(yy){
cat(paste(yy, "\n"))
print(x$keysBad[[yy]])})
}
if (length(x$keysDuped) > 0){
MESSG <- paste("Duplicated key values\n")
cat(MESSG)
lapply(names(x$keysDuped), function(yy){
cat(paste(yy, "\n"))
print(x$keysDuped[[yy]])})
}
MESSG <- paste("Unmatched cases from", paste(names(x$unmatched), collapse = " and "), ":\n")
cat(MESSG)
lapply(names(x$unmatched), function(yy) {
cat(paste(yy, "\n"))
print(x$unmatched[[yy]])
})
}
|