1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
|
##' Remove rows in which the proportion of missing data exceeds
##' a threshold.
##'
##' If cases are mostly missing, delete them. It often happens that
##' when data is imported from other sources, some noise rows exist at
##' the bottom of the input. Anything that is missing in more than,
##' say, 90\% of cases is probably useless information. We invented
##' this to deal with problem that MS Excel users often include a
##' marginal note at the bottom of a spread sheet.
##'
##' @param dframe A data frame or matrix
##' @param pm "proportion missing data" to be tolerated.
##' @param drop Default FALSE: if data frame result is reduced to one
##' row, should R's default drop behavior "demote" this to a
##' column vector.
##' @param verbose Default TRUE. Should a report be printed
##' summarizing information to be delted?
##' @param n Default 25: limit on number of values to print in verbose
##' diagnostic output. If set to NULL or NA, then all of the
##' column values will be printed for the bogus rows.
##' @return a data frame, invisibly
##' @export
##' @author Paul Johnson <pauljohn@@ku.edu>
##' @examples
##' mymat <- matrix(rnorm(10*100), nrow = 10, ncol = 100,
##' dimnames = list(1:10, paste0("x", 1:100)))
##' mymat <- rbind(mymat, c(32, rep(NA, 99)))
##' mymat2 <- deleteBogusRows(mymat)
##' mydf <- as.data.frame(mymat)
##' mydf$someFactor <- factor(sample(c("A", "B"), size = NROW(mydf), replace = TRUE))
##' mydf2 <- deleteBogusRows(mydf, n = "all")
deleteBogusRows <- function (dframe, pm = 0.9, drop = FALSE,
verbose = TRUE, n = 25){
if (is.null(n) || is.na(n)) n <- NCOL(dframe)
if (n > NCOL(dframe)) n <- NCOL(dframe)
rowna <- apply(dframe, 1, function(x){sum(is.na(x))})
badrows <- rowna > pm * NCOL(dframe)
if (any(badrows)){
if (verbose){
cat(paste("deleteBogusRows Diagnostic\n"))
cat(paste("These rows from the data frame: ",
deparse(substitute(dframe)), "\n are being purged: "))
cat(paste(which(badrows), "\n"))
cat(paste("The bad content was\n"))
print(dframe[badrows, 1:n, drop = drop])
}
dframe <- dframe[!badrows, , drop = drop]
return(dframe)
}
print(paste("No bogus rows were found in: ", deparse(substitute(dframe))))
invisible(dframe)
}
##' Remove columns in which the proportion of missing data exceeds
##' a threshold.
##'
##' This is a column version of \code{deleteBogusRows}. Use the pm
##' argument to set the proportion of missing required before a column
##' is flagged for deletion
##'
##' @param dframe A data frame or matrix
##' @param pm "proportion missing data" to be tolerated.
##' @param drop Default FALSE: if data frame result is reduced to one
##' column, should R's default drop behavior "demote" this to a
##' column vector.
##' @param verbose Default TRUE. Should a report be printed
##' summarizing information to be delted?
##' @param n Default 25: limit on number of values to print in
##' diagnostic output. If set to NULL or NA, then all of the
##' column values will be printed for the bogus rows.
##' @return a data frame, invisibly
##' @seealso \code{deleteBogusRows}
##' @export
##' @author Paul Johnson <pauljohn@@ku.edu>
deleteBogusColumns <- function (dframe, pm = 0.9, drop = FALSE,
verbose = TRUE, n = 25)
{
if (is.null(n) || is.na(n)) n <- NROW(dframe)
if (n > NROW(dframe)) n <- NROW(dframe)
colna <- apply(dframe, 2, function(x){sum(is.na(x))})
badcols <- colna > pm * NROW(dframe)
if (any(badcols)){
if (verbose){
cat(paste("deleteBogusColumns Diagnostic\n"))
cat(paste("These columns from the data frame: ", deparse(substitute(dframe)), "\n are being purged: "))
cat(paste(colnames(dframe)[badcols], "\n"))
cat(paste("The bad content was:\n"))
print(dframe[1:n, badcols, drop = drop])
}
newdframe <- dframe[, !badcols, drop = drop]
return(newdframe)
}
print(paste("No bogus columns were found in: ", deparse(substitute(dframe))))
invisible(dframe)
}
## TODO: make deleteBogus a generic, then write a method for data
## frames and a method for data tables.
|