File: deleteBogusRows.R

package info (click to toggle)
r-cran-kutils 1.73%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,648 kB
  • sloc: sh: 13; makefile: 2
file content (100 lines) | stat: -rw-r--r-- 4,335 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
##' Remove rows in which the proportion of missing data exceeds
##' a threshold.
##'
##' If cases are mostly missing, delete them. It often happens that
##' when data is imported from other sources, some noise rows exist at
##' the bottom of the input.  Anything that is missing in more than,
##' say, 90\% of cases is probably useless information.  We invented
##' this to deal with problem that MS Excel users often include a
##' marginal note at the bottom of a spread sheet.
##'
##' @param dframe A data frame or matrix
##' @param pm "proportion missing data" to be tolerated.
##' @param drop Default FALSE: if data frame result is reduced to one
##'     row, should R's default drop behavior "demote" this to a
##'     column vector.
##' @param verbose Default TRUE. Should a report be printed
##'     summarizing information to be delted?
##' @param n Default 25: limit on number of values to print in verbose
##'     diagnostic output. If set to NULL or NA, then all of the
##'     column values will be printed for the bogus rows.
##' @return a data frame, invisibly
##' @export
##' @author Paul Johnson <pauljohn@@ku.edu>
##' @examples
##' mymat <- matrix(rnorm(10*100), nrow = 10, ncol = 100,
##'                dimnames = list(1:10, paste0("x", 1:100)))
##' mymat <- rbind(mymat, c(32, rep(NA, 99)))
##' mymat2 <- deleteBogusRows(mymat)
##' mydf <- as.data.frame(mymat)
##' mydf$someFactor <- factor(sample(c("A", "B"), size = NROW(mydf), replace = TRUE))
##' mydf2 <- deleteBogusRows(mydf, n = "all")
deleteBogusRows <- function (dframe, pm = 0.9, drop = FALSE,
                             verbose = TRUE, n = 25){
    if (is.null(n) || is.na(n)) n <- NCOL(dframe)
    if (n > NCOL(dframe)) n <- NCOL(dframe)
    rowna <- apply(dframe, 1, function(x){sum(is.na(x))})
    badrows <- rowna > pm * NCOL(dframe)
    if (any(badrows)){
        if (verbose){
            cat(paste("deleteBogusRows Diagnostic\n"))
            cat(paste("These rows from the data frame: ",
                      deparse(substitute(dframe)), "\n are being purged: "))
            cat(paste(which(badrows), "\n"))
            cat(paste("The bad content was\n"))
            print(dframe[badrows, 1:n, drop = drop])
        }
        dframe <- dframe[!badrows, , drop = drop]
        return(dframe)
    }
    print(paste("No bogus rows were found in: ", deparse(substitute(dframe))))
    invisible(dframe)
}


##' Remove columns in which the proportion of missing data exceeds
##' a threshold.
##'
##' This is a column version of \code{deleteBogusRows}. Use the pm
##' argument to set the proportion of missing required before a column
##' is flagged for deletion
##' 
##' @param dframe A data frame or matrix
##' @param pm "proportion missing data" to be tolerated.
##' @param drop Default FALSE: if data frame result is reduced to one
##'     column, should R's default drop behavior "demote" this to a
##'     column vector.
##' @param verbose Default TRUE. Should a report be printed
##'     summarizing information to be delted?
##' @param n Default 25: limit on number of values to print in
##'     diagnostic output. If set to NULL or NA, then all of the
##'     column values will be printed for the bogus rows.
##' @return a data frame, invisibly
##' @seealso \code{deleteBogusRows}
##' @export
##' @author Paul Johnson <pauljohn@@ku.edu>
deleteBogusColumns <- function (dframe, pm = 0.9, drop = FALSE,
                                verbose = TRUE, n = 25)
{
    if (is.null(n) || is.na(n)) n <- NROW(dframe)
    if (n > NROW(dframe)) n <- NROW(dframe)
    colna <- apply(dframe, 2, function(x){sum(is.na(x))})
    badcols <- colna > pm * NROW(dframe)
    if (any(badcols)){
        if (verbose){
            cat(paste("deleteBogusColumns Diagnostic\n"))
            cat(paste("These columns from the data frame: ", deparse(substitute(dframe)), "\n are being purged: "))
            cat(paste(colnames(dframe)[badcols], "\n"))
            cat(paste("The bad content was:\n"))
            print(dframe[1:n, badcols, drop = drop])
        }
        newdframe <- dframe[, !badcols, drop = drop]
        return(newdframe)
    }
    print(paste("No bogus columns were found in: ", deparse(substitute(dframe))))
    invisible(dframe)
}


## TODO: make deleteBogus a generic, then write a method for data
## frames and a method for data tables.