File: dataframe.R

package info (click to toggle)
r-cran-rpf 1.0.11%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,484 kB
  • sloc: cpp: 5,364; sh: 114; ansic: 41; makefile: 2
file content (113 lines) | stat: -rw-r--r-- 4,123 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
##' Order a data.frame by missingness and all columns
##'
##' Completely order all rows in a data.frame.
##' 
##' @param observed a data.frame holding ordered factors in every column
##' @return the sorted order of the rows
##' @examples
##' df <- as.data.frame(matrix(c(sample.int(2, 30, replace=TRUE)), 10, 3))
##' mask <- matrix(c(sample.int(3, 30, replace=TRUE)), 10, 3) == 1
##' df[mask] <- NA
##' df[orderCompletely(df),]
orderCompletely <- function(observed) {
	observedNames <- colnames(observed)
	nacount <- sapply(observedNames, function(x) { sum(is.na(observed[,x])) })
	observedNames <- observedNames[order(nacount, decreasing=TRUE)]
	othervectorsNA <- lapply(observedNames, function(x) {!is.na(observed[,x]) })
	othervectors <- lapply(observedNames, function(x) {observed[,x] })
	args <- c(othervectorsNA, othervectors, 'na.last'=FALSE)
	do.call('order', args)
}

##' Tabulate data.frame rows
##'
##' Like \code{tabulate} but entire rows are the unit of tabulation.
##' The data.frame is not sorted, but must be sorted already.
##'
##' @param observed a sorted data.frame holding ordered factors in every column
##' @seealso \code{\link{orderCompletely}}
##' @examples
##' df <- as.data.frame(matrix(c(sample.int(2, 30, replace=TRUE)), 10, 3))
##' df <- df[orderCompletely(df),]
##' tabulateRows(df)
tabulateRows <- function(observed) {
	selectMissing <- rep(0L, nrow(observed))
	selectDefvars <- rep(0L, nrow(observed))
	threeVectors <- .Call('_rpf_findIdenticalRowsData', observed,
			      selectMissing, selectDefvars, TRUE, TRUE)
	dups <- threeVectors[[1]]
	result <- rep(NA, sum(dups==1L))
	dx <- 1L
	rx <- 1L
	while (dx <= length(dups)) {
		result[rx] <- dups[dx]
		rx <- rx + 1L
		dx <- dx + dups[dx]
	}
	result
}

#' Expand summary table of patterns and frequencies
#'
#' Expand a summary table of unique response patterns to a full sized
#' data-set.
#'
#' @param tabdata An object of class \code{data.frame} with the unique response patterns and the number of frequencies
#' @param freqName Column name containing the frequencies
#' @return Returns a data frame with all the response patterns
#' @author Based on code by Phil Chalmers \email{rphilip.chalmers@@gmail.com}
#' @examples
#' data(LSAT7)
#' expandDataFrame(LSAT7, freqName="freq")
expandDataFrame <- function(tabdata, freqName=NULL) {
	if (is.null(colnames(tabdata))) stop("colnames are required")

    if (missing(freqName)) {
        freqCol <- ncol(tabdata)
	warning(paste("Assuming column", colnames(tabdata)[freqCol], "contains frequencies"))
    } else {
        freqCol <- which(freqName == colnames(tabdata))
        if (length(freqCol) != 1) {
            stop(paste("Frequency column", freqName, "not found"))
        }
    }

    rows <- sum(tabdata[,freqCol])
    indexVector <- rep(NA, rows)
    rx <- 1L
    ix <- 1L
    while (rx <= nrow(tabdata)) {
        indexVector[ix:(ix + tabdata[rx,freqCol] - 1)] <- rx
	ix <- ix + tabdata[rx,freqCol]
        rx <- rx + 1L
    }
    tabdata[indexVector,-freqCol]
}

#' Compress a data frame into unique rows and frequencies
#'
#' Compress a data frame into unique rows and frequency counts.
#'
#' @param tabdata An object of class \code{data.frame}
#' @param freqColName Column name to contain the frequencies
#' @param .asNumeric logical. Whether to cast the frequencies to the numeric type
#' @return Returns a compressed data frame
#' @examples
#' df <- as.data.frame(matrix(c(sample.int(2, 30, replace=TRUE)), 10, 3))
#' compressDataFrame(df)
compressDataFrame <- function(tabdata, freqColName="freq", .asNumeric=FALSE) {
	if (!is.na(match(freqColName, colnames(tabdata)))) {
		# Might be nice to recompress instead of stopping.
		# There might be rows to collapse due to removal
		# of columns.
		stop(paste("Frequency column", freqColName, "already appears as a column:",
			   paste(colnames(tabdata), collapse=", ")))
	}
	tabdata <- tabdata[orderCompletely(tabdata),,drop=FALSE]
	freq <- tabulateRows(tabdata)
	if (.asNumeric) freq <- as.numeric(freq)
	tabdata <- unique(tabdata)
	tabdata <- cbind(tabdata, freq)
	colnames(tabdata)[ncol(tabdata)] <- freqColName
	tabdata
}