1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
|
#' Combine multiple sets of marker statistics
#'
#' Combine multiple sets of marker statistics, typically from different tests,
#' into a single \linkS4class{DataFrame} for convenient inspection.
#'
#' @param ... Two or more lists or \linkS4class{List}s produced by \code{\link{findMarkers}} or \code{\link{combineMarkers}}.
#' Each list should contain \linkS4class{DataFrame}s of results, one for each group/cluster of cells.
#'
#' The names of each List should be the same; the universe of genes in each DataFrame should be the same;
#' and the same number of columns in each DataFrame should be named.
#' All elements in \code{...} are also expected to be named.
#' @param repeated Character vector of columns that are present in one or more DataFrames but should only be reported once.
#' Typically used to avoid reporting redundant copies of annotation-related columns.
#' @param sorted Logical scalar indicating whether each output DataFrame should be sorted by some relevant statistic.
#'
#' @return
#' A named List of DataFrames with one DataFrame per group/cluster.
#' Each DataFrame contains statistics from the corresponding entry of each List in \code{...},
#' prefixed with the name of the List.
#' In addition, several combined statistics are reported:
#' \itemize{
#' \item \code{Top}, the largest rank of each gene across all DataFrames for that group.
#' This is only reported if each list in \code{...} was generated with \code{pval.type="any"} in \code{\link{combineMarkers}}.
#' \item \code{p.value}, the largest p-value of each gene across all DataFrames for that group.
#' This is replaced by \code{log.p.value} if p-values in \code{...} are log-transformed.
#' \item \code{FDR}, the BH-adjusted value of \code{p.value}.
#' This is replaced by \code{log.FDR} if p-values in \code{...} are log-transformed.
#' }
#'
#' @details
#' The combined statistics are designed to favor a gene that is highly ranked in each of the individual test results.
#' This is highly conservative and aims to identify robust DE that is significant under all testing schemes.
#'
#' A combined \code{Top} value of T indicates that the gene is among the top T genes of one or more pairwise comparisons
#' in each of the testing schemes.
#' (We can be even more aggressive if the individual results were generated with a larger \code{min.prop} value.)
#' In effect, a gene can only achieve a low \code{Top} value if it is consistently highly ranked in each test.
#' If \code{sorted=TRUE}, this is used to order the genes in the output DataFrame.
#'
#' The combined \code{p.value} is effectively the result of applying an intersection-union test to the per-test results.
#' This will only be low if the gene has a low p-value in each of the test results.
#' If \code{sorted=TRUE} and \code{Top} is not present, this will be used to order the genes in the output DataFrame.
#'
#' @author Aaron Lun
#'
#' @seealso
#' \code{\link{findMarkers}} and \code{\link{combineMarkers}}, to generate elements in \code{...}.
#'
#' @examples
#' library(scuttle)
#' sce <- mockSCE()
#' sce <- logNormCounts(sce)
#'
#' # Any clustering method is okay, only using k-means for convenience.
#' kout <- kmeans(t(logcounts(sce)), centers=4)
#'
#' tout <- findMarkers(sce, groups=kout$cluster, direction="up")
#' wout <- findMarkers(sce, groups=kout$cluster, direction="up", test="wilcox")
#'
#' combined <- multiMarkerStats(t=tout, wilcox=wout)
#' colnames(combined[[1]])
#'
#' @export
#' @importFrom S4Vectors DataFrame SimpleList I
multiMarkerStats <- function(..., repeated=NULL, sorted=TRUE) {
all.methods <- list(...)
nmethods <- length(all.methods)
if (is.null(names(all.methods))) {
stop("elements in '...' must be named")
}
# Checking groups within each element.
all.methods <- .check_element_names(all.methods,
FUN=names, SUB=function(x, i) x[i],
msg="names inside each list")
group.names <- names(all.methods[[1]])
# Iterating over each group and combining the statistics.
output <- vector("list", length(group.names))
names(output) <- group.names
for (i in seq_along(group.names)) {
collected <- vector("list", nmethods)
for (j in seq_len(nmethods)) {
collected[[j]] <- all.methods[[j]][[i]]
}
collected <- .check_element_names(collected,
FUN=rownames, SUB=function(x, i) x[i,,drop=FALSE],
msg=sprintf("row names for '%s'", group.names[i]))
gene.names <- rownames(collected[[1]])
# Pulling out the redundant columns before counting the number of columns.
redundancies <- list()
for (r in repeated) {
curval <- NULL
for (j in seq_len(nmethods)) {
if (!is.null(collected[[j]][[r]])) {
curval <- collected[[j]][[r]]
collected[[j]][[r]] <- NULL
}
}
redundancies[[r]] <- curval
}
ncols <- vapply(collected, ncol, 0L)
if (length(unique(ncols))!=1L) {
stop(sprintf("different numbers of columns for '%s'", group.names[i]))
}
ncols <- ncols[1]
collated <- DataFrame(row.names=gene.names)
if (length(redundancies)) {
redundancies <- do.call(DataFrame, redundancies)
collated <- cbind(collated, redundancies)
}
# Interleave relevant statistics. We assume they're in the same order across DFs.
interleaved <- vector("list", nmethods)
for (j in seq_len(nmethods)) {
current <- lapply(collected[[j]], I)
names(current) <- paste0(names(all.methods)[j], ".", names(current))
interleaved[[j]] <- current
}
flip <- as.integer(matrix(seq_len(nmethods*ncols), nrow=nmethods, byrow=TRUE))
interleaved <- unlist(interleaved, recursive=FALSE)[flip]
interleaved <- do.call(DataFrame, interleaved)
# Computing combined statistics.
all.top <- .extract_columns(collected, "Top")
all.p <- .extract_columns(collected, "p.value")
all.lp <- .extract_columns(collected, "log.p.value")
if (!is.null(all.top)) {
collated$Top <- do.call(pmax, all.top)
}
if (!is.null(all.p)) {
stats <-collated$p.value <- do.call(pmax, all.p)
collated$FDR <- p.adjust(collated$p.value, method="BH")
} else if (!is.null(all.lp)) {
stats <- collated$log.p.value <- do.call(pmax, all.lp)
collated$log.FDR <- .logBH(collated$log.p.value)
} else {
stop("p-value field should be 'p.value' or 'log.p.value'")
}
# Assembling the final result.
collated <- cbind(collated, interleaved)
if (sorted) {
o <- order(if (!is.null(collated$Top)) collated$Top else stats)
collated <- collated[o,,drop=FALSE]
}
output[[i]] <- collated
}
SimpleList(output)
}
.check_element_names <- function(things, FUN, SUB, msg) {
.names <- FUN(things[[1]])
s.names <- sort(.names)
for (i in seq_along(things)) {
cur.names <- FUN(things[[i]])
if (is.null(cur.names)) {
stop(paste(msg, "should be non-NULL"))
}
if (!identical(s.names, sort(FUN(things[[i]])))) {
stop(paste(msg, "should be the same"))
}
things[[i]] <- SUB(things[[i]], .names)
}
things
}
.extract_columns <- function(things, col) {
for (j in seq_along(things)) {
current <- things[[j]]
things[j] <- list(if (col %in% colnames(current)) current[,col] else NULL)
}
decision <- unique(vapply(things, is.null, TRUE))
if (length(decision)!=1L) {
stop(sprintf("either all or no elements should have '%s'", col))
}
if (decision) NULL else things
}
|