1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235
|
#' Summarize an assay by group
#'
#' From an assay matrix, compute summary statistics for groups of cells.
#' A typical example would be to compute various summary statistics for clusters.
#'
#' @param x A numeric matrix containing features in rows and cells in columns.
#' Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix.
#' @param ids A factor (or vector coercible into a factor) specifying the group to which each cell in \code{x} belongs.
#' Alternatively, a \linkS4class{DataFrame} of such vectors or factors,
#' in which case each unique combination of levels defines a group.
#' @param subset.row An integer, logical or character vector specifying the features to use.
#' If \code{NULL}, defaults to all features.
#' @param subset.col An integer, logical or character vector specifying the cells to use.
#' Defaults to all cells with non-\code{NA} entries of \code{ids}.
#' @param assay.type A string or integer scalar specifying the assay of \code{x} containing the assay to be summarized.
#' @param store.number String specifying the field of the output \code{\link{colData}} to store the number of cells in each group.
#' If \code{NULL}, nothing is stored.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying whether summation should be parallelized.
#' @param ... For the generics, further arguments to be passed to specific methods.
#'
#' For the SummarizedExperiment method, further arguments to be passed to the ANY method.
#' @param statistics Character vector specifying the type of statistics to be computed, see Details.
#' @param threshold A numeric scalar specifying the threshold above which a gene is considered to be detected.
#'
#' @return
#' A SummarizedExperiment is returned with one column per level of \code{ids}.
#' Each entry of the assay contains the sum or average across all cells in a given group (column) for a given feature (row).
#' Columns are ordered by \code{levels(ids)} and the number of cells per level is reported in the \code{"ncells"} column metadata.
#' For DataFrame \code{ids}, each column corresponds to a unique combination of levels (recorded in the \code{\link{colData}}).
#'
#' @details
#' These functions provide a convenient method for summing or averaging expression values across multiple columns for each feature.
#' A typical application would be to sum counts across all cells in each cluster to obtain \dQuote{pseudo-bulk} samples for further analyses,
#' e.g., differential expression analyses between conditions.
#'
#' For each feature, the chosen assay can be aggregated by:
#' \itemize{
#' \item \code{"sum"}, the sum of all values in each group.
#' This makes the most sense for raw counts, to allow models to account for the mean-variance relationship.
#' \item \code{"mean"}, the mean of all values in each group.
#' This makes the most sense for normalized and/or transformed assays.
#' \item \code{"median"}, the median of all values in each group.
#' This makes the most sense for normalized and/or transformed assays,
#' usually generated from large counts where discreteness is less of an issue.
#' \item \code{"num.detected"} and \code{"prop.detected"}, the number and proportion of values in each group that are non-zero.#
#' This makes the most sense for raw counts or sparsity-preserving transformations.
#' }
#'
#' Any \code{NA} values in \code{ids} are implicitly ignored and will not be considered during summation.
#' This may be useful for removing undesirable cells by setting their entries in \code{ids} to \code{NA}.
#' Alternatively, we can explicitly select the cells of interest with \code{subset_col}.
#'
#' If \code{ids} is a factor and contains unused levels, they will not be represented as columns in the output.
#'
#' @author Aaron Lun
#' @name summarizeAssayByGroup
#'
#' @seealso
#' \code{\link{aggregateAcrossCells}}, which also combines information in the \code{\link{colData}} of \code{x}.
#'
#' @examples
#' example_sce <- mockSCE()
#' ids <- sample(LETTERS[1:5], ncol(example_sce), replace=TRUE)
#'
#' out <- summarizeAssayByGroup(example_sce, ids)
#' out
#'
#' batches <- sample(1:3, ncol(example_sce), replace=TRUE)
#' out2 <- summarizeAssayByGroup(example_sce,
#' DataFrame(label=ids, batch=batches))
#' head(out2)
NULL
##########################
##########################
#' @importFrom BiocParallel SerialParam
#' @importFrom SummarizedExperiment SummarizedExperiment
.summarize_assay_by_group <- function(x, ids, subset.row=NULL, subset.col=NULL,
statistics=c("mean", "sum", "num.detected", "prop.detected", "median"),
store.number="ncells", threshold=0, BPPARAM=SerialParam())
{
new.ids <- .process_ids(x, ids, subset.col)
sum.out <- .summarize_assay(x, ids=new.ids, subset.row=subset.row,
statistics=match.arg(statistics, several.ok=TRUE),
threshold=threshold, BPPARAM=BPPARAM)
mat.out <- sum.out$summary
mapping <- match(colnames(mat.out[[1]]), as.character(new.ids))
coldata <- .create_coldata(original.ids=ids, mapping=mapping,
freq=sum.out$freq, store.number=store.number)
# Sync'ing the names as coldata is the source of truth.
for (i in seq_along(mat.out)) {
colnames(mat.out[[i]]) <- rownames(coldata)
}
SummarizedExperiment(mat.out, colData=coldata)
}
#' @importFrom BiocParallel SerialParam
#' @importFrom beachmat rowBlockApply
.summarize_assay <- function(x, ids, statistics, threshold=0, subset.row=NULL, BPPARAM=SerialParam()) {
if (!is.null(subset.row)) {
x <- x[subset.row,,drop=FALSE]
}
lost <- is.na(ids)
ids <- ids[!lost]
if (any(lost)) {
x <- x[,!lost,drop=FALSE]
}
# Drop unused levels, as the subsequent mapping step to preserve the type of 'ids'
# in .create_coldata doesn't make sense (as there is no mapping to a concrete observation).
by.group <- split(seq_along(ids), ids, drop=TRUE)
out <- rowBlockApply(x, FUN=.summarize_assay_internal, by.group=by.group,
statistics=statistics, threshold=threshold, BPPARAM=BPPARAM)
collected <- do.call(mapply, c(list(FUN=rbind, SIMPLIFY=FALSE, USE.NAMES=FALSE), out))
names(collected) <- names(out[[1]])
freq <- lengths(by.group)
if ("mean" %in% statistics) {
collected$mean <- t(t(collected$sum)/freq)
}
if ("prop.detected" %in% statistics) {
collected$prop.detected <- t(t(collected$num.detected)/freq)
}
list(summary=collected[statistics], freq=freq)
}
#' @importFrom Matrix rowSums
#' @importFrom DelayedMatrixStats rowMedians
#' @importClassesFrom Matrix sparseMatrix
#' @importClassesFrom DelayedArray SparseArraySeed
.summarize_assay_internal <- function(x, by.group, statistics, threshold) {
if (is(x, "SparseArraySeed")) {
x <- as(x, "sparseMatrix")
}
collated <- list()
if ("sum" %in% statistics || "mean" %in% statistics) {
out <- lapply(by.group, function(i) rowSums(x[,i,drop=FALSE]))
collated$sum <- .cbind_empty(out, x)
}
if ("median" %in% statistics) {
out <- lapply(by.group, function(i) rowMedians(x[,i,drop=FALSE]))
out <- .cbind_empty(out, x)
rownames(out) <- rownames(x)
collated$median <- out
}
if ("num.detected" %in% statistics || "prop.detected" %in% statistics) {
out <- lapply(by.group, function(i) rowSums(x[,i,drop=FALSE] > threshold))
collated$num.detected <- .cbind_empty(out, x)
}
collated
}
.cbind_empty <- function(out, x) {
if (length(out)) {
do.call(cbind, out)
} else {
as.matrix(x[,0,drop=FALSE])
}
}
##########################
##########################
#' @importFrom S4Vectors selfmatch
.df_to_factor <- function(ids) {
o <- order(ids)
x <- selfmatch(ids[o,,drop=FALSE])
x[o] <- x
x[Reduce("|", lapply(ids, is.na))] <- NA_integer_
x
}
#' @importFrom methods is
.has_multi_ids <- function(ids) is(ids, "DataFrame")
.process_ids <- function(x, ids, subset.col) {
if (.has_multi_ids(ids)) {
ids <- .df_to_factor(ids)
}
if (ncol(x)!=length(ids)) {
stop("length of 'ids' and 'ncol(x)' are not equal")
}
if (!is.null(subset.col)) {
ids[!seq_along(ids) %in% .subset2index(subset.col, x, byrow=FALSE)] <- NA_integer_
}
ids
}
#' @importFrom S4Vectors DataFrame
.create_coldata <- function(original.ids, mapping, freq, store.number) {
if (.has_multi_ids(original.ids)) {
coldata <- original.ids[mapping,,drop=FALSE]
rownames(coldata) <- NULL
} else {
coldata <- DataFrame(ids=original.ids[mapping])
rownames(coldata) <- coldata$ids
}
if (!is.null(store.number)) {
coldata[[store.number]] <- unname(freq)
}
coldata
}
##########################
##########################
#' @export
#' @rdname summarizeAssayByGroup
setGeneric("summarizeAssayByGroup", function(x, ...) standardGeneric("summarizeAssayByGroup"))
#' @export
#' @rdname summarizeAssayByGroup
#' @importFrom BiocParallel SerialParam
setMethod("summarizeAssayByGroup", "ANY", .summarize_assay_by_group)
#' @export
#' @rdname summarizeAssayByGroup
#' @importFrom SummarizedExperiment assay
setMethod("summarizeAssayByGroup", "SummarizedExperiment", function(x, ..., assay.type="counts") {
.summarize_assay_by_group(assay(x, assay.type), ...)
})
|