File: summarizeAssayByGroup.R

package info (click to toggle)
r-bioc-scuttle 1.0.4%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 728 kB
  • sloc: cpp: 356; sh: 17; makefile: 2
file content (235 lines) | stat: -rw-r--r-- 9,393 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#' Summarize an assay by group 
#' 
#' From an assay matrix, compute summary statistics for groups of cells.
#' A typical example would be to compute various summary statistics for clusters.
#'
#' @param x A numeric matrix containing features in rows and cells in columns.
#' Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix.
#' @param ids A factor (or vector coercible into a factor) specifying the group to which each cell in \code{x} belongs.
#' Alternatively, a \linkS4class{DataFrame} of such vectors or factors, 
#' in which case each unique combination of levels defines a group. 
#' @param subset.row An integer, logical or character vector specifying the features to use.
#' If \code{NULL}, defaults to all features.
#' @param subset.col An integer, logical or character vector specifying the cells to use.
#' Defaults to all cells with non-\code{NA} entries of \code{ids}.
#' @param assay.type A string or integer scalar specifying the assay of \code{x} containing the assay to be summarized.
#' @param store.number String specifying the field of the output \code{\link{colData}} to store the number of cells in each group.
#' If \code{NULL}, nothing is stored.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying whether summation should be parallelized.
#' @param ... For the generics, further arguments to be passed to specific methods.
#' 
#' For the SummarizedExperiment method, further arguments to be passed to the ANY method.
#' @param statistics Character vector specifying the type of statistics to be computed, see Details.
#' @param threshold A numeric scalar specifying the threshold above which a gene is considered to be detected.
#'
#' @return 
#' A SummarizedExperiment is returned with one column per level of \code{ids}.
#' Each entry of the assay contains the sum or average across all cells in a given group (column) for a given feature (row).
#' Columns are ordered by \code{levels(ids)} and the number of cells per level is reported in the \code{"ncells"} column metadata.
#' For DataFrame \code{ids}, each column corresponds to a unique combination of levels (recorded in the \code{\link{colData}}).
#'
#' @details
#' These functions provide a convenient method for summing or averaging expression values across multiple columns for each feature.
#' A typical application would be to sum counts across all cells in each cluster to obtain \dQuote{pseudo-bulk} samples for further analyses, 
#' e.g., differential expression analyses between conditions.
#'
#' For each feature, the chosen assay can be aggregated by:
#' \itemize{
#' \item \code{"sum"}, the sum of all values in each group.
#' This makes the most sense for raw counts, to allow models to account for the mean-variance relationship.
#' \item \code{"mean"}, the mean of all values in each group.
#' This makes the most sense for normalized and/or transformed assays.
#' \item \code{"median"}, the median of all values in each group.
#' This makes the most sense for normalized and/or transformed assays, 
#' usually generated from large counts where discreteness is less of an issue.
#' \item \code{"num.detected"} and \code{"prop.detected"}, the number and proportion of values in each group that are non-zero.#
#' This makes the most sense for raw counts or sparsity-preserving transformations.
#' }
#'
#' Any \code{NA} values in \code{ids} are implicitly ignored and will not be considered during summation.
#' This may be useful for removing undesirable cells by setting their entries in \code{ids} to \code{NA}.
#' Alternatively, we can explicitly select the cells of interest with \code{subset_col}.
#'
#' If \code{ids} is a factor and contains unused levels, they will not be represented as columns in the output.
#' 
#' @author Aaron Lun
#' @name summarizeAssayByGroup
#'
#' @seealso
#' \code{\link{aggregateAcrossCells}}, which also combines information in the \code{\link{colData}} of \code{x}.
#'
#' @examples
#' example_sce <- mockSCE()
#' ids <- sample(LETTERS[1:5], ncol(example_sce), replace=TRUE)
#'
#' out <- summarizeAssayByGroup(example_sce, ids)
#' out
#'
#' batches <- sample(1:3, ncol(example_sce), replace=TRUE)
#' out2 <- summarizeAssayByGroup(example_sce, 
#'       DataFrame(label=ids, batch=batches))
#' head(out2)
NULL

##########################
##########################

#' @importFrom BiocParallel SerialParam 
#' @importFrom SummarizedExperiment SummarizedExperiment
.summarize_assay_by_group <- function(x, ids, subset.row=NULL, subset.col=NULL,
    statistics=c("mean", "sum", "num.detected", "prop.detected", "median"),
    store.number="ncells", threshold=0, BPPARAM=SerialParam()) 
{
    new.ids <- .process_ids(x, ids, subset.col)
    sum.out <- .summarize_assay(x, ids=new.ids, subset.row=subset.row,
        statistics=match.arg(statistics, several.ok=TRUE),
        threshold=threshold, BPPARAM=BPPARAM)

    mat.out <- sum.out$summary
    mapping <- match(colnames(mat.out[[1]]), as.character(new.ids))
    coldata <- .create_coldata(original.ids=ids, mapping=mapping, 
        freq=sum.out$freq, store.number=store.number)

    # Sync'ing the names as coldata is the source of truth.
    for (i in seq_along(mat.out)) {
        colnames(mat.out[[i]]) <- rownames(coldata)
    }

    SummarizedExperiment(mat.out, colData=coldata)
}

#' @importFrom BiocParallel SerialParam 
#' @importFrom beachmat rowBlockApply
.summarize_assay <- function(x, ids, statistics, threshold=0, subset.row=NULL, BPPARAM=SerialParam()) {
    if (!is.null(subset.row)) {
        x <- x[subset.row,,drop=FALSE]
    }

    lost <- is.na(ids)
    ids <- ids[!lost]
    if (any(lost)) {
        x <- x[,!lost,drop=FALSE]
    }

    # Drop unused levels, as the subsequent mapping step to preserve the type of 'ids'
    # in .create_coldata doesn't make sense (as there is no mapping to a concrete observation).
    by.group <- split(seq_along(ids), ids, drop=TRUE)

    out <- rowBlockApply(x, FUN=.summarize_assay_internal, by.group=by.group, 
        statistics=statistics, threshold=threshold, BPPARAM=BPPARAM)

    collected <- do.call(mapply, c(list(FUN=rbind, SIMPLIFY=FALSE, USE.NAMES=FALSE), out))
    names(collected) <- names(out[[1]])

    freq <- lengths(by.group)
    if ("mean" %in% statistics) {
        collected$mean <- t(t(collected$sum)/freq)
    }
    if ("prop.detected" %in% statistics) {
        collected$prop.detected <- t(t(collected$num.detected)/freq)
    }

    list(summary=collected[statistics], freq=freq)
}

#' @importFrom Matrix rowSums
#' @importFrom DelayedMatrixStats rowMedians 
#' @importClassesFrom Matrix sparseMatrix
#' @importClassesFrom DelayedArray SparseArraySeed
.summarize_assay_internal <- function(x, by.group, statistics, threshold) {
    if (is(x, "SparseArraySeed")) {
        x <- as(x, "sparseMatrix")
    }

    collated <- list()
    
    if ("sum" %in% statistics || "mean" %in% statistics) {
        out <- lapply(by.group, function(i) rowSums(x[,i,drop=FALSE]))
        collated$sum <- .cbind_empty(out, x)
    }

    if ("median" %in% statistics) {
        out <- lapply(by.group, function(i) rowMedians(x[,i,drop=FALSE]))
        out <- .cbind_empty(out, x)
        rownames(out) <- rownames(x)
        collated$median <- out
    }

    if ("num.detected" %in% statistics || "prop.detected" %in% statistics) {
        out <- lapply(by.group, function(i) rowSums(x[,i,drop=FALSE] > threshold))
        collated$num.detected <- .cbind_empty(out, x)
    }

    collated
}

.cbind_empty <- function(out, x) {
    if (length(out)) {
        do.call(cbind, out)
    } else {
        as.matrix(x[,0,drop=FALSE])
    }
}

##########################
##########################

#' @importFrom S4Vectors selfmatch
.df_to_factor <- function(ids) {
    o <- order(ids)
    x <- selfmatch(ids[o,,drop=FALSE]) 
    x[o] <- x
    x[Reduce("|", lapply(ids, is.na))] <- NA_integer_
    x
}

#' @importFrom methods is
.has_multi_ids <- function(ids) is(ids, "DataFrame")

.process_ids <- function(x, ids, subset.col) {    
    if (.has_multi_ids(ids)) {
        ids <- .df_to_factor(ids)
    } 
    if (ncol(x)!=length(ids)) {
        stop("length of 'ids' and 'ncol(x)' are not equal")
    }
    if (!is.null(subset.col)) {
        ids[!seq_along(ids) %in% .subset2index(subset.col, x, byrow=FALSE)] <- NA_integer_
    }
    ids
}

#' @importFrom S4Vectors DataFrame
.create_coldata <- function(original.ids, mapping, freq, store.number) {
    if (.has_multi_ids(original.ids)) {
        coldata <- original.ids[mapping,,drop=FALSE]
        rownames(coldata) <- NULL
    } else {
        coldata <- DataFrame(ids=original.ids[mapping])
        rownames(coldata) <- coldata$ids
    }

    if (!is.null(store.number)) {
        coldata[[store.number]] <- unname(freq)
    }
    coldata
}

##########################
##########################

#' @export
#' @rdname summarizeAssayByGroup
setGeneric("summarizeAssayByGroup", function(x, ...) standardGeneric("summarizeAssayByGroup"))

#' @export
#' @rdname summarizeAssayByGroup
#' @importFrom BiocParallel SerialParam
setMethod("summarizeAssayByGroup", "ANY", .summarize_assay_by_group)

#' @export
#' @rdname summarizeAssayByGroup
#' @importFrom SummarizedExperiment assay
setMethod("summarizeAssayByGroup", "SummarizedExperiment", function(x, ..., assay.type="counts") {
    .summarize_assay_by_group(assay(x, assay.type), ...)
})