File: sumCountsAcrossFeatures.R

package info (click to toggle)
r-bioc-scuttle 1.0.4%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 728 kB
  • sloc: cpp: 356; sh: 17; makefile: 2
file content (136 lines) | stat: -rw-r--r-- 6,500 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#' Sum counts across feature sets
#' 
#' Sum together expression values (by default, counts) for each feature set in each cell.
#'
#' @param x A numeric matrix of counts containing features in rows and cells in columns.
#' Alternatively, a \linkS4class{SummarizedExperiment} object containing such a count matrix.
#' @param ids A factor of length \code{nrow(x)}, specifying the set to which each feature in \code{x} belongs.
#'
#' Alternatively, a list of integer or character vectors, where each vector specifies the indices or names of features in a set.
#' Logical vectors are also supported.
#' @param average Logical scalar indicating whether the average should be computed instead of the sum.
#' @param subset.row An integer, logical or character vector specifying the features to use.
#' Defaults to all features.
#' @param subset.col An integer, logical or character vector specifying the cells to use.
#' Defaults to all cells with non-\code{NA} entries of \code{ids}.
#' @param assay.type A string or integer scalar specifying the assay of \code{x} containing the matrix of counts
#' (or any other expression quantity that can be meaningfully summed).
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying whether summation should be parallelized.
#' @param ... For the \code{sumCountsAcrossFeatures} generic, further arguments to be passed to specific methods.
#' 
#' For the SummarizedExperiment method, further arguments to be passed to the ANY method.
#' @param subset_row,subset_col,exprs_values Soft-deprecated equivalents of the arguments described above.
#'
#' @return 
#' A count matrix is returned with one row per level of \code{ids}.
#' In each cell, counts for all features in the same set are summed together (or averaged, if \code{average=TRUE}).
#' Rows are ordered according to \code{levels(ids)}.
#'
#' @details
#' This function provides a convenient method for aggregating counts across multiple rows for each cell.
#' Several possible applications are listed below:
#' \itemize{
#' \item Using a list of genes in \code{ids}, we can obtain a summary expression value for all genes in one or more gene sets.
#' This allows the activity of various pathways to be compared across cells.
#' \item Genes with multiple mapping locations in the reference will often manifest as multiple rows with distinct Ensembl/Entrez IDs.
#' These counts can be aggregated into a single feature by setting the shared identifier (usually the gene symbol) as \code{ids}.
#' \item It is theoretically possible to aggregate transcript-level counts to gene-level counts with this function.
#' However, it is often better to do so with dedicated functions (e.g., from the \pkg{tximport} or \pkg{tximeta} packages) that account for differences in length across isoforms.
#' }
#'
#' The behaviour of this function is equivalent to that of \code{\link{rowsum}}.
#' However, this function can operate on any matrix representation in \code{object},
#' and can do so in a parallelized manner for large matrices without resorting to block processing.
#'
#' If \code{ids} is a factor, any \code{NA} values are implicitly ignored and will not be considered or reported.
#' This may be useful, e.g., to remove undesirable feature sets by setting their entries in \code{ids} to \code{NA}.
#'
#' Setting \code{average=TRUE} will compute the average in each set rather than the sum.
#' This is particularly useful if \code{x} contains expression values that have already been normalized in some manner,
#' as computing the average avoids another round of normalization to account for differences in the size of each set.
#'
#' @author Aaron Lun
#' @name sumCountsAcrossFeatures
#'
#' @seealso 
#' \code{\link{aggregateAcrossFeatures}}, to perform additional aggregation of row-level metadata.
#'
#' \code{\link{numDetectedAcrossFeatures}}, to compute the number of detected features per cell.
#'
#' @examples
#' example_sce <- mockSCE()
#' ids <- sample(LETTERS, nrow(example_sce), replace=TRUE)
#' out <- sumCountsAcrossFeatures(example_sce, ids)
#' str(out)
NULL

#' @importFrom BiocParallel SerialParam 
.sum_counts_across_features <- function(x, ids, subset.row=NULL, subset.col=NULL, 
    average=FALSE, BPPARAM=SerialParam(), subset_row=NULL, subset_col=NULL) 
{
    subset.row <- .replace(subset.row, subset_row)
    subset.col <- .replace(subset.col, subset_col)
    .sum_across_features(x, ids, subset.row=subset.row, subset.col=subset.col, average=average, BPPARAM=BPPARAM)
}

#' @importFrom BiocParallel SerialParam
#' @importFrom beachmat colBlockApply
.sum_across_features <- function(x, ids, subset.row=NULL, subset.col=NULL, average=FALSE, BPPARAM=SerialParam(), modifier=NULL) {
    if (is.list(ids)) {
        ids <- lapply(ids, FUN=.subset2index, target=x, byrow=TRUE)
        runs <- lengths(ids)
        genes <- unlist(ids)
        names <- names(ids)
    } else {
        if (length(ids)!=nrow(x)) {
            stop("'ids' should be of length equal to 'nrow(x)'")            
        }
        ids <- factor(ids)
        genes <- order(ids, na.last=NA)
        runs <- as.integer(table(ids))
        names <- levels(ids)
    }

    if (!is.null(subset.row)) {
        subset.row <- .subset2index(subset.row, x, byrow=TRUE)
        keep <- genes %in% subset.row
        genes <- genes[keep]
        kept <- findInterval(which(keep), cumsum(runs), left.open=TRUE)
        runs <- tabulate(kept+1L, nbins=length(runs))
    }

    if (!is.null(subset.col)) {
        x <- x[,subset.col,drop=FALSE]
    }
    if (!is.null(modifier)) {
        x <- modifier(x)
    }

    out_list <- colBlockApply(x, BPPARAM=BPPARAM, FUN=sum_row_counts, genes=genes, runs=runs)
    out <- do.call(cbind, out_list)
    rownames(out) <- names
    colnames(out) <- colnames(x)

    if (average) {
        out <- out/runs
    }

    out
}

#' @export
#' @rdname sumCountsAcrossFeatures
setGeneric("sumCountsAcrossFeatures", function(x, ...) standardGeneric("sumCountsAcrossFeatures"))

#' @export
#' @rdname sumCountsAcrossFeatures
setMethod("sumCountsAcrossFeatures", "ANY", .sum_counts_across_features)

#' @export
#' @rdname sumCountsAcrossFeatures
#' @importFrom SummarizedExperiment assay
#' @importClassesFrom SummarizedExperiment SummarizedExperiment
setMethod("sumCountsAcrossFeatures", "SummarizedExperiment", function(x, ..., assay.type="counts", exprs_values=NULL) {
    assay.type <- .replace(assay.type, exprs_values)
    .sum_counts_across_features(assay(x, assay.type), ...)    
})