File: perFeatureQCMetrics.R

package info (click to toggle)
r-bioc-scuttle 1.0.4%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 728 kB
  • sloc: cpp: 356; sh: 17; makefile: 2
file content (152 lines) | stat: -rw-r--r-- 5,695 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#' Per-feature quality control metrics
#'
#' Compute per-feature quality control metrics for a count matrix or a \linkS4class{SummarizedExperiment}.
#'
#' @param subsets A named list containing one or more vectors 
#' (a character vector of cell names, a logical vector, or a numeric vector of indices),
#' used to identify interesting sample subsets such as negative control wells.
#' @inheritParams perCellQCMetrics
#' @param detection_limit,exprs_values Soft deprecated equivalents to the arguments described above.
#'
#' @return
#' A \linkS4class{DataFrame} of QC statistics where each row corresponds to a row in \code{x}.
#' This contains the following fields:
#' \itemize{
#' \item \code{mean}: numeric, the mean counts for each feature.
#' \item \code{detected}: numeric, the percentage of observations above \code{threshold}.
#' }
#'
#' If \code{flatten=FALSE}, the output DataFrame also contains the \code{subsets} field.
#' This a nested DataFrame containing per-feature QC statistics for each subset of columns.
#'
#' If \code{flatten=TRUE}, \code{subsets} is flattened to remove the hierarchical structure.
#' 
#' @author Aaron Lun
#' 
#' @details
#' This function calculates useful QC metrics for features, including the mean across all cells
#' and the number of expressed features (i.e., counts above the detection limit).
#' 
#' If \code{subsets} is specified, the same statistics are computed for each subset of cells.
#' This is useful for obtaining statistics for cell sets of interest, e.g., negative control wells.
#' These statistics are stored as nested \linkS4class{DataFrame}s in the output.
#' For example, if \code{subsets} contained \code{"empty"} and \code{"cellpool"}, the output would look like:
#' \preformatted{  output 
#'   |-- mean 
#'   |-- detected
#'   +-- subsets
#'       |-- empty
#'       |   |-- mean 
#'       |   |-- detected
#'       |   +-- ratio
#'       +-- cellpool 
#'           |-- mean
#'           |-- detected
#'           +-- ratio
#' }
#' The \code{ratio} field contains the ratio of the mean within each subset to the mean across all cells.
#' 
#' If \code{flatten=TRUE}, the nested DataFrames are flattened by concatenating the column names with underscores.
#' This means that, say, the \code{subsets$empty$mean} nested field becomes the top-level \code{subsets_empty_mean} field.
#' A flattened structure is more convenient for end-users performing interactive analyses,
#' but less convenient for programmatic access as artificial construction of strings is required.
#' @examples
#' example_sce <- mockSCE()
#' stats <- perFeatureQCMetrics(example_sce)
#' stats
#'
#' # With subsets.
#' stats2 <- perFeatureQCMetrics(example_sce, subsets=list(Empty=1:10))
#' stats2
#'
#' @seealso 
#' \code{\link{addPerFeatureQC}}, to add the QC metrics to the row metadata.
#' @export
#' @name perFeatureQCMetrics
NULL

#' @importFrom beachmat rowBlockApply
#' @importFrom S4Vectors DataFrame make_zero_col_DFrame
#' @importFrom BiocParallel bplapply SerialParam
#' @importClassesFrom S4Vectors DFrame 
.per_feature_qc_metrics <- function(x, subsets = NULL, threshold = 0, BPPARAM=SerialParam(), flatten=TRUE,
    detection_limit=NULL) 
{
    threshold <- .replace(threshold, detection_limit)

    if (length(subsets) && is.null(names(subsets))){ 
        stop("'subsets' must be named")
    }
    subsets <- lapply(subsets, FUN = .subset2index, target = x, byrow = FALSE)

    # Computing all QC metrics, with cells split across workers.
    bp.out <- rowBlockApply(x, FUN=.per_feature_qc, cellcon=subsets, limit=threshold, BPPARAM=BPPARAM)

    # Aggregating across cores.
    full.info <- DataFrame(
        mean=unlist(lapply(bp.out, FUN=function(x) x[[1]][[1]])),
        detected=unlist(lapply(bp.out, FUN=function(x) x[[1]][[2]])) * 100,
        row.names=rownames(x)
    )

    # Collecting subset information.
    if (!is.null(subsets)) {
        sub.info <- make_zero_col_DFrame(nrow(x))
        for (i in seq_along(subsets)) {
            sub.out <- DataFrame(
                mean=unlist(lapply(bp.out, FUN=function(x) x[[2]][[i]][[1]])),
                detected=unlist(lapply(bp.out, FUN=function(x) x[[2]][[i]][[2]])) * 100
            )
            sub.out$ratio <- sub.out$mean/full.info$mean
            sub.info[[names(subsets)[i]]] <- sub.out
        }
        full.info$subsets <- sub.info
    }

    if (flatten) {
        full.info <- .flatten_nested_dims(full.info)
    }
    full.info
}

#' @importFrom Matrix rowMeans
#' @importClassesFrom Matrix sparseMatrix
#' @importClassesFrom DelayedArray SparseArraySeed
.per_feature_qc <- function(x, cellcon, limit) {
    if (is(x, "SparseArraySeed")) {
        x <- as(x, "sparseMatrix")
    }

    detected <- x > limit

    full <- list(
        sum=unname(rowMeans(x)),
        detected=unname(rowMeans(detected))
    )

    cellcons <- lapply(cellcon, function(i) {
        # TODO: switch to MatrixGenerics when that finally becomes available.
        list(
            sum=unname(rowMeans(x[,i,drop=FALSE])),
            detected=unname(rowMeans(detected[,i,drop=FALSE]))
        )
    })

    list(full, cellcons)
}

#' @export
#' @rdname perFeatureQCMetrics
setGeneric("perFeatureQCMetrics", function(x, ...) standardGeneric("perFeatureQCMetrics"))

#' @export
#' @rdname perFeatureQCMetrics
setMethod("perFeatureQCMetrics", "ANY", .per_feature_qc_metrics)

#' @export
#' @rdname perFeatureQCMetrics
#' @importFrom SummarizedExperiment assay
setMethod("perFeatureQCMetrics", "SummarizedExperiment", function(x, ..., assay.type="counts", exprs_values=NULL) {
    assay.type <- .replace(assay.type, exprs_values)
    .per_feature_qc_metrics(assay(x, assay.type), ...)
})