File: perCellQCMetrics.R

package info (click to toggle)
r-bioc-scuttle 1.0.4%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 728 kB
  • sloc: cpp: 356; sh: 17; makefile: 2
file content (295 lines) | stat: -rw-r--r-- 12,112 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
#' Per-cell quality control metrics
#'
#' Compute per-cell quality control metrics for a count matrix or a \linkS4class{SingleCellExperiment}.
#'
#' @param x A numeric matrix of counts with cells in columns and features in rows.
#' 
#' Alternatively, a \linkS4class{SummarizedExperiment} or \linkS4class{SingleCellExperiment} object containing such a matrix.
#' @param subsets A named list containing one or more vectors 
#' (a character vector of feature names, a logical vector, or a numeric vector of indices),
#' used to identify interesting feature subsets such as ERCC spike-in transcripts or mitochondrial genes. 
#' @param percent.top An integer vector specifying the size(s) of the top set of high-abundance genes.
#' Used to compute the percentage of library size occupied by the most highly expressed genes in each cell.
#' @param threshold A numeric scalar specifying the threshold above which a gene is considered to be detected.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed.
#' @param ... For the generic, further arguments to pass to specific methods.
#' 
#' For the SummarizedExperiment and SingleCellExperiment methods, further arguments to pass to the ANY method.
#' @param assay.type A string or integer scalar indicating which \code{assays} in the \code{x} contains the count matrix.
#' @param use.altexps Logical scalar indicating whether QC statistics should be computed for alternative Experiments in \code{x}.
#' If \code{TRUE}, statistics are computed for all alternative experiments. 
#'
#' Alternatively, an integer or character vector specifying the alternative Experiments to use to compute QC statistics.
#' 
#' Alternatively \code{NULL}, in which case alternative experiments are not used.
#' @param flatten Logical scalar indicating whether the nested \linkS4class{DataFrame}s in the output should be flattened.
#' @param percent_top,detection_limit,exprs_values,use_altexps Soft deprecated equivalents to the arguments described above.
#'
#' @return
#' A \linkS4class{DataFrame} of QC statistics where each row corresponds to a column in \code{x}.
#' This contains the following fields:
#' \itemize{
#' \item \code{sum}: numeric, the sum of counts for each cell.
#' \item \code{detected}: numeric, the number of observations above \code{threshold}.
#' }
#'
#' If \code{flatten=FALSE}, the DataFrame will contain the additional columns:
#' \itemize{
#' \item \code{percent.top}: numeric matrix, the percentage of counts assigned to the top most highly expressed genes.
#' Each column of the matrix corresponds to an entry of \code{percent.top}, sorted in increasing order.
#' \item \code{subsets}: A nested DataFrame containing statistics for each subset, see Details.
#' \item \code{altexps}: A nested DataFrame containing statistics for each alternative experiment, see Details.
#' This is only returned for the SingleCellExperiment method.
#' \item \code{total}: numeric, the total sum of counts for each cell across main and alternative Experiments.
#' This is only returned for the SingleCellExperiment method.
#' }
#'
#' If \code{flatten=TRUE}, nested matrices and DataFrames are flattened to remove the hierarchical structure from the output DataFrame.
#' 
#' @author Aaron Lun
#' 
#' @details
#' This function calculates useful QC metrics for identification and removal of potentially problematic cells.
#' Obvious per-cell metrics are the sum of counts (i.e., the library size) and the number of detected features.
#' The percentage of counts in the top features also provides a measure of library complexity.
#' 
#' If \code{subsets} is specified, these statistics are also computed for each subset of features.
#' This is useful for investigating gene sets of interest, e.g., mitochondrial genes, Y chromosome genes.
#' These statistics are stored as nested \linkS4class{DataFrame}s in the \code{subsets} field of the output.
#' For example, if the input \code{subsets} contained \code{"Mito"} and \code{"Sex"}, the output would look like:
#' \preformatted{  output 
#'   |-- sum
#'   |-- detected
#'   |-- percent.top
#'   +-- subsets
#'       |-- Mito
#'       |   |-- sum
#'       |   |-- detected
#'       |   +-- percent
#'       +-- Sex 
#'           |-- sum
#'           |-- detected
#'           +-- percent
#' }
#' Here, the \code{percent} field contains the percentage of each cell's count sum assigned to each subset. 
#'
#' If \code{use.altexps=TRUE}, the same statistics are computed for each alternative experiment in \code{x}.
#' This can also be an integer or character vector specifying the alternative Experiments to use.
#' These statistics are also stored as nested \linkS4class{DataFrame}s, this time in the \code{altexps} field of the output.
#' For example, if \code{x} contained the alternative Experiments \code{"Spike"} and \code{"Ab"}, the output would look like:
#' \preformatted{  output 
#'   |-- sum
#'   |-- detected
#'   |-- percent.top
#'   +-- altexps 
#'   |   |-- Spike
#'   |   |   |-- sum
#'   |   |   |-- detected
#'   |   |   +-- percent.total
#'   |   +-- Ab
#'   |       |-- sum
#'   |       |-- detected
#'   |       +-- percent.total
#'   +-- total 
#' }
#' The \code{total} field contains the total sum of counts for each cell across the main and alternative Experiments.
#' The \code{percent} field contains the percentage of the total count in each alternative Experiment for each cell.
#' 
#' If \code{flatten=TRUE}, the nested DataFrames are flattened by concatenating the column names with underscores.
#' This means that, say, the \code{subsets$Mito$sum} nested field becomes the top-level \code{subsets_Mito_sum} field.
#' A flattened structure is more convenient for end-users performing interactive analyses,
#' but less convenient for programmatic access as artificial construction of strings is required.
#' 
#' @examples
#' example_sce <- mockSCE()
#' stats <- perCellQCMetrics(example_sce)
#' stats
#'
#' # With subsets.
#' stats2 <- perCellQCMetrics(example_sce, subsets=list(Mito=1:10), 
#'     flatten=FALSE)
#' stats2$subsets
#'
#' # With alternative Experiments.
#' pretend.spike <- ifelse(seq_len(nrow(example_sce)) < 10, "Spike", "Gene")
#' alt_sce <- splitAltExps(example_sce, pretend.spike)
#' stats3 <- perCellQCMetrics(alt_sce, flatten=FALSE)
#' stats3$altexps
#'
#'
#' @seealso 
#' \code{\link{addPerCellQC}}, to add the QC metrics to the column metadata.
#' @export
#' @name perCellQCMetrics
NULL

#' @importFrom beachmat colBlockApply
#' @importFrom S4Vectors DataFrame make_zero_col_DFrame
#' @importFrom BiocParallel bpmapply SerialParam
#' @importClassesFrom S4Vectors DataFrame
.per_cell_qc_metrics <- function(x, subsets = NULL, percent.top = integer(0),
    threshold = 0, BPPARAM=SerialParam(), flatten=TRUE, 
    percent_top=NULL, detection_limit=NULL)
{
    threshold <- .replace(threshold, detection_limit)
    percent.top <- .replace(percent.top, percent_top)

    if (length(subsets) && is.null(names(subsets))){ 
        stop("'subsets' must be named")
    }
    subsets <- lapply(subsets, FUN = .subset2index, target = x, byrow = TRUE)
    percent.top <- sort(as.integer(percent.top))

    # Computing all QC metrics, with cells split across workers. 
    bp.out <- colBlockApply(x, FUN=.per_cell_qc, featcon=subsets, top=percent.top, limit=threshold, BPPARAM=BPPARAM)

    # Aggregating across cores.
    full.info <- DataFrame(
        sum=unlist(lapply(bp.out, FUN=function(x) x[[1]][[1]])),
        detected=unlist(lapply(bp.out, FUN=function(x) x[[1]][[2]])),
        row.names=colnames(x)
    )

    pct <- do.call(cbind, lapply(bp.out, FUN=function(x) x[[1]][[3]]))
    rownames(pct) <- percent.top
    full.info$percent.top <- t(pct)/full.info$sum * 100

    # Collecting subset information.
    if (!is.null(subsets)) {
        sub.info <- make_zero_col_DFrame(ncol(x))
        for (i in seq_along(subsets)) {
            sub.out <- DataFrame(
                sum=unlist(lapply(bp.out, FUN=function(x) x[[2]][[i]][[1]])),
                detected=unlist(lapply(bp.out, FUN=function(x) x[[2]][[i]][[2]]))
            )
            sub.out$percent <- sub.out$sum/full.info$sum * 100
            sub.info[[names(subsets)[i]]] <- sub.out
        }
        full.info$subsets <- sub.info
    }

    if (flatten) {
        full.info <- .flatten_nested_dims(full.info)
    }
    full.info
}

#' @importFrom Matrix colSums 
#' @importClassesFrom Matrix sparseMatrix
#' @importClassesFrom DelayedArray SparseArraySeed
.per_cell_qc <- function(x, featcon, top, limit) {
    if (is(x, "SparseArraySeed")) {
        x <- as(x, "sparseMatrix")
    }

    detected <- x > limit

    full <- list(
        sum=unname(colSums(x)),
        detected=unname(colSums(detected)),
        prop=cumulative_prop(x, top)
    )

    featcons <- lapply(featcon, function(i) {
        # TODO: switch to MatrixGenerics when that finally becomes available.
        list(
            sum=unname(colSums(x[i,,drop=FALSE])),
            detected=unname(colSums(detected[i,,drop=FALSE]))
        )
    })

    list(full, featcons)
}

##################################################

#' @export
#' @rdname perCellQCMetrics
setGeneric("perCellQCMetrics", function(x, ...) standardGeneric("perCellQCMetrics"))

#' @export
#' @rdname perCellQCMetrics
setMethod("perCellQCMetrics", "ANY", .per_cell_qc_metrics)

#' @export
#' @rdname perCellQCMetrics
#' @importFrom SummarizedExperiment assay
setMethod("perCellQCMetrics", "SummarizedExperiment", function(x, ..., assay.type="counts", exprs_values=NULL) {
    assay.type <- .replace(assay.type, exprs_values)
    .per_cell_qc_metrics(assay(x, assay.type), ...)
})

#' @export
#' @rdname perCellQCMetrics
#' @importFrom SummarizedExperiment assay
#' @importFrom SingleCellExperiment altExp altExpNames
#' @importFrom S4Vectors make_zero_col_DFrame
#' @importClassesFrom S4Vectors DataFrame
setMethod("perCellQCMetrics", "SingleCellExperiment", 
    function(x, subsets=NULL, percent.top=integer(0), ..., flatten=TRUE, assay.type="counts", use.altexps=TRUE, 
        percent_top=NULL, exprs_values=NULL, use_altexps=NULL) 
{
    assay.type <- .replace(assay.type, exprs_values)
    use.altexps <- .replace(use.altexps, use_altexps)
    percent.top <- .replace(percent.top, percent_top)

    # subsets and percent.top need to be explicitly listed,
    # because the altexps call sets them to NULL and integer(0).
    main <- .per_cell_qc_metrics(assay(x, assay.type), subsets=subsets, percent.top=percent.top, flatten=FALSE, ...)
    use.altexps <- .use_names_to_integer_indices(use.altexps, x=x, nameFUN=altExpNames, msg="use.altexps")

    alt <- list()
    total <- main$sum
    for (i in seq_along(use.altexps)) {
        y <- assay(altExp(x, use.altexps[i]), assay.type)
        current <- .per_cell_qc_metrics(y, subsets=NULL, percent.top=integer(0), ...)
        current$percent.top <- current$subsets <- NULL
        total <- total + current$sum
        alt[[i]] <- current
    }
    for (i in seq_along(alt)) {
        alt[[i]]$percent <- alt[[i]]$sum/total * 100
    }

    if (length(alt)) {
        main$altexps <- do.call(DataFrame, lapply(alt, I))
        names(main$altexps) <- altExpNames(x)[use.altexps]
    } else {
        main$altexps <- make_zero_col_DFrame(ncol(x))
    }

    main$total <- total
    if (flatten) {
        main <- .flatten_nested_dims(main)
    }
    main
})

##################################################

#' @importFrom S4Vectors DataFrame
.flatten_nested_dims <- function(x, name="") {
    if (!is.null(dim(x))) {
        if (name!="") {
            name <- paste0(name, "_")
        }
        names <- sprintf("%s%s", name, colnames(x))
        rn <- rownames(x)

        df <- vector("list", ncol(x))
        for (i in seq_along(df)) {
            df[[i]] <- .flatten_nested_dims(x[,i], names[i])
        }
        if (length(df) > 0) {
            df <- do.call(cbind, df)
        } else {
            df <- DataFrame(x[,0])
        }

        rownames(df) <- rn
    } else {
        df <- DataFrame(x)
        colnames(df) <- name
    }
    df
}