File: doubletRecovery.R

package info (click to toggle)
r-bioc-scran 1.18.5%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 1,856 kB
  • sloc: cpp: 960; sh: 13; makefile: 2
file content (154 lines) | stat: -rw-r--r-- 7,736 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#' Recover intra-sample doublets
#'
#' Recover intra-sample doublets that are neighbors to known inter-sample doublets in a multiplexed experiment.
#' This function is now deprecated, use \code{recoverDoublets} from \pkg{scDblFinder} instead.
#'
#' @param x A log-expression matrix for all cells (including doublets) in columns and genes in rows.
#' Alternatively, a \linkS4class{SummarizedExperiment} or \linkS4class{SingleCellExperiment} containing such a matrix.
#'
#' If \code{transposed=TRUE}, a matrix of low-dimensional coordinates where each row corresponds to a cell.
#' This can also be in the \code{\link{reducedDims}} of a \linkS4class{SingleCellExperiment} if \code{use.dimred} is specified.
#' @param doublets A logical, integer or character vector specifying which cells in \code{x} are known (inter-sample) doublets.
#' @param samples A numeric vector containing the relative proportions of cells from each sample,
#' used to determine how many cells are to be considered as intra-sample doublets.
#' @param k Integer scalar specifying the number of nearest neighbors to use for computing the local doublet proportions.
#' @param transposed Logical scalar indicating whether \code{x} is transposed, i.e., cells in the rows.
#' @param subset.row See \code{?"\link{scran-gene-selection}"}, specifying the genes to use for the neighbor search. 
#' Only used when \code{transposed=FALSE}.
#' @param BNPARAM A \linkS4class{BiocNeighborParam} object specifying the algorithm to use for the nearest neighbor search.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization to use for the nearest neighbor search.
#' @param ... For the generic, additional arguments to pass to specific methods.
#' 
#' For the SummarizedExperiment method, additional arguments to pass to the ANY method.
#'
#' For the SingleCellExperiment method, additional arguments to pass to the SummarizedExperiment method.
#' @param assay.type A string specifying which assay values contain the log-expression matrix. 
#' @param use.dimred A string specifying whether existing values in \code{reducedDims(x)} should be used.
#'
#' @return
#' A \linkS4class{DataFrame} containing one row per cell and the following fields:
#' \itemize{
#' \item \code{proportion}, a numeric field containing the proportion of neighbors that are doublets.
#' \item \code{known}, a logical field indicating whether this cell is a known inter-sample doublet.
#' \item \code{predicted}, a logical field indicating whether this cell is a predicted intra-sample doublet.
#' }
#' The \code{\link{metadata}} contains \code{intra}, a numeric scalar containing the expected number of intra-sample doublets. 
#'
#' @details
#' In multiplexed single-cell experiments, we can detect doublets as libraries with labels for multiple samples.
#' However, this approach fails to identify doublets consisting of two cells with the same label.
#' Such cells may be problematic if they are still sufficiently abundant to drive formation of spurious clusters.
#'
#' This function identifies intra-sample doublets based on the similarity in expression profiles to known inter-sample doublets.
#' For each cell, we compute the proportion of the \code{k} neighbors that are known doublets.
#' Of the \dQuote{unmarked} cells that are not known doublets, 
#' those with top \eqn{X} largest proportions are considered to be intra-sample doublets.
#'
#' To compute \eqn{X}, we assume that the formation of doublets is random with respect to their originating samples.
#' This allows us to use \code{samples} to estimate the expected percentage of doublets that should occur within samples.
#' We then convert into an absolute number \eqn{X} based on the number of known doublets in \code{doublets}.
#'
#' A larger value of \code{k} provides more stable estimates of the doublet proportion in each cell.
#' However, this comes at the cost of assuming that each cell actually has \code{k} neighboring cells of the same state.
#' For example, if a doublet cluster has fewer than \code{k} members,
#' its doublet proportions will be \dQuote{diluted} by inclusion of unmarked cells in the next-closest cluster.
#' 
#' In principle, it is also possible to identify inter-sample doublets by applying a hard threshold on the doublet proportion.
#' This threshold can be set close to the expected percentage from \code{samples} (i.e., the same one used to derive \eqn{X}).
#' Unfortunately, in practice, the observed proportions are generally lower than expected,
#' possibly due to contamination of doublet subpopulations by unmarked cells in noisy expression data.
#' This motivates the use of a top \eqn{X} approach instead.
#'
#' @author Aaron Lun
#' 
#' @seealso
#' \code{\link{doubletCells}} and \code{\link{doubletCluster}},
#' for alternative methods of doublet detection when no prior doublet information is available.
#'
#' \code{hashedDrops} from the \pkg{DropletUtils} package,
#' to identify doublets from cell hashing experiments.
#'
#' @examples
#' # Mocking up an example.
#' set.seed(100)
#' ngenes <- 1000
#' mu1 <- 2^rnorm(ngenes, sd=2)
#' mu2 <- 2^rnorm(ngenes, sd=2)
#'
#' counts.1 <- matrix(rpois(ngenes*100, mu1), nrow=ngenes) # Pure type 1
#' counts.2 <- matrix(rpois(ngenes*100, mu2), nrow=ngenes) # Pure type 2
#' counts.m <- matrix(rpois(ngenes*20, mu1+mu2), nrow=ngenes) # Doublets (1 & 2)
#' all.counts <- cbind(counts.1, counts.2, counts.m)
#' lcounts <- scuttle::normalizeCounts(all.counts)
#' 
#' # Pretending that half of the doublets are known. Also pretending that 
#' # the experiment involved two samples of equal size.
#' known <- 200 + seq_len(10) 
#' out <- doubletRecovery(lcounts, doublets=known, k=10, samples=c(1, 1))
#' out
#'
#' @name doubletRecovery
NULL

#' @importFrom Matrix t
#' @importFrom BiocNeighbors findKNN KmknnParam
#' @importFrom utils head
#' @importFrom S4Vectors DataFrame
#' @importFrom scuttle .subset2index
.doublet_recovery <- function(x, doublets, samples,
    k=50, transposed=FALSE, subset.row=NULL, BNPARAM=KmknnParam(), BPPARAM=SerialParam()) 
{
    if (!transposed) {
        if (!is.null(subset.row)) {
            x <- x[subset.row,,drop=FALSE]
        }
        x <- t(x)
    }
    .Deprecated(old="doubletRecovery",new="scDblFinder::recoverDoublets")

    is.doublet <- logical(nrow(x))
    is.doublet[.subset2index(doublets, x, byrow=TRUE)] <- TRUE

    fout <- findKNN(x, k=k, BNPARAM=BNPARAM, BPPARAM=BPPARAM)
    neighbors <- fout$index
    neighbors[] <- is.doublet[neighbors]
    P <- rowMeans(neighbors)

    expected.intra <- sum(samples^2)/sum(samples)^2
    intra.doublets <- sum(is.doublet) * expected.intra/(1 - expected.intra)

    predicted <- logical(nrow(x))
    o <- order(P[!is.doublet], decreasing=TRUE)
    predicted[!is.doublet][head(o, intra.doublets)] <- TRUE

    output <- DataFrame(proportion=P, known=is.doublet, predicted=predicted)
    metadata(output)$intra <- intra.doublets
    output
}

#' @export
#' @rdname doubletRecovery
setGeneric("doubletRecovery", function(x, ...) standardGeneric("doubletRecovery"))

#' @export
#' @rdname doubletRecovery
setMethod("doubletRecovery", "ANY", .doublet_recovery)

#' @export
#' @importFrom SummarizedExperiment assay
#' @rdname doubletRecovery
setMethod("doubletRecovery", "SummarizedExperiment", function(x, ..., assay.type="logcounts") {
    .doublet_recovery(assay(x, assay.type), ...)
})

#' @export
#' @importFrom SingleCellExperiment reducedDim
#' @rdname doubletRecovery
setMethod("doubletRecovery", "SingleCellExperiment", function(x, ..., use.dimred=NULL) {
    if (!is.null(use.dimred)) {
        .doublet_recovery(reducedDim(x, use.dimred), transposed=TRUE, ...)
    } else {
        callNextMethod(x=x, ...)
    }
})