File: correlateNull.R

package info (click to toggle)
r-bioc-scran 1.26.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,692 kB
  • sloc: cpp: 733; makefile: 2
file content (154 lines) | stat: -rw-r--r-- 6,225 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#' Build null correlations
#' 
#' Build a distribution of correlations under the null hypothesis of independent expression between pairs of genes.
#' This is now deprecated as \code{\link{correlatePairs}} uses an approximation instead.
#'
#' @param ncells An integer scalar indicating the number of cells in the data set.
#' @param iters An integer scalar specifying the number of values in the null distribution.
#' @param block A factor specifying the blocking level for each cell.
#' @param design A numeric design matrix containing uninteresting factors to be ignored.
#' @param equiweight A logical scalar indicating whether statistics from each block should be given equal weight.
#' Otherwise, each block is weighted according to its number of cells.
#' Only used if \code{block} is specified.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object that specifies the manner of parallel processing to use.
#'
#' @details
#' The \code{correlateNull} function constructs an empirical null distribution for Spearman's rank correlation when it is computed with \code{ncells} cells.
#' This is done by shuffling the ranks, calculating the correlation and repeating until \code{iters} values are obtained.
#' No consideration is given to tied ranks, which has implications for the accuracy of p-values in \code{\link{correlatePairs}}.
#'
#' If \code{block} is specified, a null correlation is created within each level of \code{block} using the shuffled ranks.
#' The final correlation is then defined as the average of the per-level correlations, 
#' weighted by the number of cells in that level if \code{equiweight=FALSE}.
#' Levels with fewer than 3 cells are ignored, and if no level has 3 or more cells, all returned correlations will be \code{NA}.
#'
#' If \code{design} is specified, the same process is performed on ranks derived from simulated residuals computed by fitting the linear model to a vector of normally distributed values.
#' If there are not at least 3 residual d.f., all returned correlations will be \code{NA}.
#' The \code{design} argument cannot be used at the same time as \code{block}.
#' 
#' % Yeah, we could use a t-distribution for this, but the empirical distribution is probably more robust if you have few cells (or effects, after batch correction).
#'
#' @return
#' A numeric vector of length \code{iters} is returned containing the sorted correlations under the null hypothesis of no correlations.
#'
#' @author
#' Aaron Lun
#' 
#' @seealso
#' \code{\link{correlatePairs}}, where the null distribution is used to compute p-values.
#'
#' @examples
#' set.seed(0)
#' ncells <- 100
#'
#' # Simplest case:
#' null.dist <- correlateNull(ncells, iters=10000)
#' hist(null.dist)
#'
#' # With a blocking factor:
#' block <- sample(LETTERS[1:3], ncells, replace=TRUE)
#' null.dist <- correlateNull(block=block, iters=10000)
#' hist(null.dist)
#'
#' # With a design matrix.
#' cov <- runif(ncells)
#' X <- model.matrix(~cov)
#' null.dist <- correlateNull(design=X, iters=10000)
#' hist(null.dist)
#' 
#' @export
#' @importFrom BiocParallel SerialParam bpmapply bpstart bpstop bpisup
#' @importFrom scuttle .bpNotSharedOrUp .ranksafeQR
correlateNull <- function(ncells, iters=1e6, block=NULL, design=NULL, equiweight=TRUE, BPPARAM=SerialParam()) {
    .Deprecated()
    if (!bpisup(BPPARAM)) {
        bpstart(BPPARAM)
        on.exit(bpstop(BPPARAM))
    }

    iters <- as.integer(iters)
    iters.per.core <- .niters_by_nworkers(iters, BPPARAM)

    blockFUN <- function(group) {
        pcg.state <- .setup_pcg_state(iters.per.core)
        out.g <- bpmapply(Niters=iters.per.core, Seeds=pcg.state$seeds, Streams=pcg.state$streams,
            MoreArgs=list(Ncells=length(group)), FUN=get_null_rho,
            SIMPLIFY=FALSE, USE.NAMES=FALSE, BPPARAM=BPPARAM)
        unlist(out.g) 
    }

    designFUN <- function(design) {
        QR <- .ranksafeQR(design)
        pcg.state <- .setup_pcg_state(iters.per.core)
        out.g <- bpmapply(Niters=iters.per.core, Seeds=pcg.state$seeds, Streams=pcg.state$streams, 
            MoreArgs=list(qr=QR$qr, qraux=QR$qraux), FUN=get_null_rho_design,
            SIMPLIFY=FALSE, USE.NAMES=FALSE, BPPARAM=BPPARAM)
        unlist(out.g) 
    }

    output <- .correlator_base(ncells, block, design, equiweight, blockFUN, designFUN)
    if (is.null(output)) {
        output <- rep(NA_real_, iters)
    }
    output
}

.correlator_base <- function(ncells, block, design, equiweight, blockFUN, designFUN) {
    if (is.null(design)) { 
        if (is.null(block)) {
            block <- rep(1L, ncells)
        } else if (!missing(ncells) && ncells!=length(block)) { 
            stop("cannot specify both 'ncells' and 'block'")
        }

        groupings <- split(seq_along(block), block)
        if (equiweight) {
            weights <- rep(1, length(groupings))
        } else {
            weights <- lengths(groupings)
        }

        # Estimating the correlation as a weighted mean of the correlations in each group.
        # This avoids the need for the normality assumption in the residual effect simulation.
        out <- total <- 0
        for (g in seq_along(groupings)) {
            ngr <- length(groupings[[g]])
            if (ngr <= 2L) {
                next            
            }

            out.g <- blockFUN(groupings[[g]])
            w <- weights[g]
            out <- out + unlist(out.g) * w
            total <- total + w
        }
        out/total

    } else {
        if (!is.null(block)) {
            stop("cannot specify both 'block' and 'design'")
        }
        if (!missing(ncells) && ncells!=nrow(design)) {
            stop("cannot specify both 'ncells' and 'design'")
        }
        if (nrow(design) - ncol(design) > 2L) {
            designFUN(design)
        } else {
            NULL
        }
    }
}

#' @importFrom BiocParallel bpnworkers
.niters_by_nworkers <- function(iters, BPPARAM) {
    nworkers <- bpnworkers(BPPARAM)
    if (iters <= nworkers) {
        jobs <- integer(nworkers)
        jobs[seq_len(iters)] <- 1L
    } else {
        per.worker <- as.integer(floor(iters/nworkers))
        jobs <- rep(per.worker, nworkers)
        jobs[1] <- iters - sum(jobs[-1])
    }
    jobs
}