File: testLinearModel.R

package info (click to toggle)
r-bioc-scran 1.18.5%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 1,856 kB
  • sloc: cpp: 960; sh: 13; makefile: 2
file content (210 lines) | stat: -rw-r--r-- 8,036 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#' Hypothesis tests with linear models
#'
#' Perform basic hypothesis tests with linear models in an efficient manner.
#'
#' @param x A numeric matrix-like object containing log-expression values for cells (columns) and genes (rows).
#' Alternatively, a \linkS4class{SummarizedExperiment} containing such a matrix.
#' @param design A numeric design matrix with number of rows equal to \code{ncol(x)}.
#' @param coefs An integer vector specifying the coefficients to drop to form the null model.
#' Only used if \code{contrasts} is not specified.
#' @param contrasts A numeric vector or matrix specifying the contrast of interest.
#' This should have length (if vector) or number of rows (if matrix) equal to \code{ncol(x)}.
#' @param ... For the generic, further arguments to pass to specific methods.
#'
#' For the SummarizedExperiment method, further arguments to pass to the ANY method.
#' @inheritParams modelGeneVar
#'
#' @return A \linkS4class{DataFrame} containing test results with one row per row of \code{x}.
#' It contains the estimated values of the contrasted coefficients
#' as well as the p-value and FDR for each gene.
#'
#' @details
#' This function can be considered a more efficient version of \code{\link{lmFit}}
#' that works on a variety of matrix representations (see \code{\link{fitLinearModel}}).
#' It also omits the empirical Bayes shrinkage step,
#' which is acceptable given the large number of residual d.f. in typical single-cell studies.
#'
#' If \code{contrasts} is specified, the null hypothesis is defined by the contrast matrix or vector in the same manner 
#' that is used in the \pkg{limma} and \pkg{edgeR} packages.
#' Briefly, the contrast vector specifies a linear combination of coefficients that sums to zero under the null.
#' For contrast matrices, the joint null consists of the intersection of the nulls defined by each column vector.
#'
#' Otherwise, if only \code{coefs} is specified, 
#' the null model is formed by simply dropping all of the specified coefficients from \code{design}.
#'
#' If \code{block} is specified, a linear model is fitted separately to the cells in each level.
#' The results are combined across levels by averaging coefficients and combining p-values with \code{\link{combinePValues}}.
#' By default, the contribution from each level is weighted by its number of cells;
#' if \code{equiweight=TRUE}, each level is given equal weight instead.
#' 
#' @author Aaron Lun
#'
#' @seealso
#' \code{\link{fitLinearModel}}, which performs the hard work of fitting the linear models.
#'
#' @examples
#' y <- matrix(rnorm(10000), ncol=100)
#'
#' # Example with categorical factors:
#' A <- gl(2, 50)
#' design <- model.matrix(~A)
#' testLinearModel(y, design, contrasts=c(0, 1))
#'
#' # Example with continuous variables:
#' u <- runif(100)
#' design <- model.matrix(~u)
#' testLinearModel(y, design, contrasts=c(0, 1))
#'
#' # Example with multiple variables:
#' B <- gl(4, 25)
#' design <- model.matrix(~B)
#' testLinearModel(y, design, contrasts=cbind(c(0,1,0,0), c(0,0,1,-1)))
#'
#' @name testLinearModel
NULL

###########################################################

#' @importFrom BiocParallel SerialParam
#' @importFrom beachmat rowBlockApply
#' @importFrom stats p.adjust
.test_linear_model <- function(x, design, coefs=ncol(design), contrasts=NULL, 
    block=NULL, equiweight=FALSE, method="z", subset.row=NULL, BPPARAM=SerialParam()) 
{
    if (!is.null(subset.row)) {
        x <- x[subset.row,,drop=FALSE]
    }

    if (is.null(block)) {
        .test_linear_model_simple(x, design, coefs=coefs, contrasts=contrasts, BPPARAM=BPPARAM)
    } else {
        collated <- split(seq_len(ncol(x)), block)

        # We want the parallelization to be as fine-grained as possible so we
        # do it here; we don't punt it to the fitLinearModel() function, as 
        # then we would have to restart the parallel workers for each block.
        output <- rowBlockApply(x, FUN=.test_linear_model_multiblock, 
            collated=collated, equiweight=equiweight, method=method, 
            design=design, coefs=coefs, contrasts=contrasts, 
            BPPARAM=BPPARAM)

        if (any(vapply(output, is.null, TRUE))) {
            stop("no level of 'block' has a full column rank 'design'")
        }

        output <- do.call(rbind, output)

        # Resetting all the FDRs.
        output$FDR <- p.adjust(output$p.value, method="BH")
        for (i in seq_along(output$per.block)) {
            output$per.block[[i]]$FDR <- p.adjust(output$per.block[[i]]$p.value, method="BH")
        }

        output
    }
}

#' @importFrom S4Vectors metadata
.test_linear_model_multiblock <- function(collated, x, design, equiweight, method, ...) {
    ncells <- lengths(collated)
    for (i in seq_along(collated)) {
        sub <- collated[[i]] 
        res <- .test_linear_model_simple(x[,sub,drop=FALSE], design=design[sub,,drop=FALSE], ...)

        collated[[i]] <- res
        if (is.na(metadata(res)$residual.df) || metadata(res)$residual.df==0L) {
            ncells[i] <- -Inf
        }
    }

    if (all(ncells < 0L)) {
        return(NULL)
    }

    targets <- setdiff(colnames(collated[[1]]), c("p.value", "FDR"))
    output <- combineBlocks(collated, 
        method=method, 
        geometric=FALSE,
        equiweight=equiweight, 
        weights=ncells, 
        ave.fields=targets,
        pval.field="p.value", 
        valid=ncells > 0L)

    rownames(output) <- rownames(collated[[1]])
    output
}

#' @importFrom scuttle fitLinearModel
#' @importFrom limma lmFit classifyTestsF contrasts.fit
#' @importFrom S4Vectors DataFrame metadata<- metadata
#' @importFrom stats p.adjust pt pf
.test_linear_model_simple <- function(x, design, coefs=ncol(design), contrasts=NULL, ...) {
    full <- fitLinearModel(x, design, get.coefs=TRUE, rank.error=FALSE, ...)

    if (is.null(contrasts)) {
        contrasts <- matrix(0, ncol(design), length(coefs))
        contrasts[cbind(coefs, seq_along(coefs))] <- 1
        if (length(coefs) > 1) {
            colnames(contrasts) <- colnames(design)[coefs]
        }
    } else {  
        if (is.null(dim(contrasts))) {
            contrasts <- matrix(contrasts)
        }
    }
    if (ncol(contrasts)==1L && is.null(colnames(contrasts))) {
        colnames(contrasts) <- "logFC"
    }

    if (is.na(full$residual.df)) {
        pvalue <- rep(NA_real_, nrow(full$coefficients))
        coefs <- matrix(NA_real_, length(pvalue), ncol(contrasts))
        colnames(coefs) <- colnames(contrasts)

    } else {
        # Hacking limma to compute our desired statistics.
        lfit <- lmFit(rbind(seq_len(nrow(design))), design)
        lfit$coefficients <- full$coefficients
        lfit$sigma2 <- full$variance
        lfit <- contrasts.fit(lfit, contrasts)

        coefs <- lfit$coefficients
        tstat <- coefs / outer(sqrt(lfit$sigma2), lfit$stdev.unscaled[1,])

        if (ncol(tstat)==1L) {
            tstat <- drop(tstat)
            pvalue <- pt(abs(tstat), df=full$residual.df, lower.tail=FALSE) * 2
        } else {
            lfit$tstat <- tstat
            fstat <- classifyTestsF(lfit, fstat.only=TRUE)
            pvalue <- pf(fstat, ncol(tstat), full$residual.df, lower.tail = FALSE)
            attributes(pvalue) <- NULL
        }
    }

    output <- DataFrame(row.names=rownames(full$coefficients), # account for subsetting.
        coefs, 
        p.value=pvalue,
        FDR=p.adjust(pvalue, method="BH"))

    metadata(output)$residual.df <- full$residual.df
    output
}

###########################################################

#' @export
#' @rdname testLinearModel
setGeneric("testLinearModel", function(x, ...) standardGeneric("testLinearModel"))

#' @export
#' @rdname testLinearModel
setMethod("testLinearModel", "ANY", .test_linear_model)

#' @export
#' @rdname testLinearModel
#' @importFrom SummarizedExperiment assay
setMethod("testLinearModel", "SummarizedExperiment", function(x, ..., assay.type="logcounts") {
    .test_linear_model(assay(x, assay.type), ...)
})