File: testPseudotime.R

package info (click to toggle)
r-bioc-scran 1.18.5%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 1,856 kB
  • sloc: cpp: 960; sh: 13; makefile: 2
file content (224 lines) | stat: -rw-r--r-- 8,758 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#' Test for differences along pseudotime
#'
#' Implements a simple method of testing for significant differences with respect to pseudotime,
#' based on fitting linear models with a spline basis matrix.
#' This function is now deprecated as it has been moved to the \pkg{TSCAN} package itself.
#'
#' @param x A numeric matrix-like object containing log-expression values for cells (columns) and genes (rows).
#' Alternatively, a \linkS4class{SummarizedExperiment} containing such a matrix.
#' @param pseudotime A numeric matrix with one row per cell in \code{x} and one column per path (i.e., lineage).
#' A vector is treated the same as a 1-column matrix.
#' @param df Integer scalar specifying the degrees of freedom for the splines.
#' @param get.lfc Logical scalar indicating whether to return an overall log-fold change along each path.
#' @param get.spline.coef Logical scalar indicating whether to return the estimates of the spline coefficients.
#' @param trend.only Logical scalar indicating whether only differences in the trend should be considered
#' when testing for differences between paths.
#' @param ... For the generic, further arguments to pass to specific method.
#'
#' For the ANY method, further arguments to pass to \code{\link{fitLinearModel}}.
#' 
#' For the SummarizedExperiment method, further arguments to pass to the ANY method.
#' @param assay.type String or integer scalar specifying the assay containing the log-expression matrix.
#'
#' @return
#' A \linkS4class{DataFrame} is returned containing the statistics for each gene (row),
#' including the p-value and its BH-adjusted equivalent.
#' If \code{get.lfc=TRUE}, an overall log-fold change is returned for each path.
#'
#' If \code{get.spline.coef=TRUE}, the estimated spline coefficients are also returned (single path)
#' or the differences in the spline fits to the first path are returned (multiple paths).
#' 
#' @details
#' For a single path in \code{pseudotime},
#' this function fits a natural spline to the expression of each gene with respect to pseudotime.
#' It then does an ANOVA to test whether any of the spline coefficients are non-zero.
#' In this manner, genes exhibiting a significant (and potentially non-linear) trend
#' with respect to the pseudotime can be detected as those with low p-values.
#' 
#' For multiple paths in \code{pseudotime}, 
#' the null hypothesis is that all paths have the same trend (if \code{trend.only=TRUE})
#' or the same trend and intercept (if \code{FALSE}).
#' This is done by effectively fitting a separate trend to each path 
#' and performing an ANOVA to detect differences in the trend alone or in the trend and intercept.
#' In this manner, genes exhibiting differences in behavior between paths can be detected.
#'
#' The expected format of \code{pseudotime} is the same as that returned by \code{\link{orderClusterMST}}.
#' Each cell is assigned to a path if it has a non-\code{NA} value in the corresponding column.
#' For single path testing, cells with \code{NA} values in \code{pseudotime} are ignored;
#' for multiple path testing, cells assigned to multiple paths are ignored.
#'
#' By default, estimates of the spline coefficients are not returned as they are difficult to interpret.
#' Rather, a log-fold change of expression along each path is estimated
#' to provide some indication of the overall magnitude and direction of any change.
#' 
#' @author Aaron Lun
#'
#' @examples
#' y <- matrix(rnorm(10000), ncol=100)
#'
#' # Testing for a difference in a single path:
#' u <- runif(100)
#' testPseudotime(y, u)
#'
#' # Testing for differences in multiple paths
#' # by mocking up a pseudotime matrix.
#' p <- cbind(path1=u, path2=u)
#' path1 <- rbinom(length(u), 1, 0.5)==0
#' p[!path1,1] <- NA
#' p[path1,2] <- NA
#' 
#' testPseudotime(y, p)
#'
#' @seealso
#' \code{\link{orderClusterMST}}, to generate the pseudotime matrix.
#'
#' \code{\link{testLinearModel}}, which performs the tests under the hood.
#'
#' @name testPseudotime
NULL

.test_pseudotime <- function(x, pseudotime, df=5, get.lfc=TRUE, get.spline.coef=FALSE, trend.only=TRUE) {
    .Deprecated(old="scran::testPseudotime", new="TSCAN::testPseudotime")
    if (is.null(dim(pseudotime)) || ncol(pseudotime)==1) {
        .test_solo_pseudotime(x, pseudotime, df=df, get.lfc=get.lfc, 
            get.spline.coef=get.spline.coef)
    } else {
        .test_multi_pseudotime(x, pseudotime, df=df, get.lfc=get.lfc, 
            get.spline.coef=get.spline.coef, trend.only=trend.only)
    }
}

#' @importFrom stats model.matrix
#' @importFrom scuttle fitLinearModel
.test_solo_pseudotime <- function(x, pseudotime, df, get.lfc, get.spline.coef) {
    pseudotime <- drop(pseudotime)

    keep <- !is.na(pseudotime)
    pseudotime <- pseudotime[keep]
    x <- x[,keep,drop=FALSE] 
    design <- .forge_spline_basis_design(pseudotime, df=df)
    output <- testLinearModel(x, design=design, coefs=2:ncol(design))
 
    if (get.lfc) {
        prior <- colnames(output)
        design.lfc <- model.matrix(~pseudotime)
        output$logFC <- fitLinearModel(x, design=design.lfc)$coefficients[,2]
        output <- output[,c("logFC", prior)] 
    }

    if (!get.spline.coef) {
        output <- output[,setdiff(colnames(output), colnames(design))]
    }

    output
}

#' @importFrom stats model.matrix
.forge_spline_basis_design <- function(p, df) { 
    # Uniquify'ing to avoid non-full rank problems when
    # many of the quantiles are stacked at the same position. 
    up <- unique(p)
    if (length(up) <= df) {
        stop("'not enough unique pseudotime values for the specified 'df'")
    }

    basis <- splines::ns(up, df=df)
    colnames(basis) <- sprintf("spline%i", seq_len(df))
    basis <- basis[match(p, up),,drop=FALSE]

    cbind(Intercept=rep(1, length(p)), basis)
}

#' @importFrom stats model.matrix
#' @importFrom scuttle fitLinearModel
.test_multi_pseudotime <- function(x, pseudotime, df, get.lfc, get.spline.coef, trend.only) {
    if (anyDuplicated(colnames(pseudotime))) {
        warning("'pseudotime' has duplicated column names")
        colnames(pseudotime) <- NULL
    }
    if (is.null(colnames(pseudotime))) {
        colnames(pseudotime) <- sprintf("path%i", seq_len(ncol(pseudotime)))
    }

    nonna <- !is.na(pseudotime)
    solo <- rowSums(nonna) == 1
    if (!all(solo)) {
        x <- x[,solo,drop=FALSE]
        pseudotime <- pseudotime[solo,,drop=FALSE]
    }

    common <- rowMeans(pseudotime, na.rm=TRUE)
    design.raw <- .forge_spline_basis_design(common, df=df)

    all.x <- all.design <- all.lfc <- list()
    for (i in colnames(pseudotime)) {
        current.pseudo <- pseudotime[,i]
        keep <- !is.na(current.pseudo) 
        current.pseudo <- current.pseudo[keep]

        all.x[[i]] <- x[,keep,drop=FALSE] 
        cur.design <- design.raw[keep,,drop=FALSE] 
        colnames(cur.design) <- paste0(colnames(cur.design), ".", i)
        all.design[[i]] <- cur.design

        if (get.lfc) {
            design.lfc <- model.matrix(~current.pseudo)
            all.lfc[[paste0("logFC.", i)]] <- fitLinearModel(all.x[[i]], design=design.lfc)$coefficients[,2]
        }
    }

    # Creating the design matrix.
    x2 <- do.call(cbind, all.x)
    for (i in seq_along(all.design)) {
        copy <- rep(all.design[i], length(all.design))
        for (j in setdiff(seq_along(copy), i)) {
            copy[[j]][] <- 0
        }
        all.design[[i]] <- do.call(cbind, copy)
    }
    design <- do.call(rbind, all.design)

    # Building up a mega contrast of doom.
    contrastor <- list()
    for (i in seq_len(ncol(pseudotime) - 1)) {
        N <- df + 1L

        con <- matrix(0, ncol(design), N)
        diag(con) <- -1
        con[cbind(i * N + seq_len(N), seq_len(N))] <- 1
        if (trend.only) {
            con <- con[,-1,drop=FALSE]
        }

        colnames(con) <- sprintf("spline%i.%s", seq_len(ncol(con)), colnames(pseudotime)[i+1])
        contrastor[[i]] <- con
    }

    contrast <- do.call(cbind, contrastor)
    output <- testLinearModel(x2, design=design, contrasts=contrast)

    # Organizing the output.
    if (!get.spline.coef) {
        output <- output[,setdiff(colnames(output), colnames(contrast))]
    }
    if (get.lfc) {
        output <- cbind(DataFrame(all.lfc, row.names=rownames(output)), output)
    }

    output
}

#' @export
#' @rdname testPseudotime
setGeneric("testPseudotime", function(x, ...) standardGeneric("testPseudotime"))

#' @export
#' @rdname testPseudotime
setMethod("testPseudotime", "ANY", .test_pseudotime)

#' @export
#' @rdname testPseudotime
#' @importFrom SummarizedExperiment assay
setMethod("testPseudotime", "SummarizedExperiment", function(x, ..., assay.type="logcounts") {
    .test_pseudotime(assay(x, assay.type), ...)
})