1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
|
#' Label-specific pseudo-bulk DE
#'
#' Detect label-specific DE genes in a pseudo-bulk analysis,
#' by testing whether the log-fold change is more extreme than the average log-fold change of other labels.
#'
#' @inheritParams pseudoBulkDGE
#' @param ... For the generic, further arguments to pass to individual methods.
#'
#' For the ANY method, further arguments to pass to \code{\link{pseudoBulkDGE}}.
#'
#' For the SummarizedExperiment method, further arguments to pass to the ANY method.
#' @param average String specifying the method to compute the average log-fold change of all other labels.
#' @param missing.as.zero Logical scalar indicating whether missing log-fold changes should be set to zero.
#' @param reference A List containing the (unsorted) output of \code{\link{pseudoBulkDGE}}.
#' This can be supplied to avoid redundant calculations but is automatically computed if \code{NULL}.
#'
#' @details
#' This function implements a quick and dirty method for detecting label-specific DE genes.
#' For a given label and gene, the null hypothesis is that the log-fold change lies between zero
#' and the average log-fold change for that gene across all other labels.
#' Genes that reject this null either have log-fold changes in the opposite sign
#' or are significantly more extreme than the average.
#'
#' To implement this, we test each gene against the two extremes and taking the larger of the two p-values.
#' The p-value is set to 1 if the log-fold change lies between the extremes.
#' This is somewhat similar to how \code{\link{treat}} might behave if the null interval was not centered at zero;
#' however, our approach is more conservative than the \code{\link{treat}} as the p-value calculations are not quite correct.
#'
#' It is worth stressing that there are no guarantees that the DE genes detected in this manner are truly label-specific.
#' For any label and DEG, there may be one or more labels with stronger log-fold changes,
#' but the average may be pulled towards zero by other labels with weaker or opposing effects.
#' The use of the average is analogous to recommendations in the \pkg{edgeR} user's guide for testing against multiple groups.
#' However, a more stringent selection can be achieved by applying gates on \code{\link{decideTestsPerLabel}}.
#'
#' Note that, if \code{lfc} is specified in the arguments to \code{\link{pseudoBulkDGE}},
#' the null interval is expanded in both directions by the specified value.
#'
#' @section Computing the average:
#' The average log-fold change for each gene is computed by taking the median or mean (depending on \code{average})
#' of the corresponding log-fold changes in each of the DE analyses for the other labels.
#' We use the median by default as this means that at least half of all other labels should have weaker or opposing effects.
#'
#' By default, low-abundance genes that were filtered out in a comparison do not contribute to the average.
#' Any log-fold changes that could be computed from them are considered to be too unstable.
#' If the gene is filtered out in all other labels, the average is set to zero for testing but is reported as \code{NA}.
#'
#' If \code{missing.as.zero=TRUE}, the log-fold changes for all filtered genes are set to zero.
#' This is useful when a gene is only expressed in the subset of labels and is consistently DEG in each comparison of the subset.
#' Testing against the average computed from only those labels in the subset would fail to detect this DEG as subset-specific.
#'
#' @return
#' A \linkS4class{List} of \linkS4class{DataFrame}s where each DataFrame contains DE statistics for one label.
#' This is equivalent to the output of \code{\link{pseudoBulkDGE}};
#' if \code{reference} is supplied, most of the statistics will be identical to those reported there.
#'
#' The main differences are that the p-values and FDR values are changed.
#' Each DataFrame also has an \code{OtherAverage} field containing the average log-fold change across all other labels.
#'
#' @author Aaron Lun
#'
#' @seealso
#' \code{\link{pseudoBulkDGE}}, for the underlying function that does all the heavy lifting.
#'
#' @examples
#' set.seed(10000)
#' library(scuttle)
#' sce <- mockSCE(ncells=1000)
#' sce$samples <- gl(8, 125) # Pretending we have 8 samples.
#'
#' # Making up some clusters.
#' sce <- logNormCounts(sce)
#' clusters <- kmeans(t(logcounts(sce)), centers=3)$cluster
#'
#' # Creating a set of pseudo-bulk profiles:
#' info <- DataFrame(sample=sce$samples, cluster=clusters)
#' pseudo <- sumCountsAcrossCells(sce, info)
#'
#' # Making up an experimental design for our 8 samples
#' # and adding a common DEG across all labels.
#' pseudo$DRUG <- gl(2,4)[pseudo$sample]
#' assay(pseudo)[1,pseudo$DRUG==1] <- assay(pseudo)[1,pseudo$DRUG==1] * 10
#'
#' # Label-specific analysis (note behavior of the first gene).
#' out <- pseudoBulkSpecific(pseudo,
#' label=pseudo$cluster,
#' condition=pseudo$DRUG,
#' design=~DRUG,
#' coef="DRUG2"
#' )
#'
#' out[[1]]
#'
#' @export
#' @name pseudoBulkSpecific
NULL
#' @importFrom DelayedMatrixStats rowMedians
#' @importFrom Matrix rowMeans
.pseudo_bulk_specific <- function(x, label, condition=NULL, ..., method=c("edgeR", "voom"),
sorted=FALSE, average=c("median", "mean"), missing.as.zero=FALSE, reference=NULL)
{
method <- match.arg(method)
if (is.null(reference)) {
reference <- pseudoBulkDGE(x, label=label, condition=condition, ..., method=method, sorted=FALSE)
}
all.lfc <- lapply(reference, "[[", i="logFC")
if (any(vapply(all.lfc, is.null, FALSE))) {
stop("ANOVA-like contrasts cannot be specified with non-zero 'null.lfc'")
}
null.lfc.list <- lost <- list()
average <- match.arg(average)
for (l in names(reference)) {
other.lfc <- all.lfc[names(all.lfc)!=l]
other.lfc <- do.call(cbind, other.lfc)
if (missing.as.zero) {
other.lfc[is.na(other.lfc)] <- 0
}
if (average=="median") {
ave.other <- rowMedians(other.lfc, na.rm=TRUE)
} else {
ave.other <- rowMeans(other.lfc, na.rm=TRUE)
}
zeroed <- is.na(ave.other)
ave.other[zeroed] <- 0
lost[[l]] <- zeroed
null.lfc.list[[l]] <- ave.other
}
alt <- .pseudo_bulk_dge(x=x, label=label, condition=condition, method=method,
null.lfc.list=null.lfc.list, ..., sorted=FALSE, include.intermediates=FALSE)
if (method=="edgeR"){
pname <- "PValue"
fname <- "FDR"
} else {
pname <- "P.Value"
fname <- "adj.P.Val"
}
for (l in names(reference)) {
R <- reference[[l]]
A <- alt[[l]]
nulls <- null.lfc.list[[l]]
in.between <- sign(R$logFC) == sign(nulls) & abs(R$logFC) <= abs(nulls)
p <- ifelse(in.between, 1, pmax(A[[pname]], R[[pname]]))
R[[pname]] <- p
R[[fname]] <- p.adjust(p, method="BH")
R$OtherAverage <- ifelse(lost[[l]], NA_real_, nulls)
if (sorted) {
R <- R[order(p),,drop=FALSE]
}
reference[[l]] <- R
}
reference
}
#' @export
#' @rdname pseudoBulkSpecific
setGeneric("pseudoBulkSpecific", function(x, ...) standardGeneric("pseudoBulkSpecific"))
#' @export
#' @rdname pseudoBulkSpecific
setMethod("pseudoBulkSpecific", "ANY", .pseudo_bulk_specific)
#' @export
#' @rdname pseudoBulkSpecific
#' @importFrom SummarizedExperiment assay colData
setMethod("pseudoBulkSpecific", "SummarizedExperiment", function(x, ..., assay.type=1) {
.pseudo_bulk_specific(assay(x, assay.type), col.data=colData(x), ...)
})
|