1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
|
#' Find marker genes
#'
#' Find candidate marker genes for groups of cells (e.g., clusters)
#' by testing for differential expression between pairs of groups.
#'
#' @param x A numeric matrix-like object of expression values,
#' where each column corresponds to a cell and each row corresponds to an endogenous gene.
#' This is expected to be normalized log-expression values for most tests - see Details.
#'
#' Alternatively, a \linkS4class{SummarizedExperiment} or \linkS4class{SingleCellExperiment} object containing such a matrix.
#' @param groups A vector of length equal to \code{ncol(x)},
#' specifying the group to which each cell is assigned.
#' If \code{x} is a \linkS4class{SingleCellExperiment}, this defaults to \code{\link{colLabels}(x)} if available.
#' @param test.type String specifying the type of pairwise test to perform -
#' a t-test with \code{"t"}, a Wilcoxon rank sum test with \code{"wilcox"},
#' or a binomial test with \code{"binom"}.
#' @inheritParams combineMarkers
#' @param log.p A logical scalar indicating if log-transformed p-values/FDRs should be returned.
#' @param row.data A \linkS4class{DataFrame} containing additional row metadata for each gene in \code{x},
#' to be included in each of the output DataFrames.
#' This should generally have row names identical to those of \code{x}.
#'
#' Alternatively, a list containing one such DataFrame per level of \code{groups},
#' where each DataFrame contains group-specific metadata for each gene to be included in the appropriate output DataFrame.
#' @param add.summary Logical scalar indicating whether statistics from \code{\link{summaryMarkerStats}} should be added.
#' @param ... For the generic, further arguments to pass to specific methods.
#'
#' For the ANY method:
#' \itemize{
#' \item For \code{test.type="t"}, further arguments to pass to \code{\link{pairwiseTTests}}.
#' \item For \code{test.type="wilcox"}, further arguments to pass to \code{\link{pairwiseWilcox}}.
#' \item For \code{test.type="binom"}, further arguments to pass to \code{\link{pairwiseBinom}}.
#' }
#' Common arguments for all testing functions include \code{gene.names}, \code{direction},
#' \code{block} and \code{BPPARAM}.
#' Test-specific arguments are also supported for the appropriate \code{test.type}.
#'
#' For the SummarizedExperiment method, further arguments to pass to the ANY method.
#'
#' For the SingleCellExperiment method, further arguments to pass to the SummarizedExperiment method.
#' @param assay.type A string specifying which assay values to use, usually \code{"logcounts"}.
#'
#' @details
#' This function provides a convenience wrapper for marker gene identification between groups of cells,
#' based on running \code{\link{pairwiseTTests}} or related functions and passing the result to \code{\link{combineMarkers}}.
#' All of the arguments above are supplied directly to one of these two functions -
#' refer to the relevant function's documentation for more details.
#'
#' If \code{x} contains log-normalized expression values generated with a pseudo-count of 1,
#' it can be used in any of the pairwise testing procedures.
#' If \code{x} is scale-normalized but not log-transformed, it can be used with \code{test.type="wilcox"} and \code{test.type="binom"}.
#' If \code{x} contains raw counts, it can only be used with \code{test.type="binom"}.
#'
#' Note that \code{log.p} only affects the combined p-values and FDRs.
#' If \code{full.stats=TRUE}, the p-values for each individual pairwise comparison will always be log-transformed,
#' regardless of the value of \code{log.p}.
#' Log-transformed p-values and FDRs are reported using the natural base.
#'
#' The choice of \code{pval.type} determines whether the highly ranked genes are those that are DE between the current group and:
#' \itemize{
#' \item any other group (\code{"any"})
#' \item all other groups (\code{"all"})
#' \item some other groups (\code{"some"})
#' }
#' See \code{?\link{combineMarkers}} for more details.
#'
#' @return
#' A named list of \linkS4class{DataFrame}s, each of which contains a sorted marker gene list for the corresponding group.
#' In each DataFrame, the top genes are chosen to enable separation of that group from all other groups.
#' See \code{?\link{combineMarkers}} for more details on the output format.
#'
#' If \code{row.data} is provided, the additional fields are added to the front of the DataFrame for each cluster.
#' If \code{add.summary=TRUE}, extra statistics for each cluster are also computed and added.
#'
#' Any log-fold changes are reported as differences in average \code{x} between groups
#' (usually in base 2, depending on the transformation applied to \code{x}).
#'
#' @author
#' Aaron Lun
#'
#' @seealso
#' \code{\link{pairwiseTTests}},
#' \code{\link{pairwiseWilcox}},
#' \code{\link{pairwiseBinom}},
#' for the underlying functions that compute the pairwise DE statistics.
#'
#' \code{\link{combineMarkers}}, to combine pairwise statistics into a single marker list per cluster.
#'
#' \code{\link{summaryMarkerStats}}, to incorporate additional summary statistics per cluster.
#'
#' \code{\link{getMarkerEffects}}, to easily extract a matrix of effect sizes from each DataFrame.
#'
#' @examples
#' library(scuttle)
#' sce <- mockSCE()
#' sce <- logNormCounts(sce)
#'
#' # Any clustering method is okay, only using k-means for convenience.
#' kout <- kmeans(t(logcounts(sce)), centers=4)
#'
#' out <- findMarkers(sce, groups=kout$cluster)
#' names(out)
#' out[[1]]
#'
#' # More customization of the tests:
#' out <- findMarkers(sce, groups=kout$cluster, test.type="wilcox")
#' out[[1]]
#'
#' out <- findMarkers(sce, groups=kout$cluster, lfc=1, direction="up")
#' out[[1]]
#'
#' out <- findMarkers(sce, groups=kout$cluster, pval.type="all")
#' out[[1]]
#'
#' @name findMarkers
NULL
#' @importFrom BiocParallel SerialParam bpstart bpstop
#' @importFrom BiocGenerics cbind
#' @importFrom scuttle .bpNotSharedOrUp
.findMarkers <- function(x, groups, test.type=c("t", "wilcox", "binom"), ...,
pval.type=c("any", "some", "all"), min.prop=NULL, log.p=FALSE, full.stats=FALSE,
sorted=TRUE, row.data=NULL, add.summary=FALSE, BPPARAM=SerialParam())
{
test.type <- match.arg(test.type)
if (test.type=="t") {
FUN <- pairwiseTTests
effect.field <- "logFC"
} else if (test.type=="wilcox") {
FUN <- pairwiseWilcox
effect.field <- "AUC"
} else {
FUN <- pairwiseBinom
effect.field <- "logFC"
}
if (.bpNotSharedOrUp(BPPARAM)) {
bpstart(BPPARAM)
on.exit(bpstop(BPPARAM))
}
fit <- FUN(x, groups, ..., log.p=TRUE, BPPARAM=BPPARAM)
output <- combineMarkers(fit$statistics, fit$pairs, pval.type=pval.type, min.prop=min.prop,
log.p.in=TRUE, log.p.out=log.p, full.stats=full.stats, pval.field="log.p.value",
effect.field=effect.field, sorted=sorted, BPPARAM=BPPARAM)
if (add.summary) {
row.data <- summaryMarkerStats(x, groups, row.data=row.data, BPPARAM=BPPARAM)
}
.add_row_data(output, row.data, match.names=sorted)
}
#' @importClassesFrom S4Vectors DataFrame
.add_row_data <- function(output, row.data, match.names) {
if (is.null(row.data)) {
return(output)
}
for (i in names(output)) {
current <- output[[i]]
if (is.data.frame(row.data) || is(row.data, "DataFrame")) {
rd <- row.data
} else {
if (!i %in% names(row.data)) {
stop("list-like 'row.data' should be named with the levels of 'groups'")
}
rd <- row.data[[i]]
}
rn <- rownames(current)
if (match.names) {
if (is.null(rn) || !identical(sort(rn), sort(rownames(rd)))) {
stop("inconsistent or NULL row names for 'row.data' and result tables")
}
rd <- rd[rn,,drop=FALSE]
} else if (!identical(rn, rownames(rd))) {
stop("inconsistent row names for 'row.data' and result tables")
}
output[[i]] <- cbind(rd, current)
}
output
}
#' @export
#' @rdname findMarkers
setGeneric("findMarkers", function(x, ...) standardGeneric("findMarkers"))
#' @export
#' @rdname findMarkers
setMethod("findMarkers", "ANY", .findMarkers)
#' @export
#' @rdname findMarkers
#' @importFrom SummarizedExperiment assay
setMethod("findMarkers", "SummarizedExperiment", function(x, ..., assay.type="logcounts") {
.findMarkers(assay(x, i=assay.type), ...)
})
#' @export
#' @rdname findMarkers
#' @importFrom SingleCellExperiment colLabels
setMethod("findMarkers", "SingleCellExperiment", function(x, groups=colLabels(x, onAbsence="error"), ...) {
callNextMethod(x=x, groups=groups, ...)
})
|