1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
|
#' Quick and dirty subclustering
#'
#' Performs a quick subclustering for all cells within each group.
#'
#' @param x A matrix of counts or log-normalized expression values (if \code{normalize=FALSE}),
#' where each row corresponds to a gene and each column corresponds to a cell.
#'
#' Alternatively, a \linkS4class{SummarizedExperiment} or \linkS4class{SingleCellExperiment} object containing such a matrix.
#' @param groups A vector of group assignments for all cells, usually corresponding to cluster identities.
#' @param normalize Logical scalar indicating whether each subset of \code{x} should be log-transformed prior to further analysis.
#' @param prepFUN A function that accepts a single \linkS4class{SingleCellExperiment} object and returns another \linkS4class{SingleCellExperiment} containing any additional elements required for clustering (e.g., PCA results).
#' @param min.ncells An integer scalar specifying the minimum number of cells in a group to be considered for subclustering.
#' @param clusterFUN A function that accepts a single \linkS4class{SingleCellExperiment} object and returns a vector of cluster assignments for each cell in that object.
#' @param BLUSPARAM A \linkS4class{BlusterParam} object that is used to specify the clustering via \code{\link{clusterRows}}.
#' Only used when \code{clusterFUN=NULL}.
#' @param format A string to be passed to \code{\link{sprintf}}, specifying how the subclusters should be named with respect to the parent level in \code{groups} and the level returned by \code{clusterFUN}.
#' @param assay.type String or integer scalar specifying the relevant assay.
#' @param ... For the generic, further arguments to pass to specific methods.
#'
#' For the ANY and SummarizedExperiment methods, further arguments to pass to the SingleCellExperiment method.
#' @param simplify Logical scalar indicating whether just the subcluster assignments should be returned.
#'
#' @return
#' By default, a named \linkS4class{List} of \linkS4class{SingleCellExperiment} objects.
#' Each object corresponds to a level of \code{groups} and contains a \code{"subcluster"} column metadata field with the subcluster identities for each cell.
#' The \code{\link{metadata}} of the List also contains \code{index}, a list of integer vectors specifying the cells in \code{x} in each returned SingleCellExperiment object;
#' and \code{subcluster}, a character vector of subcluster identities (see next).
#'
#' If \code{simplify=TRUE}, the character vector of subcluster identities is returned.
#' This is of length equal to \code{ncol(x)} and each entry follows the format defined in \code{format}.
#' (Unless the number of cells in the parent cluster is less than \code{min.cells}, in which case the parent cluster's name is used.)
#'
#' @details
#' \code{quickSubCluster} is a simple convenience function that loops over all levels of \code{groups} to perform subclustering.
#' It subsets \code{x} to retain all cells in one level and then runs \code{prepFUN} and \code{clusterFUN} to cluster them.
#' Levels with fewer than \code{min.ncells} are not subclustered and have \code{"subcluster"} set to the name of the level.
#'
#' The distinction between \code{prepFUN} and \code{clusterFUN} is that the former's calculations are preserved in the output.
#' For example, we would put the PCA in \code{prepFUN} so that the PCs are returned in the \code{\link{reducedDims}} for later use.
#' In contrast, \code{clusterFUN} is only used to obtain the subcluster assignments so any intermediate objects are lost.
#'
#' By default, \code{prepFUN} will run \code{\link{modelGeneVar}}, take the top 10% of genes with large biological components with \code{\link{getTopHVGs}}, and then run \code{\link{denoisePCA}} to perform the PCA.
#' \code{clusterFUN} will then perform clustering on the PC matrix with \code{\link{clusterRows}} and \code{BLUSPARAM}.
#' Either or both of these functions can be replaced with custom functions.
#'
#' % We use denoisePCA+modelGeneVar by default here, because we hope that each parent cluster is reasonably homogeneous.
#' % This allows us to assume that the trend is actually a good estimate of the technical noise.
#' % We don't use the other modelGeneVar*'s to avoid making assumptions about the available of spike-ins, UMI data, etc.
#'
#' The default behavior of this function is the same as running \code{\link{quickCluster}} on each subset with default parameters except for \code{min.size=0}.
#'
#' @author Aaron Lun
#'
#' @examples
#' library(scuttle)
#' sce <- mockSCE(ncells=200)
#'
#' # Lowering min.size for this small demo:
#' clusters <- quickCluster(sce, min.size=50)
#'
#' # Getting subclusters:
#' out <- quickSubCluster(sce, clusters)
#'
#' # Defining custom prep functions:
#' out2 <- quickSubCluster(sce, clusters,
#' prepFUN=function(x) {
#' dec <- modelGeneVarWithSpikes(x, "Spikes")
#' top <- getTopHVGs(dec, prop=0.2)
#' scater::runPCA(x, subset_row=top, ncomponents=25)
#' }
#' )
#'
#' # Defining custom cluster functions:
#' out3 <- quickSubCluster(sce, clusters,
#' clusterFUN=function(x) {
#' kmeans(reducedDim(x, "PCA"), sqrt(ncol(x)))$cluster
#' }
#' )
#'
#' @seealso
#' \code{\link{quickCluster}}, for a related function to quickly obtain clusters.
#'
#' @name quickSubCluster
NULL
#' @importFrom scuttle logNormCounts
#' @importFrom S4Vectors metadata<-
#' @importFrom BiocSingular bsparam
#' @importClassesFrom S4Vectors List
#' @importFrom bluster clusterRows NNGraphParam
.quick_sub_cluster <- function(x, groups, normalize=TRUE,
prepFUN=NULL, min.ncells=50, clusterFUN=NULL, BLUSPARAM=NNGraphParam(),
format="%s.%s", assay.type="counts", simplify=FALSE)
{
if (normalize) {
alt.assay <- "logcounts"
} else {
alt.assay <- assay.type
}
if (is.null(prepFUN)) {
prepFUN <- function(x) {
# Putting this check here to avoid skipping
# user-specified modifications.
if (ncol(x) < 2L) {
return(x)
}
# For consistency with quickCluster().
dec <- modelGeneVar(x, assay.type=alt.assay)
top <- getTopHVGs(dec, n=500, prop=0.1)
denoisePCA(x, dec, subset.row=top, assay.type=alt.assay)
}
}
if (is.null(clusterFUN)) {
clusterFUN <- function(x) {
clusterRows(reducedDim(x, "PCA"), BLUSPARAM)
}
}
all.sce <- collated <- list()
by.group <- split(seq_along(groups), groups)
for (i in names(by.group)) {
y <- x[,by.group[[i]]]
if (normalize) {
y <- logNormCounts(y, exprs_values=assay.type)
}
y <- prepFUN(y)
if (ncol(y) >= min.ncells) {
clusters <- clusterFUN(y)
clusters <- sprintf(format, i, clusters)
} else {
clusters <- rep(i, ncol(y))
}
y$subcluster <- clusters
all.sce[[i]] <- y
collated[[i]] <- clusters
}
all.clusters <- rep(NA_character_, ncol(x))
all.clusters[unlist(by.group)] <- unlist(collated)
if (simplify) {
all.clusters
} else {
output <- as(all.sce, "List")
metadata(output) <- list(index=by.group, subcluster=all.clusters)
output
}
}
############################
# S4 method definitions
############################
# NOTE: normally, I would have dispatch flow "downwards" towards base
# classes or simpler objects. However, in this case, I have implemented
# it to flow upwards because prepFUN and clustFUN expect SCE inputs.
# This means that everything ends up being promoted to an SCE anyway,
# and it's too hard to require users write their functions endomorphically.
#' @export
#' @rdname quickSubCluster
setGeneric("quickSubCluster", function(x, ...) standardGeneric("quickSubCluster"))
#' @export
#' @rdname quickSubCluster
#' @importFrom SingleCellExperiment SingleCellExperiment
setMethod("quickSubCluster", "ANY", function(x, normalize=TRUE, ...)
{
assays <- list(x)
assay.type <- if (normalize) "counts" else "logcounts"
names(assays) <- assay.type
.quick_sub_cluster(SingleCellExperiment(assays), normalize=normalize, assay.type=assay.type, ...)
})
#' @export
#' @rdname quickSubCluster
#' @importFrom methods as
#' @importClassesFrom SingleCellExperiment SingleCellExperiment
setMethod("quickSubCluster", "SummarizedExperiment", function(x, ...) {
.quick_sub_cluster(as(x, "SingleCellExperiment"), ...)
})
#' @export
#' @rdname quickSubCluster
setMethod("quickSubCluster", "SingleCellExperiment", .quick_sub_cluster)
|