File: scaledColRanks.R

package info (click to toggle)
r-bioc-scran 1.18.5%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 1,856 kB
  • sloc: cpp: 960; sh: 13; makefile: 2
file content (116 lines) | stat: -rw-r--r-- 4,626 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#' Compute scaled column ranks
#'
#' Compute scaled column ranks from each cell's expression profile for distance calculations based on rank correlations.
#' 
#' @param x A numeric matrix-like object containing cells in columns and features in the rows.
#' @param subset.row A logical, integer or character scalar indicating the rows of \code{x} to use, see \code{?"\link{scran-gene-selection}"}.
#' @param min.mean A numeric scalar specifying the filter to be applied on the average normalized count for each feature prior to computing ranks.
#' Disabled by setting to \code{NULL}.
#' @param transposed A logical scalar specifying whether the output should be transposed.
#' @param as.sparse A logical scalar indicating whether the output should be sparse.
#' @param withDimnames A logical scalar specifying whether the output should contain the dimnames of \code{x}.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying whether and how parallelization should be performed.
#' Currently only used for filtering if \code{min.mean} is not provided.
#' 
#' @return
#' A matrix of the same dimensions as \code{x}, where each column contains the centred and scaled ranks of the expression values for each cell.
#' If \code{transposed=TRUE}, this matrix is transposed so that rows correspond to cells.
#' If \code{as.sparse}, the columns are not centered to preserve sparsity.
#' 
#' @details
#' Euclidean distances computed based on the output rank matrix are equivalent to distances computed from Spearman's rank correlation.
#' This can be used in clustering, nearest-neighbour searches, etc. as a robust alternative to Euclidean distances computed directly from \code{x}. 
#' 
#' If \code{as.sparse=TRUE}, the most common average rank is set to zero in the output.
#' This can be useful for highly sparse input data where zeroes have the same rank and are themselves returned as zeroes.
#' Obviously, this means that the ranks are not centred, so this will have to be done manually prior to any downstream distance calculations.
#' 
#' @author
#' Aaron Lun
#' 
#' @seealso
#' \code{\link{quickCluster}}, where this function is used.
#' 
#' @examples
#' library(scuttle)
#' sce <- mockSCE()
#' rout <- scaledColRanks(counts(sce), transposed=TRUE)
#' 
#' # For use in clustering:
#' d <- dist(rout)
#' table(cutree(hclust(d), 4))
#' 
#' g <- buildSNNGraph(rout, transposed=TRUE)
#' table(igraph::cluster_walktrap(g)$membership)
#' 
#' @export
#' @importFrom scuttle calculateAverage .subset2index .bpNotSharedOrUp
#' @importFrom BiocParallel SerialParam bpstart bpstop
scaledColRanks <- function(x, subset.row=NULL, min.mean=NULL, transposed=FALSE, as.sparse=FALSE, 
    withDimnames=TRUE, BPPARAM=SerialParam())
{
    if (.bpNotSharedOrUp(BPPARAM)) {
        bpstart(BPPARAM)
        on.exit(bpstop(BPPARAM))
    }

    subset.row <- .subset2index(subset.row, x, byrow=TRUE)
    if (!is.null(min.mean) && all(dim(x)>0L)) {
        further.subset <- calculateAverage(x, subset_row=subset.row, BPPARAM=BPPARAM) >= min.mean
        subset.row <- subset.row[further.subset]
    }

    out <- colBlockApply(x[subset.row,,drop=FALSE], FUN=.get_scaled_ranks, 
        transposed=transposed, as.sparse=as.sparse, BPPARAM=BPPARAM)

    if (transposed) {
        rkout <- do.call(rbind, out)
    } else {
        rkout <- do.call(cbind, out)
    }

    if (withDimnames && !is.null(dimnames(x))) {
        dn <- list(rownames(x)[subset.row], colnames(x))
        if (transposed) { 
            dn <- rev(dn)
        }
        dimnames(rkout) <- dn
    }
    rkout
}

#' @importClassesFrom Matrix sparseMatrix dgCMatrix
#' @importFrom DelayedMatrixStats colRanks rowSds
#' @importFrom DelayedArray rowMins
#' @importFrom Matrix rowMeans
#' @importFrom DelayedArray getAutoBPPARAM setAutoBPPARAM
.get_scaled_ranks <- function(block, transposed, as.sparse) {
    if (is(block, "SparseArraySeed")) {
        block <- as(block, "sparseMatrix")
    }

    old <- getAutoBPPARAM()
    setAutoBPPARAM(NULL) # turning off any additional parallelization, just in case.
    on.exit(setAutoBPPARAM(old))

    out <- colRanks(DelayedArray(block), ties.method="average", preserveShape=FALSE)

    sig <- sqrt(rowVars(out) * (ncol(out)-1)) * 2
    if (any(is.na(sig) | sig==0)) {
        stop("rank variances of zero detected for a cell")
    }

    if (as.sparse) {
        out <- out - rowMins(DelayedArray(out)) # TODO: switch to MatGen once this is available.
        out <- as(out, "dgCMatrix")
    } else {
        out <- out - rowMeans(out)
    }

    out <- out/sig

    if (!transposed) {
        out <- t(out)
    }
    out
}