File: combineMarkers.R

package info (click to toggle)
r-bioc-scran 1.26.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,692 kB
  • sloc: cpp: 733; makefile: 2
file content (328 lines) | stat: -rw-r--r-- 18,953 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
#' Combine pairwise DE results into a marker list
#' 
#' Combine multiple pairwise differential expression comparisons between groups or clusters into a single ranked list of markers for each cluster.
#' 
#' @param de.lists A list-like object where each element is a data.frame or \linkS4class{DataFrame}.
#' Each element should represent the results of a pairwise comparison between two groups/clusters,
#' in which each row should contain the statistics for a single gene/feature.
#' Rows should be named by the feature name in the same order for all elements.
#' @param pairs A matrix, data.frame or \linkS4class{DataFrame} with two columns and number of rows equal to the length of \code{de.lists}.
#' Each row should specify the pair of clusters being compared for the corresponding element of \code{de.lists}.
#' @param pval.field A string specifying the column name of each element of \code{de.lists} that contains the p-value.
#' @param effect.field A string specifying the column name of each element of \code{de.lists} that contains the effect size.
#' If \code{NULL}, effect sizes are not reported in the output.
#' @param pval.type A string specifying how p-values are to be combined across pairwise comparisons for a given group/cluster.
#' @param min.prop Numeric scalar specifying the minimum proportion of significant comparisons per gene,
#' Defaults to 0.5 when \code{pval.type="some"}, otherwise defaults to zero.
#' @param log.p.in A logical scalar indicating if the p-values in \code{de.lists} were log-transformed.
#' @param log.p.out A logical scalar indicating if log-transformed p-values/FDRs should be returned.
#' @param output.field A string specifying the prefix of the field names containing the effect sizes.
#' Defaults to \code{"stats"} if \code{full.stats=TRUE}, otherwise it is set to \code{effect.field}.
#' @param full.stats A logical scalar indicating whether all statistics in \code{de.lists} should be stored in the output for each pairwise comparison.
#' @param sorted Logical scalar indicating whether each output DataFrame should be sorted by a statistic relevant to \code{pval.type}.
#' @param flatten Logical scalar indicating whether the individual effect sizes should be flattened in the output DataFrame.
#' If \code{FALSE}, effect sizes are reported as a nested matrix for easier programmatic use.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object indicating whether and how parallelization should be performed across genes.
#' 
#' @return
#' A named \linkS4class{List} of \linkS4class{DataFrame}s where each DataFrame contains the consolidated marker statistics for each gene (row) for the cluster of the same name.
#' The DataFrame for cluster \eqn{X} contains the fields:
#' \describe{
#' \item{\code{Top}:}{Integer, the minimum rank across all pairwise comparisons - see \code{?\link{computeMinRank}} for details.
#' This is only reported if \code{pval.type="any"}.}
#' \item{\code{p.value}:}{Numeric, the combined p-value across all comparisons if \code{log.p.out=FALSE}.}
#' \item{\code{FDR}:}{Numeric, the BH-adjusted p-value for each gene if \code{log.p.out=FALSE}.}
#' \item{\code{log.p.value}:}{Numeric, the (natural) log-transformed version p-value.
#' Replaces the \code{p.value} field if \code{log.p.out=TRUE}.}
#' \item{\code{log.FDR}:}{Numeric, the (natural) log-transformed adjusted p-value.
#' Replaces the \code{FDR} field if \code{log.p.out=TRUE}.}
#' \item{\code{summary.<OUTPUT>}:}{Numeric, named by replacing \code{<OUTPUT>} with \code{output.field}.
#' This contains the summary effect size, obtained by combining effect sizes from all pairwise comparison into a single value.
#' Only reported when \code{effect.field} is not \code{NULL}.}
#' \item{\code{<OUTPUT>.Y}:}{Comparison-specific statistics, named by replacing \code{<OUTPUT>} with \code{output.field}.
#' One of these fields is present for every other cluster \eqn{Y} in \code{clusters} and contains statistics for the comparison of \eqn{X} to \eqn{Y}.
#' If \code{full.stats=FALSE}, each field is numeric and contains the effect size of the comparison of \eqn{X} over \eqn{Y}.
#' Otherwise, each field is a nested DataFrame containing the full statistics for that comparison (i.e., the same asthe corresponding entry of \code{de.lists}).
#' Only reported if \code{flatten=FALSE} and (for \code{full.stats=FALSE}) if \code{effect.field} is not \code{NULL}.
#' }
#' \item{\code{each.<OUTPUT>}:}{A nested DataFrame of comparison-specific statistics, named by replacing \code{<OUTPUT>} with \code{output.field}.
#' If \code{full.stats=FALSE}, one column is present for every other cluster \eqn{Y} in \code{clusters} and contains the effect size of the comparison of \eqn{X} to \eqn{Y}.
#' Otherwise, each column contains another nested DataFrame containing the full set of statistics for that comparison.
#' Only reported if \code{flatten=FALSE} and (for \code{full.stats=FALSE}) if \code{effect.field} is not \code{NULL}.
#' }
#' }
#' 
#' @details
#' An obvious strategy to characterizing differences between clusters is to look for genes that are differentially expressed (DE) between them.
#' However, this entails a number of comparisons between all pairs of clusters to comprehensively identify genes that define each cluster.
#' For all pairwise comparisons involving a single cluster, we would like to consolidate the DE results into a single list of candidate marker genes.
#' Doing so is the purpose of the \code{combineMarkers} function.
#'
#' DE statistics from any testing regime can be supplied to this function - see the Examples for how this is done with t-tests from \code{\link{pairwiseTTests}}.
#' The effect size field in the output will vary according to the type of input statistics, for example:
#' \itemize{
#' \item \code{logFC.Y} from \code{\link{pairwiseTTests}}, containing log-fold changes in mean expression (usually in base 2).
#' \item \code{AUC.Y} from \code{\link{pairwiseWilcox}}, containing the area under the curve, i.e., the concordance probability. 
#' \item \code{logFC.Y} from \code{\link{pairwiseBinom}}, containing log2-fold changes in the expressing proportion.
#' }
#' 
#' @section Consolidating with DE against any other cluster:
#' When \code{pval.type="any"}, each DataFrame is sorted by the min-rank in the \code{Top} column.
#' Taking all rows with \code{Top} values less than or equal to \eqn{T} yields a marker set containing the top \eqn{T} genes (ranked by significance) from each pairwise comparison.
#' This guarantees the inclusion of genes that can distinguish between any two clusters.
#' Also see \code{?\link{computeMinRank}} for more details on the rationale behind this metric.
#' 
#' For each gene and cluster, the summary effect size is defined as the effect size from the pairwise comparison with the lowest p-value.
#' The combined p-value is computed by applying Simes' method to all p-values.
#' Neither of these values are directly used for ranking and are only reported for the sake of the user.
#'
#' @section Consolidating with DE against all other clusters:
#' If \code{pval.type="all"}, the null hypothesis is that the gene is not DE in all contrasts.
#' A combined p-value for each gene is computed using Berger's intersection union test (IUT).
#' Ranking based on the IUT p-value will focus on genes that are DE in that cluster compared to \emph{all} other clusters.
#' This strategy is particularly effective when dealing with distinct clusters that have a unique expression profile.
#' In such cases, it yields a highly focused marker set that concisely captures the differences between clusters.
#' 
#' However, it can be too stringent if the cluster's separation is driven by combinations of gene expression.
#' For example, consider a situation involving four clusters expressing each combination of two marker genes A and B.
#' With \code{pval.type="all"}, neither A nor B would be detected as markers as it is not uniquely defined in any one cluster.
#' This is especially detrimental with overclustering where an otherwise acceptable marker is discarded if it is not DE between two adjacent clusters.
#'
#' For each gene and cluster, the summary effect size is defined as the effect size from the pairwise comparison with the \emph{largest} p-value.
#' This reflects the fact that, with this approach, a gene is only as significant as its weakest DE.
#' Again, this value is not directly used for ranking and are only reported for the sake of the user.
#'
#' @section Consolidating with DE against some other clusters:
#' The \code{pval.type="some"} setting serves as a compromise between \code{"all"} and \code{"any"}.
#' A combined p-value is calculated by taking the middlemost value of the Holm-corrected p-values for each gene.
#' (By default, this the median for odd numbers of contrasts and one-after-the-median for even numbers, but the exact proportion can be changed by setting \code{min.prop} - see \code{?\link{combineParallelPValues}}.)
#' Here, the null hypothesis is that the gene is not DE in at least half of the contrasts.
#' 
#' Genes are then ranked by the combined p-value.
#' The aim is to provide a more focused marker set without being overly stringent, though obviously it loses the theoretical guarantees of the more extreme settings.
#' For example, there is no guarantee that the top set contains genes that can distinguish a cluster from any other cluster, which would have been possible with \code{pval.type="any"}.
#'
#' For each gene and cluster, the summary effect size is defined as the effect size from the pairwise comparison with the \code{min.prop}-smallest p-value.
#' This mirrors the p-value calculation but, again, is reported only for the benefit of the user.
#'
#' @section Consolidating against some other clusters, rank-style:
#' A slightly different flavor of the \dQuote{some cluster} approach is achieved by setting \code{method="any"} with \code{min.prop} set to some positive value in (0, 1).
#' A gene will only be high-ranked if it is among the top-ranked genes in at least \code{min.prop} of the pairwise comparisons.
#' For example, if \code{min.prop=0.3}, any gene with a value of \code{Top} less than or equal to 5 will be in the top 5 DEGs of at least 30\% of the comparisons.
#' 
#' This method increases the stringency of the \code{"any"} setting in a safer manner than \code{pval.type="some"}.
#' Specifically, we avoid comparing p-values across pairwise comparisons, which can be problematic if there are power differences across comparisons, e.g., due to differences in the number of cells across the other clusters.
#'
#' Note that the value of \code{min.prop} does not affect the combined p-value and summary effect size calculations for \code{pval.type="any"}.
#'
#' @section Correcting for multiple testing:
#' The BH method is then applied on the consolidated p-values across all genes to obtain the \code{FDR} field.
#' The reported FDRs are intended only as a rough measure of significance.
#' Properly correcting for multiple testing is not generally possible when \code{clusters} is determined from the same \code{x} used for DE testing.
#' 
#' If \code{log.p=TRUE}, log-transformed p-values and FDRs will be reported.
#' This may be useful in over-powered studies with many cells, where directly reporting the raw p-values would result in many zeroes due to the limits of machine precision.
#' 
#' @section Ordering of the output:
#' \itemize{
#' \item Within each DataFrame, if \code{sorted=TRUE}, genes are ranked by the \code{Top} column if available and the \code{p.value} (or \code{log.p.value}) if not.
#' Otherwise, the input order of the genes is preserved.
#' \item For the DataFrame corresponding to cluster \eqn{X}, the \code{<OUTPUT>.Y} columns are sorted according to the order of cluster IDs in \code{pairs[,2]} for all rows where \code{pairs[,1]} is \eqn{X}.
#' \item In the output List, the DataFrames themselves are sorted according to the order of cluster IDs in \code{pairs[,1]}.
#' Note that DataFrames are only created for clusters present in \code{pairs[,1]}.
#' Clusters unique to \code{pairs[,2]} will only be present within a DataFrame as \eqn{Y}.
#' }
#' 
#' @seealso
#' \code{\link{pairwiseTTests}} and \code{\link{pairwiseWilcox}}, for functions that can generate \code{de.lists} and \code{pairs}.
#' 
#' \code{\link{findMarkers}}, which automatically performs \code{combineMarkers} on the t-test or Wilcoxon test results.
#' 
#' @references
#' Simes RJ (1986). 
#' An improved Bonferroni procedure for multiple tests of significance. 
#' \emph{Biometrika} 73:751-754.
#' 
#' Berger RL and Hsu JC (1996). 
#' Bioequivalence trials, intersection-union tests and equivalence confidence sets.
#' \emph{Statist. Sci.} 11, 283-319.
#' 
#' @author
#' Aaron Lun
#' 
#' @examples
#' library(scuttle)
#' sce <- mockSCE()
#' sce <- logNormCounts(sce)
#' 
#' # Any clustering method is okay.
#' kout <- kmeans(t(logcounts(sce)), centers=3)
#' clusters <- paste0("Cluster", kout$cluster)
#' 
#' out <- pairwiseTTests(logcounts(sce), groups=clusters)
#' comb <- combineMarkers(out$statistics, out$pairs)
#' comb[["Cluster1"]]
#' 
#' out <- pairwiseWilcox(logcounts(sce), groups=clusters)
#' comb <- combineMarkers(out$statistics, out$pairs, effect.field="AUC")
#' comb[["Cluster2"]]
#' 
#' out <- pairwiseBinom(logcounts(sce), groups=clusters)
#' comb <- combineMarkers(out$statistics, out$pairs)
#' comb[["Cluster3"]]
#'
#' @export
#' @importFrom S4Vectors SimpleList
#' @importFrom BiocParallel SerialParam bplapply
#' @importFrom metapod combineParallelPValues
combineMarkers <- function(de.lists, pairs, pval.field="p.value", effect.field="logFC", 
    pval.type=c("any", "some", "all"), min.prop=NULL, log.p.in=FALSE, log.p.out=log.p.in, 
    output.field=NULL, full.stats=FALSE, sorted=TRUE, flatten=TRUE, BPPARAM=SerialParam())
{
    if (length(de.lists)!=nrow(pairs)) {
        stop("'nrow(pairs)' must be equal to 'length(de.lists)'")
    }
    if (is.null(output.field)) {
        output.field <- if (full.stats) "stats" else effect.field
    }
    report.effects <- !is.null(effect.field)

    pval.type <- match.arg(pval.type)
    method <- switch(pval.type, any="simes", some="holm-min", all="berger")
    if (is.null(min.prop))  {
        min.prop <- if (pval.type=="any") 0 else 0.5
    }

    # Checking that all genes are the same across lists.
    gene.names <- NULL
    for (x in seq_along(de.lists)) {
        current <- de.lists[[x]]
        curnames <- rownames(current)

        if (is.null(gene.names)) {
            gene.names <- curnames
        } else if (!identical(gene.names, curnames)) {
            stop("row names should be the same for all elements of 'de.lists'")
        }
    }

    # Processing by the first element of each pair.
    first.fac <- factor(pairs[,1], unique(pairs[,1]))
    by.first <- split(seq_along(de.lists), first.fac, drop=TRUE)

    output <- bplapply(by.first, FUN=.combine_markers_internal,
        pairs=pairs, de.lists=de.lists, method=method, gene.names=gene.names, report.effects=report.effects,
        pval.type=pval.type, effect.field=effect.field, min.prop=min.prop, log.p.in=log.p.in, log.p.out=log.p.out, 
        pval.field=pval.field, output.field=output.field, full.stats=full.stats, sorted=sorted, flatten=flatten, 
        BPPARAM=BPPARAM)

    SimpleList(output)
}

#' @importFrom methods as
#' @importFrom stats p.adjust
#' @importFrom BiocGenerics cbind
#' @importFrom S4Vectors DataFrame I
.combine_markers_internal <- function(chosen, pairs, de.lists, method, gene.names, report.effects, 
    pval.type, effect.field, min.prop, log.p.in, log.p.out, pval.field, output.field, full.stats,
    sorted, flatten, BPPARAM=SerialParam())
{
    targets <- pairs[chosen, 2]
    cur.stats <- de.lists[chosen]

    keep <- !is.na(targets)
    targets <- targets[keep]
    cur.stats <- cur.stats[keep]

    all.p <- lapply(cur.stats, "[[", i=pval.field)
    pval <- combineParallelPValues(all.p, method=method, log.p=log.p.in, min.prop=min.prop)$p.value
    marker.set <- DataFrame(row.names=gene.names)

    # Determining rank.
    if (pval.type=="any") {
        ranked <- lapply(all.p, rank, ties.method="first", na.last="keep")
        min.rank <- compute_Top_statistic_from_ranks(ranked, min.prop)
        gene.order <- order(min.rank) 
        marker.set$Top <- min.rank
    } else {
        gene.order <- order(pval)
    }

    # Correcting for multiple testing. We try to preserve the log-ness as long as we can,
    # to avoid underflow upon exp()'ing that could be avoided by correction.
    if (log.p.in) {
        corrected <- .logBH(pval)
    } else {
        corrected <- p.adjust(pval, method="BH")
    }
    if (log.p.out!=log.p.in) {
        transFUN <- if (log.p.out) log else exp
        pval <- transFUN(pval)
        corrected <- transFUN(corrected)
    }
    
    prefix <- if (log.p.out) "log." else ""
    marker.set[[paste0(prefix, "p.value")]] <- pval 
    marker.set[[paste0(prefix, "FDR")]] <- corrected 

    if (report.effects) {
        all.effects <- lapply(cur.stats, "[[", i=effect.field)
        marker.set[[paste0("summary.", output.field)]] <- .choose_effect_size(all.p, all.effects, pval.type, min.prop)
    }

    # Saving effect sizes or all statistics.
    if (full.stats || report.effects) {
        if (full.stats) {
            cur.stats <- lapply(cur.stats, FUN=function(x) { I(DataFrame(x)) })
        } else {
            cur.stats <- all.effects
        }
        stat.df <- do.call(DataFrame, c(cur.stats, list(row.names=gene.names)))

        if (flatten) {
            colnames(stat.df) <- sprintf("%s.%s", output.field, targets)
            marker.set <- cbind(marker.set, stat.df)
        } else {
            colnames(stat.df) <- as.character(targets)
            marker.set[[paste0("each.", output.field)]] <- stat.df
        }
    }

    if (sorted) {
        marker.set <- marker.set[gene.order,,drop=FALSE]
    }
    marker.set
}

.choose_effect_size <- function(all.p, all.effects, pval.type, min.prop) {
    if (!length(all.p)) {
        return(numeric(0))
    }

    ngenes <- length(all.p[[1]])
    chosen <- rep(NA_real_, ngenes)

    if (pval.type=="all") {
        # Don't start from worst <- all.p[[i]] as this would
        # require another layer of protection against NA's.
        worst <- rep(-Inf, ngenes)
        for (i in seq_along(all.p)) {
            worse <- all.p[[i]] > worst & !is.na(all.p[[i]])
            chosen[worse] <- all.effects[[i]][worse]
            worst[worse] <- all.p[[i]][worse]
        }

    } else if (pval.type=="any") {
        best <- rep(Inf, ngenes)
        for (i in seq_along(all.p)) {
            better <- all.p[[i]] < best & !is.na(all.p[[i]])
            chosen[better] <- all.effects[[i]][better]
            best[better] <- all.p[[i]][better]
        }

    } else {
        chosen <- choose_middle_effect_size(all.p, all.effects, min.prop)
    }

    chosen
}