File: test_de.R


#' Test for Differential Expression
#'
#' Conduct a quasi-likelihood ratio test for a Gamma-Poisson fit.
#'
#' @inheritParams glm_gp
#' @param fit object of class `glmGamPoi`. Usually the result of calling `glm_gp(data, ...)`
#' @param contrast The contrast to test. Can be a single column name (quoted or as a string)
#'   that is removed from the full model matrix of `fit`. Or a complex contrast comparing two
#'   or more columns: e.g. `A - B`, `"A - 3 * B"`, `(A + B) / 2 - C` etc. \cr
#'   Only one of `contrast` or `reduced_design` must be specified.
#' @param reduced_design a specification of the reduced design used as a comparison to see
#'   how much better `fit` describes the data.
#'   Analogous to the `design` parameter in `glm_gp()`, it can be either a `formula`,
#'   a `model.matrix()`, or a `vector`. \cr
#'    Only one of `contrast` or `reduced_design` must be specified.
#' @param full_design option to specify an alternative `full_design` that can differ from
#'   `fit$model_matrix`. Can be a `formula` or a `matrix`. Default: `fit$model_matrix`
#' @param subset_to a vector with the same length as `ncol(fit$data)` or an expression
#'   that evaluates to such a vector. The expression can reference columns from `colData(fit$data)`.
#'   A typical use case in single cell analysis would be to subset to a specific cell type
#'   (e.g. `subset_to = cell_type == "T-cells"`). Note that if this argument is set,
#'   the model for the `full_design` is re-fit.\cr
#'   Default: `NULL` which means that the data is not subset.
#' @param pseudobulk_by a vector with the same length as `ncol(fit$data)` that is used to
#'   split the columns into different groups (calls [split()]). `pseudobulk_by` can also be an
#'   expression that evaluates to a vector. The expression can reference columns from `colData(fit$data)`. \cr
#'   The counts are summed across the groups
#'   to create "pseudobulk" samples. This is typically used in single cell analysis if the cells come
#'   from different samples to get a proper estimate of the differences. This is particularly powerful
#'   in combination with the `subset_to` parameter to analyze differences between samples for
#'   subgroups of cells. Note that this does a fresh fit for both the full and the reduced design.
#'   Default: `NULL` which means that the data is not aggregated.
#' @param pval_adjust_method one of the p-value adjustment methods from
#'   [p.adjust.methods]. Default: `"BH"`.
#' @param sort_by the name of the column or an expression used to sort the result. If `sort_by` is `NULL`
#'   the table is not sorted. Default: `NULL`
#' @param decreasing boolean to decide if the result is sorted in increasing or decreasing
#'   order. Default: `FALSE`.
#' @param n_max the maximum number of rows to return. Default: `Inf` which means that all
#'   rows are returned.
#'
#' @return a `data.frame` with the following columns
#' \describe{
#'   \item{name}{the rownames of the input data}
#'   \item{pval}{the p-values of the quasi-likelihood ratio test}
#'   \item{adj_pval}{the adjusted p-values returned from [p.adjust()]}
#'   \item{f_statistic}{the F-statistic: \eqn{F = (Dev_red - Dev_full) / (df_1 * disp_ql_shrunken)}}
#'   \item{df1}{the degrees of freedom of the test: `ncol(design) - ncol(reduced_design)`}
#'   \item{df2}{the degrees of freedom of the fit: `ncol(data) - ncol(design) + df_0`}
#'   \item{lfc}{the log2-fold change. If the alternative model is specified via `reduced_design`,
#'    this is `NA`.}
#' }
#'
#' @examples
#'   Y <- matrix(rnbinom(n = 30 * 100, mu = 4, size = 0.3), nrow = 30, ncol = 100)
#'   annot <- data.frame(sample = sample(LETTERS[1:6], size = 100, replace = TRUE),
#'                       cont1 = rnorm(100), cont2 = rnorm(100, mean = 30))
#'   annot$condition <- ifelse(annot$sample %in% c("A", "B", "C"), "ctrl", "treated")
#'   head(annot)
#'   se <- SummarizedExperiment::SummarizedExperiment(Y, colData = annot)
#'   fit <- glm_gp(se, design = ~ condition + cont1 + cont2)
#'
#'   # Test with reduced design
#'   res <- test_de(fit, reduced_design = ~ condition + cont1)
#'   head(res)
#'
#'   # Test with the contrast argument; the results are identical
#'   res2 <- test_de(fit, contrast = cont2)
#'   head(res2)
#'
#'   # The column names of fit$Beta are valid variables in the contrast argument
#'   colnames(fit$Beta)
#'
#'
#'   # You can also have more complex contrasts:
#'   # the following compares cont1 vs cont2:
#'   test_de(fit, cont1 - cont2, n_max = 4)
#'
#'
#'   # You can also sort the output
#'   test_de(fit, cont1 - cont2, n_max = 4,
#'           sort_by = "pval")
#'
#'   test_de(fit, cont1 - cont2, n_max = 4,
#'           sort_by = - abs(f_statistic))
#'
#'   # If the data has multiple samples, it is a good
#'   # idea to aggregate the cell counts by samples.
#'   # This is called "pseudobulk".
#'   test_de(fit, contrast = "conditiontreated", n_max = 4,
#'           pseudobulk_by = sample)
#'
#'
#'   # You can also do the pseudobulk only on a subset of cells:
#'   cell_types <- sample(c("Tcell", "Bcell", "Macrophages"), size = 100, replace = TRUE)
#'   test_de(fit, contrast = "conditiontreated", n_max = 4,
#'           pseudobulk_by = sample,
#'           subset_to = cell_types == "Bcell")
#'
#'
#'   # Be careful: if you included the cell type information in
#'   # the original fit, the design matrix would be degenerate
#'   # after subsetting. To fix this, specify the full_design in 'test_de()'.
#'   SummarizedExperiment::colData(se)$ct <- cell_types
#'   fit_with_celltype <- glm_gp(se, design = ~ condition + cont1 + cont2 + ct)
#'   test_de(fit_with_celltype, contrast = cont1, n_max = 4,
#'           full_design =  ~ condition + cont1 + cont2,
#'           pseudobulk_by = sample,
#'           subset_to = ct == "Bcell")
#'
#'
#'
#' @references
#' * Lund, S. P., Nettleton, D., McCarthy, D. J., & Smyth, G. K. (2012). Detecting differential expression
#'   in RNA-sequence data using quasi-likelihood with shrunken dispersion estimates. Statistical
#'   Applications in Genetics and Molecular Biology, 11(5).
#'   [https://doi.org/10.1515/1544-6115.1826](https://doi.org/10.1515/1544-6115.1826).
#'
#' @seealso [glm_gp()]
#'
#' @export
test_de <- function(fit,
                    contrast,
                    reduced_design = NULL,
                    full_design = fit$model_matrix,
                    subset_to = NULL, pseudobulk_by = NULL,
                    pval_adjust_method = "BH", sort_by = NULL,
                    decreasing = FALSE, n_max = Inf,
                    verbose = FALSE){
  # Capture all NSE variables
  contrast_capture <- substitute(contrast)
  subset_to_capture <- substitute(subset_to)
  pseudobulk_by_capture <- substitute(pseudobulk_by)
  sort_by_capture <- substitute(sort_by)

  test_de_q(fit, contrast = contrast_capture, reduced_design = reduced_design, full_design = full_design,
            subset_to = subset_to_capture, pseudobulk_by = pseudobulk_by_capture,
            pval_adjust_method = pval_adjust_method, sort_by = sort_by_capture,
            decreasing = decreasing, n_max = n_max,
            verbose = verbose,
            env = parent.frame())
}
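
# Illustrative sketch (not package code): test_de() above only captures its
# NSE arguments with substitute() and hands the unevaluated expressions to
# test_de_q(), which later evaluates them against colData(fit$data) via
# eval_with_q(). A minimal, self-contained version of that capture/eval
# pattern, using a hypothetical data.frame `df`, could look like this:
#
#   capture <- function(expr) substitute(expr)
#   eval_in_data <- function(quoted, data, env = parent.frame()) {
#     if (is.null(quoted)) NULL else eval(quoted, envir = data, enclos = env)
#   }
#   df <- data.frame(cell_type = c("Bcell", "Tcell", "Bcell"))
#   q <- capture(cell_type == "Bcell")
#   eval_in_data(q, df)   # TRUE FALSE TRUE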

# This function is necessary to handle the NSE stuff
# for an explanation see:
# http://adv-r.had.co.nz/Computing-on-the-language.html
test_de_q <- function(fit,
                      contrast,
                      reduced_design = NULL,
                      full_design = fit$model_matrix,
                      subset_to = NULL, pseudobulk_by = NULL,
                      pval_adjust_method = "BH", sort_by = NULL,
                      decreasing = FALSE, n_max = Inf,
                      verbose = FALSE,
                      env = parent.frame()){
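  # If the supplied `full_design` is not literally the model matrix stored in
  # `fit`, rebuild the model matrix (e.g. from a formula) against the colData
  # of `fit$data` before it is used below.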
  if(! is.matrix(full_design) || length(full_design) != length(fit$model_matrix) || ! all(full_design == fit$model_matrix) ){
    full_design <- handle_design_parameter(full_design, fit$data, SummarizedExperiment::colData(fit$data), NULL)$model_matrix
  }
  if(is.null(reduced_design) == missing(contrast)){
    stop("Please provide either an alternative design (formula or matrix) or a contrast ",
         "(name of a column in fit$model_matrix or a combination of them).")
  }

  pseudobulk_by_e <- eval_with_q(pseudobulk_by, SummarizedExperiment::colData(fit$data), env = env)
  subset_to_e <- eval_with_q(subset_to, SummarizedExperiment::colData(fit$data), env = env)
  if(! is.null(pseudobulk_by_e)){
    if(verbose) { message("Aggregate counts of columns that belong to the same sample") }
    # This method aggregates the data
    # then does a fresh fit for the full model
    # then calls test_de() with the reduced dataset
    return(test_pseudobulk_q(fit$data, design = full_design,
                             aggregate_cells_by = pseudobulk_by_e,
                             contrast = contrast,
                             reduced_design = reduced_design,
                             subset_to = subset_to_e,
                             pval_adjust_method = pval_adjust_method, sort_by = sort_by,
                             decreasing = decreasing, n_max = n_max,
                             verbose = verbose, env = env))
  }

  if(! is.null(subset_to_e)){
    if(verbose) { message("Subset the dataset, thus a new model is fitted!") }
    # Only run test on a subset of things:
    # Redo fit, but keep dispersion
    data_subset <- fit$data[,subset_to_e,drop=FALSE]

    model_matrix_subset <- full_design[subset_to_e, ,drop=FALSE]
    size_factor_subset <- fit$size_factors[subset_to_e]

    fit_subset <- glm_gp(data_subset, design = model_matrix_subset, size_factors = size_factor_subset,
                         overdispersion = fit$overdispersions, on_disk = is_on_disk.glmGamPoi(fit),
                         verbose = verbose)
    test_res <- test_de_q(fit_subset, contrast = contrast, reduced_design = reduced_design,
                          subset_to = NULL, pseudobulk_by = NULL,
                          pval_adjust_method = pval_adjust_method, sort_by = sort_by,
                          decreasing = decreasing, n_max = n_max,
                          verbose = verbose, env = env)
    return(test_res)
  }
  if(is.null(fit$overdispersion_shrinkage_list)){
    stop("fit$overdispersion_shrinkage_list is NULL. Run 'glm_gp' with ",
         "'overdispersion_shrinkage = TRUE'.")
  }
  disp_trend <- fit$overdispersion_shrinkage_list$dispersion_trend
  if(! missing(contrast)){
    # e.g. a vector with c(1, 0, -1, 0) for contrast = A - C
    cntrst <- parse_contrast_q(contrast, levels = colnames(fit$model_matrix),
                               env = env)
    # The modifying matrix of reduced_design has ncol(model_matrix) - 1 columns and full column rank.
    # The columns are all orthogonal to cntrst.
    # see: https://scicomp.stackexchange.com/a/27835/36204
    # Think about this as a rotation of the design matrix. The QR decomposition approach
    # has the added benefit that the new columns are all orthogonal to each other, which
    # isn't necessary, but makes fitting more robust.
    # The following is a simplified version of edgeR's glmLRT (line 159 in glmfit.R)
    qrc <- qr(cntrst)
    rot <- qr.Q(qrc, complete = TRUE)[,-1,drop=FALSE]
    reduced_design <- fit$model_matrix %*% rot
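    # Illustrative sketch (assumptions, not package code): for a design with
    # columns (Intercept, A, B) and contrast A - B, i.e. cntrst = c(0, 1, -1),
    # qr.Q(qr(cntrst), complete = TRUE)[, -1] yields two columns orthogonal to
    # the contrast, so the rotated reduced design can represent every mean
    # structure except a difference between A and B:
    #
    #   cntrst_demo <- c(0, 1, -1)
    #   rot_demo <- qr.Q(qr(cntrst_demo), complete = TRUE)[, -1, drop = FALSE]
    #   crossprod(rot_demo, cntrst_demo)    # numerically zero
    #
    # fit$Beta is on the natural-log scale, so the contrast estimate is divided
    # by log(2) below to obtain a log2 fold change.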
    lfc <- fit$Beta %*% cntrst / log(2)
  }else{
    lfc <- NA
  }
  if(verbose){message("Fit reduced model")}
  data <- fit$data
  do_on_disk <- is_on_disk.glmGamPoi(fit)
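  # Fit the reduced (null) model. The dispersion trend from the full fit is
  # reused as a fixed overdispersion and shrinkage is turned off, rather than
  # re-estimating the dispersion on the reduced design.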
  fit_alt <- glm_gp(data, design = reduced_design,
                    size_factors = fit$size_factors,
                    overdispersion = disp_trend,
                    overdispersion_shrinkage = FALSE,
                    on_disk = do_on_disk)

  if(ncol(fit_alt$model_matrix) >= ncol(fit$model_matrix)){
    stop("The reduced model is as complex (or even more complex) than ",
         "the 'fit' model. The 'reduced_design' should contain fewer terms ",
         "the original 'design'.")
  }

  # Likelihood ratio
  if(verbose){message("Calculate quasi likelihood ratio")}
  lr <- fit_alt$deviances - fit$deviances
  df_test <- ncol(fit$model_matrix) - ncol(fit_alt$model_matrix)
  df_test <- ifelse(df_test == 0, NA, df_test)
  df_fit <- fit$overdispersion_shrinkage_list$ql_df0 + (ncol(data) - ncol(fit_alt$model_matrix))
  f_stat <- lr / df_test / fit$overdispersion_shrinkage_list$ql_disp_shrunken
  pval <- pf(f_stat, df_test, df_fit, lower.tail = FALSE, log.p = FALSE)
  adj_pval <- p.adjust(pval, method = pval_adjust_method)

  names <- rownames(data)
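  # Fall back to zero-padded placeholder names (e.g. "row_01") when the input
  # data has no rownames.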
  if(is.null(names)){
    names <- sprintf(paste0("row_%0", floor(log10(nrow(data))), "i"), seq_len(nrow(data)))
  }
  if(verbose){message("Preprare results")}
  res <- data.frame(name = names, pval = pval, adj_pval = adj_pval,
                    f_statistic = f_stat, df1 = df_test, df2 = df_fit,
                    lfc = lfc,
                    stringsAsFactors = FALSE, row.names = NULL)
  sort_by_e <- eval_with_q(sort_by, res, env = env)
  res <- if(is.null(sort_by_e)) {
    res
  }else{
    res[order(sort_by_e, decreasing = decreasing), ]
  }
  res[seq_len(min(nrow(res), n_max)), ,drop=FALSE]
}