File: glm_gp.R

package info (click to toggle)
r-bioc-glmgampoi 1.2.0%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 704 kB
  • sloc: cpp: 523; ansic: 114; sh: 13; makefile: 2
file content (483 lines) | stat: -rw-r--r-- 24,143 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483


#' Fit a Gamma-Poisson Generalized Linear Model
#'
#' This function provides a simple to use interface to fit Gamma-Poisson generalized
#' linear models. It works equally well for small scale (a single model) and large scale data
#' (e.g. thousands of rows and columns, potentially stored on disk). The function
#' automatically determines the appropriate size factors for each sample and efficiently
#' finds the best overdispersion parameter for each gene.
#'
#' @param data any matrix-like object (e.g. [matrix], [DelayedArray], [HDF5Matrix]) or
#'   anything that can be cast to a [SummarizedExperiment] (e.g. `MSnSet`, `eSet` etc.) with
#'   one column per sample and row per gene.
#' @param design a specification of the experimental design used to fit the Gamma-Poisson GLM.
#'   It can be a [model.matrix()] with one row for each sample and one column for each
#'   coefficient. \cr
#'   Alternatively, `design` can be a `formula`. The entries in the
#'   formula can refer to global objects, columns in the `col_data` parameter, or the `colData(data)`
#'   of `data` if it is a `SummarizedExperiment`. \cr
#'   The third option is that `design` is a vector where each element specifies to which
#'   condition a sample belongs. \cr
#'   Default: `design = ~ 1`, which means that all samples are treated as if they belong to the
#'   same condition. Note that this is the fasted option.
#' @param col_data a dataframe with one row for each sample in `data`. Default: `NULL`.
#' @param reference_level a single string that specifies which level is used as reference
#'   when the model matrix is created. The reference level becomes the intercept and all
#'   other coefficients are calculated with respect to the `reference_level`.
#'   Default: `NULL`.
#' @param offset Constant offset in the model in addition to `log(size_factors)`. It can
#'   either be a single number, a vector of length `ncol(data)` or a matrix with the
#'   same dimensions as `dim(data)`. Note that if data is a [DelayedArray] or [HDF5Matrix],
#'   `offset` must be as well. Default: `0`.
#' @param size_factors in large scale experiments, each sample is typically of different size
#'   (for example different sequencing depths). A size factor is an internal mechanism of GLMs to
#'   correct for this effect.\cr
#'   `size_factors` is either a numeric vector with positive entries that has the same lengths as columns in the data
#'   that specifies the size factors that are used.
#'   Or it can be a string that species the method that is used to estimate the size factors
#'   (one of \code{c("normed_sum", "deconvolution", "poscounts")}).
#'   Note that \code{"normed_sum"} and \code{"poscounts"} are fairly
#'   simple methods and can lead to suboptimal results. For the best performance, I recommend to use
#'   `size_factors = "deconvolution"` which calls `scran::calculateSumFactors()`. However, you need
#'   to separately install the `scran` package from Bioconductor for this method to work.
#'   Also note that `size_factors = 1` and `size_factors = FALSE` are equivalent. If only a single gene is given,
#'   no size factor is estimated (ie. `size_factors = 1`). Default: `"normed_sum"`.
#' @param overdispersion the simplest count model is the Poisson model. However, the Poisson model
#'   assumes that \eqn{variance = mean}. For many applications this is too rigid and the Gamma-Poisson
#'   allows a more flexible mean-variance relation (\eqn{variance = mean + mean^2 * overdispersion}). \cr
#'   `overdispersion` can either be
#'   \itemize{
#'      \item a single boolean that indicates if an overdispersion is estimated for each gene.
#'      \item a numeric vector of length `nrow(data)` fixing the overdispersion to those values.
#'      \item the string `"global"` to indicate that one dispersion is fit across all genes.
#'   }
#'   Note that `overdispersion = 0` and `overdispersion = FALSE` are equivalent and both reduce
#'   the Gamma-Poisson to the classical Poisson model. Default: `TRUE`.
#' @param overdispersion_shrinkage the overdispersion can be difficult to estimate with few replicates. To
#'   improve the overdispersion estimates, we can share information across genes and shrink each individual
#'   overdispersion estimate towards a global overdispersion estimate. Empirical studies show however that
#'   the overdispersion varies based on the mean expression level (lower expression level => higher
#'   dispersion). If `overdispersion_shrinkage = TRUE`, a median trend of dispersion and expression level is
#'   fit and used to estimate the variances of a quasi Gamma Poisson model (Lund et al. 2012). Default: `TRUE`.
#' @param do_cox_reid_adjustment the classical maximum likelihood estimator of the `overdisperion` is biased
#'   towards small values. McCarthy _et al._ (2012) showed that it is preferable to optimize the Cox-Reid
#'   adjusted profile likelihood.\cr
#'   `do_cox_reid_adjustment` can be either be `TRUE` or `FALSE` to indicate if the adjustment is
#'   added during the optimization of the `overdispersion` parameter. Default: `TRUE`.
#' @param subsample the estimation of the overdispersion is the slowest step when fitting
#'   a Gamma-Poisson GLM. For datasets with many samples, the estimation can be considerably sped up
#'   without loosing much precision by fitting the overdispersion only on a random subset of the samples.
#'   Default: `FALSE` which means that the data is not subsampled. If set to `TRUE`, at most 1,000 samples
#'   are considered. Otherwise the parameter just specifies the number of samples that are considered
#'   for each gene to estimate the overdispersion.
#' @param on_disk a boolean that indicates if the dataset is loaded into memory or if it is kept on disk
#'   to reduce the memory usage. Processing in memory can be significantly faster than on disk.
#'   Default: `NULL` which means that the data is only processed in memory if `data` is an in-memory
#'   data structure.
#' @param verbose a boolean that indicates if information about the individual steps are printed
#'   while fitting the GLM. Default: `FALSE`.
#'
#'
#' @details
#' The method follows the following steps:
#'
#' 1. The size factors are estimated.\cr
#'    If `size_factors = "normed_sum"`, the column-sum for each cell is calculated and the resulting
#'    size factors are normalized so that their geometric mean is `1`. If `size_factors = "poscounts"`,
#'    a slightly adapted version of the procedure proposed by Anders and Huber (2010) in
#'    equation (5) is used. To handle the large number of zeros the geometric means are calculated for
#'    \eqn{Y + 0.5} and ignored during the calculation of the median. Columns with all zeros get a
#'    default size factor of \eqn{0.001}. If `size_factors = "deconvolution"`, the method
#'    `scran::calculateSumFactors()` is called.
#' 2. The dispersion estimates are initialized based on the moments of each row of \eqn{Y}.
#' 3. The coefficients of the model are estimated.\cr
#'    If all samples belong to the same condition (i.e. `design = ~ 1`), the betas are estimated using
#'    a quick Newton-Raphson algorithm. This is similar to the behavior of `edgeR`. For more complex
#'    designs, the general Fisher-scoring algorithm is used. Here, the code is based on a fork  of the
#'    internal function `fitBeta()` from `DESeq2`. It does however contain some modification to make
#'    the fit more robust and faster.
#' 4. The mean for each gene and sample is calculate.\cr
#'    Note that this step can be very IO intensive if `data` is or contains a DelayedArray.
#' 5. The overdispersion is estimated.\cr
#'    The classical method for estimating the overdispersion for each gene is to maximize the
#'    Gamma-Poisson log-likelihood by iterating over each count and summing the the corresponding
#'    log-likelihood. It is however, much more efficient
#'    for genes with many small counts to work on the contingency table of the counts. Originally, this
#'    approach had already been used by Anscombe (1950). In this package, I have implemented an
#'    extension of their method that can handle general offsets.\cr
#'    See also [overdispersion_mle()].
#'  6. The beta coefficients are estimated once more with the updated overdispersion estimates
#'  7. The mean for each gene and sample is calculated again.
#'
#' This method can handle not just in memory data, but also data stored on disk. This is essential for
#' large scale datasets with thousands of samples, as they sometimes encountered in modern single-cell
#' RNA-seq analysis. `glmGamPoi` relies on the `DelayedArray` and `beachmat` package to efficiently
#' implement the access to the on-disk data.
#'
#' @return
#' The method returns a list with the following elements:
#' \describe{
#'   \item{`Beta`}{a matrix with dimensions `nrow(data) x n_coefficients` where `n_coefficients` is
#'   based on the `design` argument. It contains the estimated coefficients for each gene.}
#'   \item{`overdispersions`}{a vector with length `nrow(data)`. The overdispersion parameter for each
#'   gene. It describes how much more the counts vary than one would expect according to the Poisson
#'   model.}
#'   \item{`overdispersion_shrinkage_list`}{a list with additional information from the quasi-likelihood
#'   shrinkage. For details see [overdispersion_shrinkage()].}
#'   \item{`deviances`}{a vector with the deviance of the fit for each row. The deviance is a measure
#'   how well the data is fit by the model. A smaller deviance means a better fit.}
#'   \item{`Mu`}{a matrix with the same dimensions as `dim(data)`. If the calculation happened on
#'   disk, than `Mu` is a `HDF5Matrix`. It contains the estimated mean value for each gene and
#'   sample.}
#'   \item{`size_factors`}{a vector with length `ncol(data)`. The size factors are the inferred
#'   correction factors for different sizes of each sample. They are also sometimes called the
#'   exposure factor.}
#'   \item{`data`}{a `SummarizedExperiment` that contains the input counts and the `col_data`}
#'   \item{`model_matrix`}{a matrix with dimensions `ncol(data) x n_coefficients`. It is build based
#'   on the `design` argument.}
#' }
#'
#' @examples
#'  set.seed(1)
#'  # The simplest example
#'  y <- rnbinom(n = 10, mu = 3, size = 1/2.4)
#'  c(glm_gp(y, size_factors = FALSE))
#'
#'  # Fitting a whole matrix
#'  model_matrix <- cbind(1, rnorm(5))
#'  true_Beta <- cbind(rnorm(n = 30), rnorm(n = 30, mean = 3))
#'  sf <- exp(rnorm(n = 5, mean = 0.7))
#'  model_matrix
#'  Y <- matrix(rnbinom(n = 30 * 5, mu = sf * exp(true_Beta %*% t(model_matrix)), size = 1/2.4),
#'              nrow = 30, ncol = 5)
#'
#'  fit <- glm_gp(Y, design = model_matrix, size_factors = sf, verbose = TRUE)
#'  summary(fit)
#'
#'  # Fitting a model with covariates
#'  data <- data.frame(fav_food = sample(c("apple", "banana", "cherry"), size = 50, replace = TRUE),
#'  city = sample(c("heidelberg", "paris", "new york"), size = 50, replace = TRUE),
#'  age = rnorm(n = 50, mean = 40, sd = 15))
#'  Y <- matrix(rnbinom(n = 100 * 50, mu = 3, size = 1/3.1), nrow = 100, ncol = 50)
#'  fit <- glm_gp(Y, design = ~ fav_food + city + age, col_data = data)
#'  summary(fit)
#'
#'
#'
#' @seealso [overdispersion_mle()] and [overdispersion_shrinkage()] for the internal functions that do the
#'   work. For differential expression analysis, see [test_de()].
#'
#' @references
#'   * McCarthy, D. J., Chen, Y., & Smyth, G. K. (2012). Differential expression analysis of multifactor
#'   RNA-Seq experiments with respect to biological variation. Nucleic Acids Research, 40(10), 4288–4297.
#'   [https://doi.org/10.1093/nar/gks042](https://doi.org/10.1093/nar/gks042).
#'   * Anders Simon, & Huber Wolfgang. (2010). Differential expression analysis for sequence count data.
#'   Genome Biology. [https://doi.org/10.1016/j.jcf.2018.05.006](https://doi.org/10.1016/j.jcf.2018.05.006).
#'   * Love, M. I., Huber, W., & Anders, S. (2014). Moderated estimation of fold change and  dispersion
#'   for RNA-seq data with DESeq2. Genome Biology, 15(12), 550.
#'   [https://doi.org/10.1186/s13059-014-0550-8](https://doi.org/10.1186/s13059-014-0550-8).
#'   * Robinson, M. D., McCarthy, D. J., & Smyth, G. K. (2009). edgeR: A Bioconductor package for differential
#'   expression analysis of digital gene expression data. Bioinformatics, 26(1), 139–140.
#'   [https://doi.org/10.1093/bioinformatics/btp616](https://doi.org/10.1093/bioinformatics/btp616).
#'   * Lun ATL, Pagès H, Smith ML (2018). “beachmat: A Bioconductor C++ API for accessing high-throughput
#'   biological data from a variety of R matrix types.” PLoS Comput. Biol., 14(5), e1006135. doi:
#'   [10.1371/journal.pcbi.1006135.](https://doi.org/10.1371/journal.pcbi.1006135).
#'   * Lund, S. P., Nettleton, D., McCarthy, D. J., & Smyth, G. K. (2012). Detecting differential expression
#'   in RNA-sequence data using quasi-likelihood with shrunken dispersion estimates. Statistical
#'   Applications in Genetics and Molecular Biology, 11(5).
#'   [https://doi.org/10.1515/1544-6115.1826](https://doi.org/10.1515/1544-6115.1826).
#'   * Lun ATL, Bach K and Marioni JC (2016). Pooling across cells to normalize single-cell RNA sequencing
#'   data with many zero counts. Genome Biol. 17:75
#'   [https://doi.org/10.1186/s13059-016-0947-7](https://doi.org/10.1186/s13059-016-0947-7)
#'
#' @export
glm_gp <- function(data,
                   design = ~ 1,
                   col_data = NULL,
                   reference_level = NULL,
                   offset = 0,
                   size_factors = c("normed_sum", "deconvolution", "poscounts"),
                   overdispersion = TRUE,
                   overdispersion_shrinkage = TRUE,
                   do_cox_reid_adjustment = TRUE,
                   subsample = FALSE,
                   on_disk = NULL,
                   verbose = FALSE){

  # Validate `data`
  if(inherits(data, "formula")){
    if(length(design) != 2 || design != ~ 1){
      stop("If the first argument is already a formula, the second argument must not be set. Please call this function like this:\n",
           "'glm_gp(data = mat, design = ~ a + b + c, ...)'", call. = FALSE)
    }
    extr <- extract_data_from_formula(data, col_data, parent.frame())
    data <- extr$data
    design <- extr$design
  }
  if(is.vector(data)){
    data <- matrix(data, nrow = 1)
  }
  data_mat <- handle_data_parameter(data, on_disk)

  # Convert the formula to a model_matrix
  col_data <- get_col_data(data, col_data)
  des <- handle_design_parameter(design, data, col_data, reference_level)

  # Call glm_gp_impl()
  res <- glm_gp_impl(data_mat,
              model_matrix = des$model_matrix,
              offset = offset,
              size_factors = size_factors,
              overdispersion = overdispersion,
              overdispersion_shrinkage = overdispersion_shrinkage,
              do_cox_reid_adjustment = do_cox_reid_adjustment,
              subsample = subsample,
              verbose = verbose)
  # Make sure that the output is nice and beautiful
  rownames(data_mat) <- rownames(data)
  colnames(data_mat) <- colnames(data)
  res$data <- SummarizedExperiment::SummarizedExperiment(list(counts = data_mat),
                                                         colData = col_data)
  res$model_matrix <- des$model_matrix
  if(is.null(colnames(res$model_matrix))){
    colnames(res$model_matrix) <- paste0("Coef_", seq_len(ncol(res$model_matrix)))
  }
  res$design_formula <- des$design_formula
  colnames(res$Beta) <- colnames(res$model_matrix)
  rownames(res$Beta) <- rownames(data)
  rownames(res$Mu) <- rownames(data)
  colnames(res$Mu) <- colnames(data)
  names(res$overdispersions) <- rownames(data)
  names(res$deviances) <- rownames(data)
  names(res$size_factors) <- colnames(data)

  class(res) <- "glmGamPoi"
  res
}


handle_data_parameter <- function(data, on_disk){
  if(is.matrix(data)){
    if(! is.numeric(data)){
      stop("The data argument must consist of numeric values and not of ", mode(data), " values")
    }
    if(is.null(on_disk) || isFALSE(on_disk)){
      data_mat <- data
    }else if(isTRUE(on_disk)){
      data_mat <- HDF5Array::writeHDF5Array(data)
    }else{
      stop("Illegal argument type for on_disk. Can only handle 'NULL', 'TRUE', or 'FALSE'")
    }
  }else if(is(data, "DelayedArray")){
    if(is.null(on_disk) || isTRUE(on_disk)){
      data_mat <- data
    }else if(isFALSE(on_disk)){
      data_mat <- as.matrix(data)
    }else{
      stop("Illegal argument type for on_disk. Can only handle 'NULL', 'TRUE', or 'FALSE'")
    }
  }else if(is(data, "SummarizedExperiment")){
    data_mat <- handle_data_parameter(SummarizedExperiment::assay(data), on_disk)
  }else if(canCoerce(data, "SummarizedExperiment")){
    se <- as(data, "SummarizedExperiment")
    data_mat <- handle_data_parameter(SummarizedExperiment::assay(se), on_disk)
  }else if(is(data, "dgCMatrix")) {
    stop("glmGamPoi does not yet support sparse input data of type 'dgCMatrix'. ",
         "If your data fits into memory, please cast it to a dense matrix with ",
         "'as.matrix(data)' or if it is too big, convert it to an on-disk dataset ",
         "with the 'HDF5Array' package. ")
  }else{
    stop("Cannot handle data of class ", class(data), ".",
         "It must be of type matrix, DelayedArray, SummarizedExperiment, ",
         "or something that can be cast to a SummarizedExperiment")
  }
  data_mat
}


get_col_data <- function(data, col_data){
  if(is.null(col_data)){
    col_data <- as.data.frame(matrix(numeric(0), nrow=ncol(data)))
  }
  if(is(data, "SummarizedExperiment")){
    cbind(col_data, SummarizedExperiment::colData(data))
  }else{
    col_data
  }
}


handle_design_parameter <- function(design, data, col_data, reference_level){
  n_samples <- ncol(data)

  # Handle the design parameter
  if(is.matrix(design)){
    if(! is.null(reference_level)){
      stop("Cannot specify `design` as a matrix and `reference_level` != NULL.\n",
           "Either provide `design` as a formula and specify `reference_level` or ",
           "make sure that the correct reference level is used when the matrix is created ",
           "for example with `stats::relevel()`.")
    }
    model_matrix <- design
    design_formula <- NULL
  }else if((is.vector(design) || is.factor(design))){
    if(length(design) != n_samples){
      if(length(design) == 1 && design == 1){
        stop("The specified design vector length (", length(design), ") does not match ",
             "the number of samples: ", n_samples, "\n",
             "Did you maybe mean: `design = ~ 1`?")
      }else{
        stop("The specified design vector length (", length(design), ") does not match ",
             "the number of samples: ", n_samples)
      }
    }
    model_matrix <- convert_chr_vec_to_model_matrix(design, reference_level)
    design_formula <- NULL
  }else if(inherits(design,"formula")){
    model_matrix <- convert_formula_to_model_matrix(design, col_data, reference_level)
    design_formula <- design
  }else{
    stop(paste0("design argment of class ", class(design), " is not supported. Please ",
                "specify a `model_matrix`, a `character vector`, or a `formula`."))
  }


  if(nrow(model_matrix) != ncol(data)) stop("Number of rows in col_data does not match number of columns of data")
  if(! is.null(rownames(model_matrix)) &&
     ! all(rownames(model_matrix) == as.character(seq_len(nrow(model_matrix)))) && # That's the default rownames
     ! is.null(colnames(data))){
    if(! all(rownames(model_matrix) == colnames(data))){
      if(setequal(rownames(model_matrix), colnames(data))){
        # Rearrange the rows to match the columns of data
        model_matrix <- model_matrix[colnames(data), ,drop=FALSE]
      }else{
        stop("The rownames of the model_matrix / col_data do not match the column names of data.")
      }
    }

  }

  if(ncol(model_matrix) >= ncol(data)){
    stop("The model_matrix:\n", format_matrix(head(model_matrix)), "\nhas more columns (", ncol(model_matrix),
         ") than the there are samples in the data matrix\n",
         format_matrix(head(data[,seq_len(min(5, ncol(data))),drop=FALSE])),
         "\nwhich has ", ncol(data), " columns. ",
         "Too few replicates / too many coefficients to fit model.")
  }

  # Check rank of model_matrix
  qr_mm <- qr(model_matrix)
  if(qr_mm$rank < ncol(model_matrix) && n_samples > 0){
    is_zero_column <- DelayedMatrixStats::colCounts(model_matrix, value = 0) == nrow(model_matrix)
    if(any(is_zero_column)){
      stop("The model matrix:\n", format_matrix(head(model_matrix)), "\nseems degenerate ('matrix_rank(model_matrix) < ncol(model_matrix)'). ",
           "Column ", paste0(which(is_zero_column), collapse = ", "), " contains only zeros.")
    }else{
      stop("The model matrix:\n", format_matrix(head(model_matrix)), "\nseems degenerate ('matrix_rank(model_matrix) < ncol(model_matrix)'). ",
           "Some columns are perfectly collinear. Did you maybe include the same coefficient twice?")
    }
  }

  rownames(model_matrix) <- colnames(data)
  validate_model_matrix(model_matrix, data)
  list(model_matrix = model_matrix, design_formula = design_formula)
}



handle_subsample_parameter <- function(data, subsample){
  if(isFALSE(subsample)){
    n_subsamples <- ncol(data)
  }else if(isTRUE(subsample)){
    n_subsamples <- 1000
  }else{
    n_subsamples <- subsample
  }
  min(n_subsamples, ncol(data))
}




validate_model_matrix <- function(matrix, data){
  stopifnot(is.matrix(matrix))
  stopifnot(nrow(matrix) == ncol(data))
}



convert_chr_vec_to_model_matrix <- function(design, reference_level){
  if(! is.factor(design)){
    design_fct <- as.factor(design)
  }else{
    design_fct <- design
  }

  if(length(levels(design_fct)) == 1){
    # All entries are identical build an intercept only model
    mm <- matrix(1, nrow=length(design_fct), ncol=1)
    colnames(mm) <- levels(design_fct)
  }else if(is.null(reference_level)){
    helper_df <- data.frame(x_ = design_fct)
    mm <- stats::model.matrix.default(~ x_ - 1, helper_df)
    colnames(mm) <- sub("^x_", "", colnames(mm))
  }else{
    design_fct <- stats::relevel(design_fct, ref = reference_level)
    helper_df <- data.frame(x_ = design_fct)
    mm <- stats::model.matrix.default(~ x_ + 1, helper_df)
    colnames(mm)[-1] <- paste0(sub("^x_", "", colnames(mm)[-1]), "_vs_", reference_level)
  }
  colnames(mm)[colnames(mm) == "(Intercept)"] <- "Intercept"
  mm
}

extract_data_from_formula <- function(formula, col_data, encl = parent.frame()){
  if(length(formula) < 3){
    stop("The formula does not have a left hand side. Please call this function like this:\n",
         "'glm_gp(data = mat, design = ~ a + b + c, ...)'", call. = FALSE)
  }
  data <- eval(formula[[2]], envir = col_data, enclos = encl)
  formula[[2]] <- NULL
  list(data = data, design = formula)
}


convert_formula_to_model_matrix <- function(formula, col_data, reference_level=NULL){
  if(! is.null(reference_level)){
    has_ref_level <- vapply(col_data, function(x){
      any(!is.na(x) & x == reference_level)
    }, FUN.VALUE = FALSE)
    if(all(has_ref_level == FALSE)){
      stop("None of the columns contains the specified reference_level.")
    }
    col_data[has_ref_level] <- lapply(col_data[has_ref_level], function(col){
      if(is.character(col)){
        col <- as.factor(col)
      }
      stats::relevel(col, ref = reference_level)
    })
  }
  tryCatch({
    mm <- stats::model.matrix.default(formula, col_data)
  }, error = function(e){
    # Try to extract text from error message
    match <- regmatches(e$message, regexec("object '(.+)' not found", e$message))[[1]]
    if(length(match) == 2){
      stop("Object '", match[2], "' not found. Allowed variables are:\n",
           paste0(colnames(col_data), collapse = ", "))
    }else{
      stop(e$message)
    }
  })

  colnames(mm)[colnames(mm) == "(Intercept)"] <- "Intercept"
  mm
}


is_on_disk.glmGamPoi <- function(fit){
  is(fit$Mu, "DelayedMatrix") && is(DelayedArray::seed(fit$Mu), "HDF5ArraySeed")
}