File: quasi_gamma_poisson_shrinkage.R

package info (click to toggle)
r-bioc-glmgampoi 1.2.0%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 704 kB
  • sloc: cpp: 523; ansic: 114; sh: 13; makefile: 2
file content (210 lines) | stat: -rw-r--r-- 9,527 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210



#' Shrink the overdispersion estimates
#'
#' Low-level function to shrink a set of overdispersion
#' estimates following the quasi-likelihood and Empirical
#' Bayesian framework.
#'
#' @param disp_est vector of overdispersion estimates
#' @param gene_means vector of average gene expression values. Used
#'   to fit `disp_trend` if that is `NULL`.
#' @param df degrees of freedom for estimating the Empirical Bayesian
#'   variance prior. Can be length 1 or same length as `disp_est` and
#'   `gene_means`.
#' @param disp_trend vector with the dispersion trend. If `NULL` or `TRUE` the
#'   dispersion trend is fitted using a (weighted) local median fit.
#'   Default: `TRUE`.
#' @param ql_disp_trend a logical to indicate if a second abundance trend
#'   using splines is fitted for the quasi-likelihood dispersions.
#'   Default: `NULL` which means that the extra fit is only done if
#'   enough observations are present.
#' @param ... additional parameters for the `loc_median_fit()` function
#' @inheritParams glm_gp
#'
#'
#'
#' @details
#' The function goes through the following steps
#' 1. Fit trend between overdispersion MLE's and the average
#'  gene expression. Per default it uses the `loc_median_fit()`
#'  function.
#' 2. Convert the overdispersion MLE's to quasi-likelihood
#'  dispersion estimates by fixing the trended dispersion as
#'  the "true" dispersion value:
#'  \eqn{disp_ql = (1 + mu * disp_mle) / (1 + mu * disp_trend)}
#' 3. Shrink the quasi-likelihood dispersion estimates using
#'  Empirical Bayesian variance shrinkage (see Smyth 2004).
#'
#' @return the function returns a list with the following elements
#' \describe{
#'   \item{dispersion_trend}{the dispersion trend provided by `disp_trend` or the
#'     local median fit.}
#'   \item{ql_disp_estimate}{the quasi-likelihood dispersion estimates based on
#'     the dispersion trend, `disp_est`, and `gene_means`}
#'   \item{ql_disp_trend}{the `ql_disp_estimate` still might show a trend with
#'     respect to `gene_means`. If `ql_disp_trend = TRUE` a spline is used to
#'     remove this secondary trend. If `ql_disp_trend = TRUE` it corresponds
#'     directly to the dispersion prior}
#'   \item{ql_disp_shrunken}{the shrunken quasi-likelihood dispersion estimates.
#'     They are shrunken towards `ql_disp_trend`.}
#'   \item{ql_df0}{the degrees of freedom of the empirical Bayesian shrinkage.
#'     They correspond to spread of the `ql_disp_estimate`'s}
#' }
#'
#' @examples
#'  Y <- matrix(rnbinom(n = 300 * 4, mu = 6, size = 1/4.2), nrow = 30, ncol = 4)
#'  disps <- sapply(seq_len(nrow(Y)), function(idx){
#'    overdispersion_mle(Y[idx, ])$estimate
#'  })
#'  shrink_list <- overdispersion_shrinkage(disps, rowMeans(Y), df = ncol(Y) - 1,
#'                                          disp_trend = FALSE, ql_disp_trend = FALSE)
#'
#'  plot(rowMeans(Y), shrink_list$ql_disp_estimate)
#'  lines(sort(rowMeans(Y)), shrink_list$ql_disp_trend[order(rowMeans(Y))], col = "red")
#'  points(rowMeans(Y), shrink_list$ql_disp_shrunken, col = "blue", pch = 16, cex = 0.5)
#'
#'
#' @references
#' * Lund, S. P., Nettleton, D., McCarthy, D. J., & Smyth, G. K. (2012). Detecting differential expression
#'   in RNA-sequence data using quasi-likelihood with shrunken dispersion estimates. Statistical
#'   Applications in Genetics and Molecular Biology, 11(5).
#'   [https://doi.org/10.1515/1544-6115.1826](https://doi.org/10.1515/1544-6115.1826).
#' * Smyth, G. K. (2004). Linear models and empirical bayes methods for assessing differential expression
#'   in microarray experiments. Statistical Applications in Genetics and Molecular Biology, 3(1).
#'   [https://doi.org/10.2202/1544-6115.1027](https://doi.org/10.2202/1544-6115.1027)
#'
#' @seealso \code{limma::squeezeVar()}
#' @export
overdispersion_shrinkage <- function(disp_est, gene_means, df,
                                     disp_trend = TRUE,
                                     ql_disp_trend = NULL,
                                     ...,
                                     verbose = FALSE){
  stopifnot(length(disp_est) == length(gene_means))
  min(length(disp_est), max(0.1 * length(disp_est), 100))

  if(is.null(disp_trend) || isTRUE(disp_trend)){
    if(verbose){ message("Fit dispersion trend") }
    disp_trend <- loc_median_fit(gene_means, y = disp_est, ...)
  }else if(isFALSE(disp_trend)) {
    disp_trend <- rep(mean(disp_est), length(disp_est))
  }

  if(verbose){ message("Shrink dispersion estimates") }
  # Lund et al. 2012. Equation 4. Not ideal for very small counts
  # ql_disp <- rowSums2(compute_gp_deviance_residuals_matrix(Y, Mu, disp_trend)^2) / df

  # The following equations are used to go between quasi-likelihood and normal representation
  # variance = (gene_means + disp_est * gene_means^2)
  # variance = ql_disp * (gene_means + disp_trend * gene_means^2)
  ql_disp <- (1 + gene_means * disp_est) / (1 + gene_means * disp_trend)

  var_pr <- variance_prior(ql_disp, df, covariate = gene_means, abundance_trend = ql_disp_trend)

  list(dispersion_trend = disp_trend, ql_disp_estimate = ql_disp,
       ql_disp_trend = var_pr$variance0, ql_disp_shrunken = var_pr$var_post,
       ql_df0 = var_pr$df0)
}








#' Estimate the scale and df for a Inverse Chisquare distribution that generate the true gene variances
#'
#' This function implements Smyth's 2004 variance shrinkage. It also supports covariates that are
#' fitted to log(s2) with natural splines. This is based on the 2012 Lund et al. quasi-likelihood
#' paper.
#'
#' @param s2 vector of observed variances. Must not contain \code{0}'s.
#' @param df vector or single number with the degrees of freedom
#' @param covariate a vector with the same length as s2. \code{covariate} is used to regress
#'   out the trend in \code{s2}. If \code{covariate = NULL}, it is ignored.
#' @param abundance_trend logical that decides if the additional abundance trend is fit
#'   to the data. If `NULL` the abundance trend is fitted if there are more than 10 observations
#'   and the `covariate` is not `NULL`. Default: `NULL`
#'
#' @return a list with three elements:
#' \describe{
#'   \item{variance0}{estimate of the scale of the inverse Chisquared distribution. If
#'   covariate is \code{NULL} a single number, otherwise a vector of \code{length(covariate)}}
#'   \item{df0}{estimate of the degrees of freedom of the inverse Chisquared distribution}
#'   \item{var_pos}{the shrunken variance estimates: a combination of \code{s2} and \code{variance0}}
#' }
#'
#' @seealso \code{limma::squeezeVar()}
#'
#' @keywords internal
variance_prior <- function(s2, df, covariate = NULL, abundance_trend = NULL){
  stopifnot(length(s2) == length(df) || length(df) == 1)
  stopifnot(is.null(covariate) || length(covariate) == length(s2))
  if(is.null(covariate) && isTRUE(abundance_trend)) {
    stop("If abundance_trend is true, covariate must not be 'NULL'.")
  }

  if(all(is.na(s2))){
    # This happens if input has zero columns
    return(list(variance0 = rep(NA, length(s2)), df0 = NA, var_post = s2))
  }
  if(all(s2 == 1)){
    # Happens for example if overdispersion is fixed to 0 -> Poisson GLM
    return(list(variance0 = rep(1, length(s2)), df0 = Inf, var_post = s2))
  }
  if(any(df <= 0, na.rm=TRUE)){
    stop(paste0("All df must be positive. df ", paste0(which(df <= 0), collapse=", "), " are not."))
  }
  if(any(s2 <= 0, na.rm=TRUE)){
    stop(paste0("All s2 must be positive. s2 ", paste0(which(s2 <= 0), collapse=", "), " are not."))
  }

  # Fit an intercept to s2
  opt_res <- optim(par=c(log_variance0=0, log_df0_inv=0), function(par){
    -sum(df(s2/exp(par[1]), df1=df, df2=exp(par[2]), log=TRUE) - par[1], na.rm=TRUE)
  })
  variance0 <- rep(exp(unname(opt_res$par[1])), times = length(s2))
  df0 <- exp(unname(opt_res$par[2]))

  if(is.numeric(abundance_trend)){
    stop("Numeric abundance trend is not supported.")
  }else if(is.null(covariate) || is.null(abundance_trend) || ! abundance_trend || length(s2) < 10 || sum(covariate > 1e-8) < 10) {
    if(length(s2) < 10 && isTRUE(abundance_trend)){
      warning("abundance_trend was set to 'TRUE', however there were not enough observations (length(s2) = ",
              length(s2), " < 10) to fit the trend.")
    }
    # Do nothing else to data
  }else{
    # Fit a trend!
    # Fit a spline through the log(s2) ~ covariate + 1 to account
    # for remaing trends in s2
    log_covariate <- log(covariate)
    not_zero <- covariate > 1e-8
    ra <- range(log_covariate[not_zero])
    knots <- ra[1] + c(1/3, 2/3) * (ra[2] - ra[1])
    tryCatch({
      design <- splines::ns(log_covariate[not_zero], df = 4, knots = knots, intercept = TRUE)
      init_fit <- lm.fit(design, log(s2[not_zero]))
      opt_res <- optim(par=c(betas = init_fit$coefficients, log_df0_inv=0), function(par){
        variance0 <- exp(design %*% par[1:4])
        df0 <- exp(par[5])
        -sum(df(s2[not_zero]/variance0, df1=df, df2=df0, log=TRUE) - log(variance0), na.rm=TRUE)
      }, control = list(maxit = 5000))
      variance0[not_zero] <- c(exp(design %*% opt_res$par[1:4]))
      df0 <- exp(unname(opt_res$par[5]))
    }, error = function(e){
      warning("Problem fitting the abundance trend to the quasi-likelihood dispersion. ",
              "Falling back to non-trended variance0 estimation.")
    })
  }

  if(opt_res$convergence != 0){
    warning("Variance prior estimate did not properly converge\n")
  }

  var_post <- (df0 * variance0 + df * s2) / (df0 + df)
  list(variance0 = variance0, df0 = df0, var_post = var_post)
}