File: estimate_size_factors.R

package info (click to toggle)
r-bioc-glmgampoi 1.2.0%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 704 kB
  • sloc: cpp: 523; ansic: 114; sh: 13; makefile: 2
file content (118 lines) | stat: -rw-r--r-- 4,056 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118




#' Estimate the Size Factors
#'
#' @param Y any matrix-like object (\code{base::matrix()}, \code{DelayedArray}, \code{HDF5Matrix},
#'   \code{Matrix::Matrix()}, etc.)
#' @param method one of \code{c("normed_sum", "deconvolution", "poscounts")}
#'
#'
#' @return a vector with one size factor per column of `Y`
#'
#' @keywords internal
estimate_size_factors <- function(Y, method, verbose = FALSE){
  if(nrow(Y) <= 1){
    if(verbose) {
      message("Skipping size factor estimation, because there is only a single gene.",
            call. = FALSE)
    }
    return(rep(1, ncol(Y)))
  }
  stopifnot(length(method) == 1 && is.character(method))

  if(method == "poscounts"){
    # Accept any matrix-like object
    log_geometric_means <- DelayedMatrixStats::rowMeans2(log(Y + 0.5))
    Y2 <- Y
    Y2[Y2 == 0] <- NA
    sf <- exp(DelayedMatrixStats::colMedians(subtract_vector_from_each_column(log(Y2), log_geometric_means), na.rm = TRUE))
  }else if(method == "deconvolution"){
    if(requireNamespace("scran", quietly = TRUE)){
      tryCatch({
        sf <- scran::calculateSumFactors(Y)
      }, error = function(err){
        stop("Error in size factor estimation with 'size_factors = \"deconvolution\"'.\n",
             "'scran::calculateSumFactors(Y)' threw the following error: \n\t",
             err, call. = FALSE)
      })
    }else{
      stop("To use the \"deconvolution\" method for size factor calculation, you need to install the ",
           "'scran' package from Bioconductor. Alternatively, you can use \"normed_sum\" or \"poscounts\"",
           "to calculate the size factors.", call. = FALSE)
    }
  }else if(method == "normed_sum"){
    sf <- DelayedMatrixStats::colSums2(Y)
  }else{
    stop("Unknown size factor estimation method: ", method)
  }



  # stabilize size factors to have geometric mean of 1
  all_zero_column <- is.nan(sf) | sf <= 0
  sf[all_zero_column] <- NA
  if(any(all_zero_column)){
    sf <- sf/exp(mean(log(sf), na.rm=TRUE))
    sf[all_zero_column] <- 0.001
  }else{
    sf <- sf/exp(mean(log(sf)))
  }
  sf
}



combine_size_factors_and_offset <- function(offset, size_factors, Y, verbose = FALSE){
  n_genes <- nrow(Y)
  n_samples <- ncol(Y)

  make_offset_hdf5_mat <- is(Y, "DelayedMatrix") && is(DelayedArray::seed(Y), "HDF5ArraySeed")

  if(is.matrix(offset)){
    stopifnot(dim(offset) == c(n_genes, n_samples))
    offset_matrix <- offset
  }else{
    stopifnot(length(offset) == 1 || length(offset) == n_samples)
    if(make_offset_hdf5_mat){
      offset_matrix <- DelayedArray::DelayedArray(DelayedArray::SparseArraySeed(c(n_genes, n_samples)))
      offset_matrix <- add_vector_to_each_row(offset_matrix, offset)
    }else{
      offset_matrix <- matrix(offset, nrow=n_genes, ncol = n_samples, byrow = TRUE)
    }
  }
  if(isTRUE(size_factors) || is.character(size_factors)){
    method <- if(isTRUE(size_factors)){
      "deconvolution"
    }else if(all(size_factors == c("normed_sum", "deconvolution", "poscounts"))){
      "normed_sum"
    }else if(length(size_factors) == 1 && size_factors %in% c("normed_sum", "deconvolution", "poscounts")){
      size_factors
    }else{
      stop("Cannot handle size factor ", paste0(size_factors, collapse = ", "))
    }
    if(verbose){ message("Calculate Size Factors (", method, ")") }
    lsf <- log(estimate_size_factors(Y, method = method, verbose = verbose))
  }else if(isFALSE(size_factors)){
    lsf <- rep(0, n_samples)
  }else{
    stopifnot(is.numeric(size_factors) && (length(size_factors) == 1 || length(size_factors) == n_samples))
    if(any(size_factors < 0)){
      stop("size factor 'size_factors' must be larger than 0")
    }
    if(length(size_factors) == 1){
      lsf <- rep(log(size_factors), n_samples)
    }else{
      lsf <- log(size_factors)
    }
  }
  # offset_matrix <- offset_matrix + lsf_mat
  offset_matrix <- add_vector_to_each_row(offset_matrix, lsf)
  if(make_offset_hdf5_mat){
    offset_matrix <- HDF5Array::writeHDF5Array(offset_matrix)
  }
  list(offset_matrix = offset_matrix, size_factors = exp(lsf))
}