File: helper.R

package info (click to toggle)
r-bioc-tximport 1.18.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 300 kB
  • sloc: makefile: 5
file content (121 lines) | stat: -rw-r--r-- 4,438 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#' Low-level function to make counts from abundance using matrices
#'
#' Simple low-level function used within \link{tximport} to generate
#' \code{scaledTPM} or \code{lengthScaledTPM} counts, taking as input
#' the original counts, abundance and length matrices.
#' NOTE: This is a low-level function exported in case it is needed for some reason,
#' but the recommended way to generate counts-from-abundance is using
#' \link{tximport} with the \code{countsFromAbundance} argument.
#'
#' @param countsMat a matrix of original counts
#' @param abundanceMat a matrix of abundances (typically TPM)
#' @param lengthMat a matrix of effective lengths
#' @param countsFromAbundance the desired type of count-from-abundance output
#'
#' @return a matrix of count-scale data generated from abundances.
#' for details on the calculation see \link{tximport}.
#'
#' @export
makeCountsFromAbundance <- function(countsMat, abundanceMat, lengthMat,
                                    countsFromAbundance=c("scaledTPM","lengthScaledTPM")) {
  countsFromAbundance <- match.arg(countsFromAbundance)
  sparse <- is(countsMat, "dgCMatrix")
  colsumfun <- if (sparse) Matrix::colSums else colSums
  countsSum <- colsumfun(countsMat)
  if (countsFromAbundance == "lengthScaledTPM") {
    newCounts <- abundanceMat * rowMeans(lengthMat)
  } else if (countsFromAbundance == "scaledTPM") {
    newCounts <- abundanceMat
  } else {
    stop("expecting 'lengthScaledTPM' or 'scaledTPM'")
  }
  newSum <- colsumfun(newCounts)
  if (sparse) {
    countsMat <- Matrix::t(Matrix::t(newCounts) * (countsSum/newSum))
  } else {
    countsMat <- t(t(newCounts) * (countsSum/newSum))
  }
  countsMat
}

# function for replacing missing average transcript length values
replaceMissingLength <- function(lengthMat, aveLengthSampGene) {
  nanRows <- which(apply(lengthMat, 1, function(row) any(is.nan(row))))
  if (length(nanRows) > 0) {
    for (i in nanRows) {
      if (all(is.nan(lengthMat[i,]))) {
        # if all samples have 0 abundances for all tx, use the simple average
        lengthMat[i,] <- aveLengthSampGene[i]
      } else {
          # otherwise use the geometric mean of the lengths from the other samples
          idx <- is.nan(lengthMat[i,])
          lengthMat[i,idx] <-  exp(mean(log(lengthMat[i,!idx]), na.rm=TRUE))
        }
    }
  }
  lengthMat
}

medianLengthOverIsoform <- function(length, tx2gene, ignoreTxVersion, ignoreAfterBar) {
  txId <- rownames(length)
  if (ignoreTxVersion) {
    txId <- sub("\\..*", "", txId)
  } else if (ignoreAfterBar) {
    txId <- sub("\\|.*", "", txId)
  }
  tx2gene <- cleanTx2Gene(tx2gene)
  stopifnot(all(txId %in% tx2gene$tx))
  tx2gene <- tx2gene[match(txId, tx2gene$tx),]
  # average the lengths
  ave.len <- rowMeans(length)
  # median over isoforms
  med.len <- tapply(ave.len, tx2gene$gene, median)
  one.sample <- med.len[match(tx2gene$gene, names(med.len))]
  matrix(rep(one.sample, ncol(length)),
         ncol=ncol(length), dimnames=dimnames(length))
}

# code contributed from Andrew Morgan
read_kallisto_h5 <- function(fpath, ...) {
  if (!requireNamespace("rhdf5", quietly=TRUE)) {
    stop("reading kallisto results from hdf5 files requires Bioconductor package `rhdf5`")
  }
  counts <- rhdf5::h5read(fpath, "est_counts")
  ids <- rhdf5::h5read(fpath, "aux/ids")
  efflens <- rhdf5::h5read(fpath, "aux/eff_lengths")

  # as suggested by https://support.bioconductor.org/p/96958/#101090
  ids <- as.character(ids)
  
  stopifnot(length(counts) == length(ids)) 
  stopifnot(length(efflens) == length(ids))

  result <- data.frame(target_id = ids,
                       eff_length = efflens,
                       est_counts = counts,
                       stringsAsFactors = FALSE)
  normfac <- with(result, (1e6)/sum(est_counts/eff_length))
  result$tpm <- with(result, normfac*(est_counts/eff_length))
  return(result)
}

summarizeFail <- function() {
  stop("

  tximport failed at summarizing to the gene-level.
  Please see 'Solutions' in the Details section of the man page: ?tximport

")
}

# this is much faster than by(), a bit slower than dplyr summarize_each()
## fastby <- function(m, f, fun) {
##   idx <- split(1:nrow(m), f)
##   if (ncol(m) > 1) {
##     t(sapply(idx, function(i) fun(m[i,,drop=FALSE])))
##   } else {
##     matrix(vapply(idx, function(i) fun(m[i,,drop=FALSE], FUN.VALUE=numeric(ncol(m)))),
##            dimnames=list(levels(f), colnames(m)))
##   }
## }