calcNormFactors <- function(object, ...)
UseMethod("calcNormFactors")
calcNormFactors.DGEList <- function(object, method=c("TMM","TMMwsp","RLE","upperquartile","none"), refColumn=NULL, logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE, Acutoff=-1e10, p=0.75, ...)
# Scale normalization of RNA-Seq data, for DGEList objects
# Created 2 October 2014. Last modified 2 June 2020.
{
if(!is.null(object$offset)) warning("object contains offsets, which take precedence over library\nsizes and norm factors (and which will not be recomputed).")
object$samples$norm.factors <- calcNormFactors(object=object$counts, lib.size=object$samples$lib.size, method=method, refColumn=refColumn, logratioTrim=logratioTrim, sumTrim=sumTrim, doWeighting=doWeighting, Acutoff=Acutoff, p=p)
object
}
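# Illustrative usage for the DGEList method (a minimal sketch, not executed;
# the Poisson-simulated counts below are purely hypothetical):
if(FALSE) {
	counts <- matrix(rpois(4000, lambda=10), ncol=4)
	y <- DGEList(counts=counts)
	y <- calcNormFactors(y, method="TMM")
	y$samples$norm.factors
}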
calcNormFactors.SummarizedExperiment <- function(object, method=c("TMM","TMMwsp","RLE","upperquartile","none"), refColumn=NULL, logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE, Acutoff=-1e10, p=0.75, ...)
# Scale normalization of RNA-Seq data, for SummarizedExperiment objects
# Created 19 March 2020. Last modified 19 March 2020.
{
object <- SE2DGEList(object)
object$samples$norm.factors <- calcNormFactors(object=object$counts, lib.size=object$samples$lib.size, method=method, refColumn=refColumn, logratioTrim=logratioTrim, sumTrim=sumTrim, doWeighting=doWeighting, Acutoff=Acutoff, p=p)
object
}
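# Illustrative usage for the SummarizedExperiment method (a minimal sketch,
# not executed; 'se' stands for a hypothetical SummarizedExperiment carrying
# a counts assay). Note that this method returns a DGEList, not a
# SummarizedExperiment:
if(FALSE) {
	y <- calcNormFactors(se)
	y$samples$norm.factors
}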
calcNormFactors.default <- function(object, lib.size=NULL, method=c("TMM","TMMwsp","RLE","upperquartile","none"), refColumn=NULL, logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE, Acutoff=-1e10, p=0.75, ...)
# Scale normalization of RNA-Seq data, for count matrices
# Mark Robinson, Gordon Smyth and edgeR team
# Created 22 October 2009. Last modified 2 June 2020.
{
# Check object
x <- as.matrix(object)
if(any(is.na(x))) stop("NA counts not permitted")
nsamples <- ncol(x)
# Check lib.size
if(is.null(lib.size)) {
lib.size <- colSums(x)
} else {
if(anyNA(lib.size)) stop("NA lib.sizes not permitted")
if(length(lib.size) != nsamples) {
if(length(lib.size) > 1L) warning("calcNormFactors: length(lib.size) doesn't match number of samples",call.=FALSE)
lib.size <- rep_len(lib.size,nsamples)
}
}
# Check method
# Backward compatibility with the previous method name
if(length(method)==1L && method=="TMMwzp") {
method <- "TMMwsp"
message("TMMwzp has been renamed to TMMwsp")
}
method <- match.arg(method)
# Remove all zero rows
allzero <- .rowSums(x>0, nrow(x), nsamples) == 0L
if(any(allzero)) x <- x[!allzero,,drop=FALSE]
# Degenerate cases
if(nrow(x)==0L || nsamples==1L) method <- "none"
# Calculate factors
f <- switch(method,
TMM = {
if( is.null(refColumn) ) {
f75 <- suppressWarnings(.calcFactorQuantile(data=x, lib.size=lib.size, p=0.75))
if(median(f75) < 1e-20) {
refColumn <- which.max(colSums(sqrt(x)))
} else {
refColumn <- which.min(abs(f75-mean(f75)))
}
}
f <- rep_len(NA_real_,nsamples)
for(i in 1:nsamples)
f[i] <- .calcFactorTMM(obs=x[,i],ref=x[,refColumn], libsize.obs=lib.size[i], libsize.ref=lib.size[refColumn], logratioTrim=logratioTrim, sumTrim=sumTrim, doWeighting=doWeighting, Acutoff=Acutoff)
f
},
TMMwsp = {
if( is.null(refColumn) ) refColumn <- which.max(colSums(sqrt(x)))
f <- rep_len(NA_real_,nsamples)
for(i in 1:nsamples)
f[i] <- .calcFactorTMMwsp(obs=x[,i],ref=x[,refColumn], libsize.obs=lib.size[i], libsize.ref=lib.size[refColumn], logratioTrim=logratioTrim, sumTrim=sumTrim, doWeighting=doWeighting, Acutoff=Acutoff)
f
},
RLE = .calcFactorRLE(x)/lib.size,
upperquartile = .calcFactorQuantile(x,lib.size,p=p),
none = rep_len(1,nsamples)
)
# Factors should multiply to one
f <- f/exp(mean(log(f)))
# Output
names(f) <- colnames(x)
f
}
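# Illustrative usage of the default method (a minimal sketch, not executed;
# the counts are hypothetical). The returned factors are named by column and
# are normalized so that their geometric mean is one; effective library sizes
# are obtained by multiplying the library sizes by the factors:
if(FALSE) {
	x <- matrix(rpois(4000, lambda=10), ncol=4, dimnames=list(NULL, paste0("S", 1:4)))
	f <- calcNormFactors(x, method="TMM")
	exp(mean(log(f)))                     # equals 1 by construction
	effective.lib.size <- colSums(x) * f
}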
.calcFactorRLE <- function(data)
# Scale factors as in Anders et al (2010)
# Mark Robinson
# Created 16 Aug 2010
{
gm <- exp(rowMeans(log(data)))
apply(data, 2, function(u) median((u/gm)[gm > 0]))
}
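# Worked sketch of the RLE calculation (not executed; the 2x2 'toy' matrix is
# hypothetical). Each sample's factor is the median, over genes with a
# positive geometric mean, of its counts divided by the per-gene geometric
# means; the caller then divides by the library sizes:
if(FALSE) {
	toy <- matrix(c(10, 20, 100, 200), nrow=2)
	gm <- exp(rowMeans(log(toy)))                      # per-gene geometric means
	apply(toy, 2, function(u) median((u/gm)[gm > 0]))  # approximately c(0.316, 3.162)
}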
.calcFactorQuantile <- function (data, lib.size, p=0.75)
# Generalized version of upper-quartile normalization
# Mark Robinson and Gordon Smyth
# Created 16 Aug 2010. Last modified 12 Sep 2020.
{
f <- rep_len(1,ncol(data))
for (j in seq_len(ncol(data))) f[j] <- quantile(data[,j], probs=p)
if(min(f)==0) warning("One or more quantiles are zero")
f / lib.size
}
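# Worked sketch (not executed; the counts are hypothetical): with the default
# p=0.75, each factor is the sample's upper quartile of counts divided by its
# library size:
if(FALSE) {
	x <- matrix(rpois(400, lambda=10), ncol=4)
	quantile(x[,1], probs=0.75) / sum(x[,1])
}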
.calcFactorTMM <- function(obs, ref, libsize.obs=NULL, libsize.ref=NULL, logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE, Acutoff=-1e10)
# TMM between two libraries
# Mark Robinson
{
obs <- as.numeric(obs)
ref <- as.numeric(ref)
if( is.null(libsize.obs) ) nO <- sum(obs) else nO <- libsize.obs
if( is.null(libsize.ref) ) nR <- sum(ref) else nR <- libsize.ref
logR <- log2((obs/nO)/(ref/nR)) # log ratio of expression, accounting for library size
absE <- (log2(obs/nO) + log2(ref/nR))/2 # absolute expression
v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref # estimated asymptotic variance
# Remove infinite values and apply the A-value cutoff
fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff)
logR <- logR[fin]
absE <- absE[fin]
v <- v[fin]
if(max(abs(logR)) < 1e-6) return(1)
# Trim boundaries, following the logic of the trim argument of base R's mean()
n <- length(logR)
loL <- floor(n * logratioTrim) + 1
hiL <- n + 1 - loL
loS <- floor(n * sumTrim) + 1
hiS <- n + 1 - loS
# keep <- (rank(logR) %in% loL:hiL) & (rank(absE) %in% loS:hiS)
# a fix from Leonardo Ivan Almonacid Cardenas, since rank() can return
# non-integer values when there are a lot of ties
keep <- (rank(logR)>=loL & rank(logR)<=hiL) & (rank(absE)>=loS & rank(absE)<=hiS)
if(doWeighting)
f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
else
f <- mean(logR[keep], na.rm=TRUE)
# Results will be missing if the two libraries share no features with positive counts
# In this case, set f to zero so that the returned factor 2^f is unity
if(is.na(f)) f <- 0
2^f
}
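# Illustrative call (a minimal sketch, not executed; the columns come from a
# hypothetical count matrix 'x', with column 2 serving as the reference):
if(FALSE) {
	f1 <- .calcFactorTMM(obs=x[,1], ref=x[,2],
		libsize.obs=sum(x[,1]), libsize.ref=sum(x[,2]))
	log2(f1)  # the trimmed, inverse-variance-weighted mean of the M values
}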
.calcFactorTMMwsp <- function(obs, ref, libsize.obs=NULL, libsize.ref=NULL, logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE, Acutoff=-1e10)
# TMM with pairing of singleton positive counts between the obs and ref libraries
# Gordon Smyth
# Created 19 Sep 2018. Last modified 9 Jun 2020.
{
obs <- as.numeric(obs)
ref <- as.numeric(ref)
# epsilon serves as floating-point zero
eps <- 1e-14
# Flag positive counts; npos encodes each pair: 0=both zero, 1=ref only, 2=obs only, 3=both positive
pos.obs <- (obs > eps)
pos.ref <- (ref > eps)
npos <- 2L * pos.obs + pos.ref
# Remove double zeros and NAs
i <- which(npos==0L | is.na(npos))
if(length(i)) {
obs <- obs[-i]
ref <- ref[-i]
npos <- npos[-i]
}
# Check library sizes
if(is.null(libsize.obs)) libsize.obs <- sum(obs)
if(is.null(libsize.ref)) libsize.ref <- sum(ref)
# Pair up as many singleton positives as possible
# The unpaired singleton positives are discarded so that no zeros remain
zero.obs <- (npos == 1L)
zero.ref <- (npos == 2L)
k <- (zero.obs | zero.ref)
n.eligible.singles <- min( sum(zero.obs), sum(zero.ref))
if(n.eligible.singles > 0L) {
refk <- sort(ref[k],decreasing=TRUE)[1:n.eligible.singles]
obsk <- sort(obs[k],decreasing=TRUE)[1:n.eligible.singles]
obs <- c(obs[!k],obsk)
ref <- c(ref[!k],refk)
} else {
obs <- obs[!k]
ref <- ref[!k]
}
# Any left?
n <- length(obs)
if(n==0L) return(1)
# Compute M and A values
obs.p <- obs / libsize.obs
ref.p <- ref / libsize.ref
M <- log2( obs.p / ref.p )
A <- 0.5 * log2( obs.p * ref.p )
# If M all zero, return 1
if(max(abs(M)) < 1e-6) return(1)
# M order, breaking ties by shrunk M
obs.p.shrunk <- (obs+0.5) / (libsize.obs+0.5)
ref.p.shrunk <- (ref+0.5) / (libsize.ref+0.5)
M.shrunk <- log2( obs.p.shrunk / ref.p.shrunk )
o.M <- order(M, M.shrunk)
# A order
o.A <- order(A)
# Trim
loM <- as.integer(n * logratioTrim) + 1L
hiM <- n + 1L - loM
keep.M <- rep_len(FALSE,n)
keep.M[o.M[loM:hiM]] <- TRUE
loA <- as.integer(n * sumTrim) + 1L
hiA <- n + 1L - loA
keep.A <- rep_len(FALSE,n)
keep.A[o.A[loA:hiA]] <- TRUE
keep <- keep.M & keep.A
M <- M[keep]
# Average the M values
if(doWeighting) {
obs.p <- obs.p[keep]
ref.p <- ref.p[keep]
v <- (1-obs.p)/obs.p/libsize.obs + (1-ref.p)/ref.p/libsize.ref
w <- (1+1e-6) / (v+1e-6)
TMM <- sum(w*M) / sum(w)
} else {
TMM <- mean(M)
}
2^TMM
}
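# Illustrative call (a minimal sketch, not executed; 'obs' and 'ref' are
# hypothetical count vectors containing singleton positives, i.e. counts that
# are zero in one library but positive in the other, which TMMwsp pairs up
# rather than discards):
if(FALSE) {
	obs <- c(0, 5, 10, 0, 100)
	ref <- c(3, 0, 12, 0, 90)
	.calcFactorTMMwsp(obs=obs, ref=ref)
}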