File: estimateDisp.R

Package: r-bioc-edger 3.40.2+dfsg-1 (Debian bookworm)
#  Estimating dispersion using weighted likelihood empirical Bayes.

estimateDisp <- function(y, ...)
UseMethod("estimateDisp")

estimateDisp.DGEList <- function(y, design=NULL, prior.df=NULL, trend.method="locfit", tagwise=TRUE, span=NULL, min.row.sum=5, grid.length=21, grid.range=c(-10,10), robust=FALSE, winsor.tail.p=c(0.05,0.1), tol=1e-06, ...)
#  Yunshun Chen.
#  Created 16 March 2016. Last modified 16 Oct 2019.
{
	y <- validDGEList(y)
	group <- y$samples$group
	lib.size <- y$samples$lib.size * y$samples$norm.factors

	if(is.null(design)) {
		design <- y$design
	} else {
		y$design <- design
	}

	d <- estimateDisp(y=y$counts, design=design, group=group, lib.size=lib.size, offset=getOffset(y), prior.df=prior.df, trend.method=trend.method, tagwise=tagwise, span=span, min.row.sum=min.row.sum, grid.length=grid.length, grid.range=grid.range, robust=robust, winsor.tail.p=winsor.tail.p, tol=tol, weights=y$weights, ...)

	y$common.dispersion <- d$common.dispersion
	y$trended.dispersion <- d$trended.dispersion
	if(tagwise) y$tagwise.dispersion <- d$tagwise.dispersion
	y$AveLogCPM <- aveLogCPM(y)
	y$trend.method <- trend.method
	y$prior.df <- d$prior.df
	y$prior.n <- d$prior.n
	y$span <- d$span
	y
}
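
# A minimal usage sketch (not part of the package source): 'counts' and 'grp'
# below are hypothetical objects. estimateDisp() is typically run on a DGEList
# after normalization and before glmFit()/glmQLFit():
#
#   counts <- matrix(rnbinom(4000, mu=10, size=2), nrow=1000, ncol=4)
#   grp <- factor(c(1,1,2,2))
#   y <- DGEList(counts=counts, group=grp)
#   y <- calcNormFactors(y)
#   design <- model.matrix(~grp)
#   y <- estimateDisp(y, design, robust=TRUE)
#   summary(y$tagwise.dispersion)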

estimateDisp.SummarizedExperiment <- function(y, design=NULL, prior.df=NULL, trend.method="locfit", tagwise=TRUE, span=NULL, min.row.sum=5, grid.length=21, grid.range=c(-10,10), robust=FALSE, winsor.tail.p=c(0.05,0.1), tol=1e-06, ...)
#  Yunshun Chen.
#  Created 19 March 2020. Last modified 19 March 2020.
{
	y <- SE2DGEList(y)
	y <- estimateDisp.DGEList(y, design=design, prior.df=prior.df, trend.method=trend.method, tagwise=tagwise, span=span, min.row.sum=min.row.sum, grid.length=grid.length, grid.range=grid.range, robust=robust, winsor.tail.p=winsor.tail.p, tol=tol, ...)
	y
}

estimateDisp.default <- function(y, design=NULL, group=NULL, lib.size=NULL, offset=NULL, prior.df=NULL, trend.method="locfit", tagwise=TRUE, span=NULL, min.row.sum=5, grid.length=21, grid.range=c(-10,10), robust=FALSE, winsor.tail.p=c(0.05,0.1), tol=1e-06, weights=NULL, ...)
#  Estimate common, trended and tagwise dispersions.
#  Use the GLM approach if a design matrix is given, and the classic approach otherwise.
#  A matrix of log-likelihoods is computed for each gene at a set of dispersion grid points,
#  and WLEB() is called for weighted likelihood empirical Bayes shrinkage.
#  Yunshun Chen, Aaron Lun, Gordon Smyth.
#  Created July 2012. Last modified 16 Oct 2019.
{
#	Check y
	y <- as.matrix(y)
	ntags <- nrow(y)
	if(ntags==0) return(list(span=span, prior.df=prior.df, prior.n=NULL))
	nlibs <- ncol(y)
	
#	Check trend.method
	trend.method <- match.arg(trend.method, c("none", "loess", "locfit", "movingave", "locfit.mixed"))
	
#	Check group
	if(is.null(group)) group <- rep(1, nlibs)
	if(length(group)!=nlibs) stop("Incorrect length of group.")
	group <- dropEmptyLevels(group)

#	Check lib.size
	if(is.null(lib.size)) lib.size <- colSums(y)
	if(length(lib.size)!=nlibs) stop("Incorrect length of lib.size.")
	
#	Check offset
	offset <- .compressOffsets(y, lib.size=lib.size, offset=offset)

#	Check weights
	weights <- .compressWeights(y, weights)

#	Check for genes with small counts
	sel <- rowSums(y) >= min.row.sum
	sely <- .subsetMatrixWithoutCopying(y, i=sel)
	seloffset <- .subsetMatrixWithoutCopying(offset, i=sel)
	selweights <- .subsetMatrixWithoutCopying(weights, i=sel)
	
#	Spline points
	spline.pts <- seq(from=grid.range[1], to=grid.range[2], length.out=grid.length)
	spline.disp <- 0.1 * 2^spline.pts
	grid.vals <- spline.disp/(1+spline.disp)
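	# The grid above is on a log2 scale centred at dispersion 0.1: with the
	# default grid.range=c(-10,10) and grid.length=21, candidate dispersions
	# run from 0.1*2^-10 (about 9.8e-5) to 0.1*2^10 (102.4) in steps of one
	# log2 unit. grid.vals is the same grid on the delta=disp/(1+disp) scale
	# used by the classic conditional likelihood functions.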
	l0 <- matrix(0, sum(sel), grid.length)

#	Classic edgeR
	if(is.null(design)){
		# One-way layout
		message("Using classic mode.")
		if(length(levels(group))==1)
			design <- matrix(1, nlibs, 1)
		else
			design <- model.matrix(~group)
		if( all(tabulate(group)<=1) ) {
			warning("There is no replication, setting dispersion to NA.")
			return(list(common.dispersion=NA_real_, trended.dispersion=NA_real_, tagwise.dispersion=NA_real_))
		}
		
		eq <- equalizeLibSizes(y, group=group, dispersion=0.01, lib.size=lib.size)
		y.pseudo <- eq$pseudo.counts[sel, , drop=FALSE]
		y.split <- splitIntoGroups(y.pseudo, group=group)
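		# delta = disp/(1+disp) maps dispersion from (0,Inf) to (0,1), giving
		# optimize() a bounded interval; the maximizing delta is transformed
		# back to the dispersion scale below.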
		delta <- optimize(commonCondLogLikDerDelta, interval=c(1e-4,100/(100+1)), tol=tol, maximum=TRUE, y=y.split, der=0)
		delta <- delta$maximum
		disp <- delta/(1-delta)

		eq <- equalizeLibSizes(y, group=group, dispersion=disp, lib.size=lib.size)
		y.pseudo <- eq$pseudo.counts[sel, , drop=FALSE]
		y.split <- splitIntoGroups(y.pseudo, group=group)
	
		for(j in 1:grid.length) for(i in 1:length(y.split)) 
			l0[,j] <- condLogLikDerDelta(y.split[[i]], grid.vals[j], der=0) + l0[,j]
	}
	# GLM edgeR 
	else {
		design <- as.matrix(design)
		if(ncol(design) >= nlibs) {
			warning("No residual df: setting dispersion to NA")
			return(list(common.dispersion=NA_real_, trended.dispersion=NA_real_, tagwise.dispersion=NA_real_))
		}

		# Identify which observations have means of zero (weights aren't needed here).
		glmfit <- glmFit(sely, design, offset=seloffset, dispersion=0.05, prior.count=0)
		zerofit <- (glmfit$counts < 1e-4 & glmfit$fitted.values < 1e-4)
		by.group <- .comboGroups(zerofit)
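		# .comboGroups() clusters genes sharing the same pattern of all-zero
		# samples, so each cluster can be fitted with a common reduced design
		# rather than handling every gene separately.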

		for (subg in by.group) { 
			cur.nzero <- !zerofit[subg[1],]
			if (!any(cur.nzero)) { next } 

			# Removing samples with zero means from design, to avoid attempts to converge to -Inf.
			if (all(cur.nzero)) { 
				redesign <- design
			} else {
				redesign <- design[cur.nzero,,drop=FALSE]
				QR <- qr(redesign)
				redesign <- redesign[,QR$pivot[1:QR$rank],drop=FALSE]
				if (nrow(redesign) == ncol(redesign)) { next }
			}

			cury <- .subsetMatrixWithoutCopying(sely, i=subg, j=cur.nzero)
			curo <- .subsetMatrixWithoutCopying(seloffset, i=subg, j=cur.nzero)
			curw <- .subsetMatrixWithoutCopying(selweights, i=subg, j=cur.nzero)

			# Using the last fit to hot-start the next fit
			last.beta <- NULL
			for(i in seq_len(grid.length)) {
				out <- adjustedProfileLik(spline.disp[i], y=cury, design=redesign, 
					offset=curo, weights=curw, start=last.beta, get.coef=TRUE)
				l0[subg,i] <- out$apl
				last.beta <- out$beta
			}
		}
	}

	# Calculate common dispersion
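	# maximizeInterpolant() fits a smooth interpolant to the pooled
	# log-likelihood over the grid and returns the maximizing (possibly
	# off-grid) position, which is mapped back from the log2 scale.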
	overall <- maximizeInterpolant(spline.pts, matrix(colSums(l0), nrow=1))
	common.dispersion <- 0.1 * 2^overall

	# Allow dispersion trend?
	if(trend.method!="none"){
		AveLogCPM <- aveLogCPM(y, lib.size=lib.size, dispersion=common.dispersion, weights=weights)
		out.1 <- WLEB(theta=spline.pts, loglik=l0, covariate=AveLogCPM[sel], trend.method=trend.method, 
			span=span, overall=FALSE, individual=FALSE, m0.out=TRUE)
		span <- out.1$span
		m0 <- out.1$shared.loglik
		disp.trend <- 0.1 * 2^out.1$trend
		trended.dispersion <- rep( disp.trend[which.min(AveLogCPM[sel])], ntags )
		trended.dispersion[sel] <- disp.trend
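		# Genes filtered out by min.row.sum are assigned the trend value at
		# the lowest observed abundance, which suits their low counts.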
	} else {
		AveLogCPM <- NULL
		# m0 must have the same number of rows as l0 for the WLEB call below
		m0 <- matrix(colMeans(l0), nrow(l0), length(spline.pts), byrow=TRUE)
		disp.trend <- common.dispersion
		trended.dispersion <- NULL
	}
	
	# Are tagwise dispersions required?
	if(!tagwise) return(list(common.dispersion=common.dispersion, trended.dispersion=trended.dispersion))

	# Calculate prior.df
	if(is.null(prior.df)){
		glmfit <- glmFit(sely, offset=seloffset, weights=selweights, design=design, dispersion=disp.trend, prior.count=0)

		# Residual df, adjusted for observations with fitted values of zero
		zerofit <- (glmfit$counts < 1e-4 & glmfit$fitted.values < 1e-4)
		df.residual <- .residDF(zerofit, design)

		# Empirical Bayes squeezing of the quasi-likelihood variance factors
		s2 <- glmfit$deviance / df.residual
		s2[df.residual==0] <- 0
		s2 <- pmax(s2,0)
		s2.fit <- squeezeVar(s2, df=df.residual, covariate=AveLogCPM[sel], robust=robust, winsor.tail.p=winsor.tail.p)
		prior.df <- s2.fit$df.prior
	}
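	# prior.n expresses the prior df per residual df: it is the weight given
	# to the shared (common or trended) log-likelihood relative to a single
	# gene's own log-likelihood in the WLEB call below.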
	ncoefs <- ncol(design)
	prior.n <- prior.df/(nlibs-ncoefs)

	# Initialize tagwise dispersions
	if(trend.method!="none")
		tagwise.dispersion <- trended.dispersion
	else
		tagwise.dispersion <- rep(common.dispersion, ntags)

	# Checking if the shrinkage is near-infinite.
	too.large <- prior.n > 1e6
	if (!all(too.large)) { 
		temp.n <- prior.n
		if (any(too.large)) { 
			temp.n[too.large] <- 1e6 
		}

		# Estimating tagwise dispersions
		out.2 <- WLEB(theta=spline.pts, loglik=l0, prior.n=temp.n, covariate=AveLogCPM[sel], 
			trend.method=trend.method, span=span, overall=FALSE, trend=FALSE, m0=m0)

		if (!robust) { 
			tagwise.dispersion[sel] <- 0.1 * 2^out.2$individual
		} else {
			tagwise.dispersion[sel][!too.large] <- 0.1 * 2^out.2$individual[!too.large]
		}
	}

	if(robust) {
		temp.df <- prior.df
		temp.n <- prior.n
		prior.df <- prior.n <- rep(Inf, ntags)
		prior.df[sel] <- temp.df
		prior.n[sel] <- temp.n
	}

	list(common.dispersion=common.dispersion, trended.dispersion=trended.dispersion, tagwise.dispersion=tagwise.dispersion, span=span, prior.df=prior.df, prior.n=prior.n)
}


WLEB <- function(theta, loglik, prior.n=5, covariate=NULL, trend.method="locfit", span=NULL, 
	overall=TRUE, trend=TRUE, individual=TRUE, m0=NULL, m0.out=FALSE)
#  Weighted likelihood empirical Bayes for estimating a parameter vector theta
#  given log-likelihood values on a grid of theta values
#  Yunshun Chen, Gordon Smyth
#	Created July 2012. Last modified 16 Oct 2019.
{
#	Check loglik
	loglik <- as.matrix(loglik)
	ntheta <- ncol(loglik)
	ntags <- nrow(loglik)

#	Check covariate and trend
	if(is.null(covariate))
		trend.method <- "none"
	else
		trend.method <- match.arg(trend.method, c("none", "loess", "locfit", "movingave", "locfit.mixed"))

#	Set span
	if(is.null(span)) {
		if(ntags<=50) span <- 1 else span <- 0.25+0.75*(50/ntags)^0.5
	}
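#	The default span shrinks as the number of genes grows, keeping the trend
#	local for large data sets while using all genes when ntags <= 50.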

#	Output	
	out <- list()
	out$span <- span

#	overall prior
	if(overall)
		out$overall <- maximizeInterpolant(theta, matrix(colSums(loglik), nrow=1))

#	trended prior
	if(is.null(m0)) m0 <- switch(trend.method,
		"movingave" = {
			o <- order(covariate)
			oo <- order(o)
			movingAverageByCol(loglik[o,], width=floor(span*ntags))[oo,]
		},
		"loess" = loessByCol(loglik, covariate, span=span)$fitted.values,
		"locfit" = locfitByCol(loglik, covariate, span=span, degree=0),
		"locfit.mixed" = {
			deg0 <- locfitByCol(loglik, covariate, span=span, degree=0)
			deg1 <- locfitByCol(loglik, covariate, span=span, degree=1)
			r <- range(covariate)
			w <- pbeta((covariate-r[1])/(r[2]-r[1]),shape1=2,shape2=2)
			w*deg0 + (1-w)*deg1
		},
		"none" = matrix(colMeans(loglik), ntags, length(theta), byrow=TRUE)
	)

#	make sure each row of m0 is unimodal
	if(trend.method=="locfit.mixed"){
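		# Scanning the grid from right to left: any interior local minimum at
		# position i-1 (a dip between two higher neighbours) is removed by
		# flattening all values to its left down to the value at i-1.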
		for(i in ncol(m0):3){
			diff1_m0 <- m0[,i] - m0[,i-1]
			diff2_m0 <- m0[,i-1] - m0[,i-2]
			k <- which(diff1_m0>0 & diff2_m0<0)
			m0[k,1:(i-2)] <- m0[k,(i-1)]
		}
	}

	if(trend)
		out$trend <- maximizeInterpolant(theta, m0)

#	weighted empirical Bayes posterior estimates
	if(individual){
		stopifnot(all(is.finite(prior.n)))
		l0a <- loglik + prior.n*m0
		out$individual <- maximizeInterpolant(theta, l0a)
	}

	if(m0.out) out$shared.loglik <- m0

	out
}
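
# A minimal sketch of calling WLEB() directly (illustrative only; all objects
# below are hypothetical). Per-gene log-likelihoods are evaluated on a grid of
# log2-dispersion points and then squeezed towards the abundance trend:
#
#   spline.pts <- seq(-8, 8, length.out=11)
#   disp <- 0.1 * 2^spline.pts
#   y <- matrix(rnbinom(400, mu=20, size=5), nrow=100)
#   loglik <- sapply(disp, function(d)
#       rowSums(matrix(dnbinom(y, mu=rowMeans(y), size=1/d, log=TRUE), nrow=nrow(y))))
#   fit <- WLEB(theta=spline.pts, loglik=loglik, prior.n=10,
#       covariate=log2(rowMeans(y)))
#   tagwise.disp <- 0.1 * 2^fit$individual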

.subsetMatrixWithoutCopying <- function(x, i, j) 
# This will attempt to subset the matrix without any copying if
# it detects that 'i' and 'j' don't modify the ordering of the matrix.
# This reduces the memory footprint for large matrices.
#
# written by Aaron Lun
# created 29 September 2016
# last modified 16 December 2018
{
	isokay <- TRUE
	if (!missing(i)) {
		# Most flexible way of handling different types of subset vectors;
		# try it out and see if it gives the same results.
		example <- cbind(seq_len(nrow(x)))
		rownames(example) <- rownames(x)
		if (!identical(example, example[i,,drop=FALSE])) isokay <- FALSE
	}
	if (!missing(j)) {
		example <- rbind(seq_len(ncol(x)))
		colnames(example) <- colnames(x)
		if (!identical(example, example[,j,drop=FALSE])) isokay <- FALSE
	}

	if (isokay) {
		# Avoids copying if no modification incurred.
		return(x)
	} else if (!missing(i) && !missing(j)) {
		return(x[i,j,drop=FALSE])
	} else if (!missing(i)) {
		return(x[i,,drop=FALSE])
	} else {
		return(x[,j,drop=FALSE])
	}
}
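
# For example (hypothetical call): .subsetMatrixWithoutCopying(x, i=seq_len(nrow(x)))
# returns 'x' itself with no copy, whereas a reordering such as i=c(2,1) falls
# through to an ordinary copying "[" subset.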