File: spliceVariants.R

package info (click to toggle)
r-bioc-edger 3.40.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,484 kB
  • sloc: cpp: 1,425; ansic: 1,109; sh: 21; makefile: 5
file content (231 lines) | stat: -rw-r--r-- 8,813 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
spliceVariants <- function(y, geneID, dispersion=NULL, group=NULL, estimate.genewise.disp=TRUE, trace=FALSE)
# Identify genes with splice variants using a negative binomial model
# We assume that the data come in a matrix (possibly and/or a DGEList), counts summarized at exon level, with gene information available
# Davis McCarthy and Gordon Smyth
# Created 4 February 2011.  Last modified 3 Oct 2016. 
{
	if( is(y, "DGEList") ) {
		y.mat <- y$counts
		if( is.null(group) )
			group <- y$samples$group
		lib.size <- y$samples$lib.size * y$samples$norm.factors
	}
	else {
		y.mat <- as.matrix(y)
		if( is.null(group) )
			stop("y is a matrix and no group argument has been supplied. Please supply group argument.")
		lib.size <- colSums(y)
	}

	geneID <- as.vector(unlist(geneID))
#	Order genes by geneID: we need some way to reorganise the data---output cannot possibly be same dimension as input so this is a sensible way to organise things
	o <- order(geneID)
	geneID <- geneID[o]
	y.mat <- y.mat[o,]
	uniqIDs <- unique(geneID)
#	Organise the dispersion values to be used in the NB models
	if( is.null(dispersion) ) {
		if(estimate.genewise.disp) {
			dispersion <- estimateExonGenewiseDisp(y.mat, geneID, group)
			genewise.disp <- TRUE
			if(trace)
				cat("Computing genewise dispersions from exon counts.\n")
		}
		else {
			dispersion <- estimateCommonDisp(DGEList(counts=y.mat, group=group))
			genewise.disp <- FALSE
			if(trace)
				cat("Computing common dispersion across all exons.\n")
		}
	}
	else {
		if( length(dispersion)==length(uniqIDs) ) {
			if(is.null(names(dispersion)))
				stop("names(dispersion) is NULL. All names of dispersion must be unique geneID.\n")
			matches <- match(uniqIDs, names(dispersion))
			if( any(is.na(matches)) | any(duplicated(matches)))
			   stop("names(dispersion) of dispersion do not have a one-to-one mapping to unique geneID. All names of dispersion must be unique geneID.\n")
			dispersion <- dispersion[matches]
			genewise.disp <- TRUE
		}
		if( length(dispersion)==1 ) {
			genewise.disp <- FALSE
		}
	}
#	Can't get any results if there are no counts for an exon, so remove all-zero exons
	keep <- rowSums(y.mat) > 0
	exons <- y.mat[keep,]
	rownames(exons) <- geneID[keep]
	uniqIDs <- unique(geneID[keep])
	na.vec <- rep_len(NA_real_, length(uniqIDs))
	if(genewise.disp)
		dispersion <- dispersion[names(dispersion) %in% uniqIDs]
	else {
		dispersion <- rep_len(dispersion, length(uniqIDs))
		names(dispersion) <- uniqIDs
	}
#	We want to know how many exons each gene has
	dummy <- rowsum(rep_len(1, nrow(exons)), rownames(exons))
	nexons <- as.vector(dummy)
	names(nexons) <- rownames(dummy)
	mm <- match(uniqIDs,names(nexons))
	nexons <- nexons[mm]
	if(trace)
		cat("Max number exons: ",max(nexons),"\n")
#	Genes with the same number of exons have the same design matrix, allowing some parallelization of computations
	splicevars.out <- data.frame(logFC= na.vec, logCPM = na.vec, LR = na.vec, PValue = na.vec)
	rownames(splicevars.out) <- uniqIDs
	abundance <- na.vec
#	For loop iterates over number of exons for genes, starting at 2 (can't have splice variants if only one exon!)
	for(i.exons in sort(unique(nexons))) {
	#	Select the genes with the right number of exons for this iteration
		this.genes <- nexons==i.exons
		full.index <- rownames(exons) %in% uniqIDs[this.genes]
		if( any(this.genes) ) {
			gene.counts.mat <- matrix(t(exons[full.index,]), nrow=sum(this.genes), ncol=ncol(exons)*i.exons, byrow=TRUE)
			expanded.lib.size <- rep.int(lib.size, i.exons)
			if(i.exons==1) {
				abundance[this.genes] <- aveLogCPM(gene.counts.mat, lib.size=expanded.lib.size)
				splicevars.out$LR[this.genes] <- 0
				splicevars.out$PValue[this.genes] <- 1
			}
			else {
				exon.this <- factor(rep(1:i.exons, each=ncol(exons)))
				group.this <- as.factor(rep.int(group, i.exons))
			#	Define design matrices for this group of genes
				X.full <- model.matrix(~ exon.this + group.this + exon.this:group.this )
				X.null <- model.matrix(~ exon.this + group.this )
				coef <- (ncol(X.null)+1):ncol(X.full)
			#	Fit NB GLMs to these genes
				fit.this <- glmFit(gene.counts.mat, X.full, dispersion[this.genes], offset=0, prior.count=0)
				abundance[this.genes] <- aveLogCPM(gene.counts.mat, lib.size=expanded.lib.size)
				results.this <- glmLRT(fit.this, coef=coef)
				if(sum(this.genes)==1) {
					splicevars.out$LR[this.genes] <- results.this$table$LR[1]
					splicevars.out$PValue[this.genes] <- results.this$table$PValue[1]
				}
				else {
					splicevars.out$LR[this.genes] <- results.this$table$LR
					splicevars.out$PValue[this.genes] <- results.this$table$PValue
				}
			}
		}
	}
	splicevars.out$logCPM <- abundance
#	Create a list with the exons divided up neatly by geneID (a bit slow using in-built fn)
#	Not really necessary, so leave out for the time being
#	exon.list <- split(as.data.frame(exons), rownames(exons))
#	names(exon.list) <- uniqIDs
	if(!genewise.disp)
		dispersion <- dispersion[1]
	new("DGEExact",list(table=splicevars.out, comparison=NULL, genes=data.frame(GeneID=uniqIDs), dispersion=dispersion))
}



estimateExonGenewiseDisp <- function(y, geneID, group=NULL)
#	Function to estimate a common dispersion from exon count data
#	Created by Davis McCarthy, 29 July 2011.
#	Last modified 7 Aug 2019.
{
#	Check objects coming in
	if( is(y, "DGEList") ) {
		y.mat <- y$counts
		if( is.null(group) )
			group <- y$samples$group
	}
	else {
		y.mat <- as.matrix(y)
		if( is.null(group) )
			stop("y is a matrix and no group argument has been supplied. Please supply group argument\n")
	}
	geneID <- as.vector(unlist(geneID))
#	Cannot maintain order of the argument y, so order on geneID so that we have some sensible organisation
	o <- order(geneID)
	geneID <- geneID[o]
	y.mat <- y.mat[o,]
#	Sum counts from all exons for each gene to get gene-level counts and form DGEList object
	gene.counts <- rowsum(y.mat, geneID)
	genewise.disp <- rep_len(NA_real_, nrow(gene.counts))
	names(genewise.disp) <- rownames(gene.counts)
	gene.data <- DGEList(counts=gene.counts, group=group)
#	Cannot properly compute dispersion if there are no counts for the gene
	used <- rowSums(gene.data$counts) > 0
#	Need to first estimate a common dispersion
	gene.data <- estimateCommonDisp(gene.data[used,])
#	Next estimate tagwise dispersion for each gene, with trend. Default prop.used=2/3, grid.length=200
	gene.data <- estimateTagwiseDisp(gene.data, trend="movingave")
#	For those gene which have sufficient (>0) counts, assign estimated dispersion
	genewise.disp[used] <- gene.data$tagwise.dispersion
#	For those genes with zero counts, assign maximum estimated dispersion value
	genewise.disp[!used] <- max(gene.data$tagwise.dispersion)
	genewise.disp
}


plotExonUsage <- function(y, geneID, group=NULL, transform="none", counts.per.million=TRUE, legend.coords=NULL, ...)
#	Plots exon usage from a matrix, DGEList or list of exon counts
#	Created by Davis McCarthy, 30 July 2011.
#	Last modified 2 Aug 2011.
{
	if( is(y,"DGEList") ) {
		ind <- rownames(y$counts) %in% geneID
		counts <- y$counts
		if(counts.per.million)
			counts <- cpm(counts)
		exon.mat <- counts[ind,]
		group <- y$samples$group
	}
	else {
		if( is(y,"list") ) {
			exon.mat <- y[[geneID]]
			if(counts.per.million)
				stop("Counts per million cannot easily be computed when y is a list. Please use either a DGEList object or a matrix of counts.\n")
			if(is.null(group))
				stop("Group argument must be supplied.\n")
		}
		else {
			if(is.null(group))
				stop("Group argument must be supplied.\n")
			if(counts.per.million)
				y <- cpm(y)
			ind <- rownames(y) %in% geneID
			exon.mat <- y[ind,]
		}
	}
	transform <- match.arg(transform, c("none", "log2", "sqrt"))
	if(transform=="none") {
		if(counts.per.million)
			ylab <- "Counts per million"
		else
			ylab <- "Read counts"
	}
	if(transform=="log2") {
		exon.mat <- log2(exon.mat + 0.5)
		if(counts.per.million)
			ylab <- "log2( Counts per million )"
		else
			ylab <- "log2( Read counts )"
	}
	if(transform=="sqrt") {
		exon.mat <- sqrt(exon.mat)
		if(counts.per.million)
			ylab <- "sqrt( Counts per million )"
		else
			ylab <- "sqrt( Read counts )"
	}
	if(length(group)!=ncol(exon.mat))
		stop("Length of group vector does not match number of samples (columns of exon matrix).\n")
	cols <- 1:nlevels(group)
	plot(exon.mat[,1], type="b", col=cols[group[1]], panel.first=grid(), ylab=ylab, xlab="Exon", main=paste("GeneID: ",geneID), ylim=c(0, max(exon.mat)), ...)
	for( i in 2:ncol(exon.mat) )
		lines(exon.mat[,i], type="b", col=cols[group[i]], ...)
	if(is.null(legend.coords)) {
		legend.coords <- rep_len(NA_real_,2L)
		legend.coords[1] <- 0.8*nrow(exon.mat)
		legend.coords[2] <- 0.9*max(exon.mat)
	}
	legend(legend.coords[1], legend.coords[2], levels(group), col=cols, lwd=2)
	invisible(exon.mat)
}