1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
|
spliceVariants <- function(y, geneID, dispersion=NULL, group=NULL, estimate.genewise.disp=TRUE, trace=FALSE)
# Identify genes with splice variants using a negative binomial model
# We assume that the data come in a matrix (possibly and/or a DGEList), counts summarized at exon level, with gene information available
# Davis McCarthy and Gordon Smyth
# Created 4 February 2011. Last modified 3 Oct 2016.
{
if( is(y, "DGEList") ) {
y.mat <- y$counts
if( is.null(group) )
group <- y$samples$group
lib.size <- y$samples$lib.size * y$samples$norm.factors
}
else {
y.mat <- as.matrix(y)
if( is.null(group) )
stop("y is a matrix and no group argument has been supplied. Please supply group argument.")
lib.size <- colSums(y)
}
geneID <- as.vector(unlist(geneID))
# Order genes by geneID: we need some way to reorganise the data---output cannot possibly be same dimension as input so this is a sensible way to organise things
o <- order(geneID)
geneID <- geneID[o]
y.mat <- y.mat[o,]
uniqIDs <- unique(geneID)
# Organise the dispersion values to be used in the NB models
if( is.null(dispersion) ) {
if(estimate.genewise.disp) {
dispersion <- estimateExonGenewiseDisp(y.mat, geneID, group)
genewise.disp <- TRUE
if(trace)
cat("Computing genewise dispersions from exon counts.\n")
}
else {
dispersion <- estimateCommonDisp(DGEList(counts=y.mat, group=group))
genewise.disp <- FALSE
if(trace)
cat("Computing common dispersion across all exons.\n")
}
}
else {
if( length(dispersion)==length(uniqIDs) ) {
if(is.null(names(dispersion)))
stop("names(dispersion) is NULL. All names of dispersion must be unique geneID.\n")
matches <- match(uniqIDs, names(dispersion))
if( any(is.na(matches)) | any(duplicated(matches)))
stop("names(dispersion) of dispersion do not have a one-to-one mapping to unique geneID. All names of dispersion must be unique geneID.\n")
dispersion <- dispersion[matches]
genewise.disp <- TRUE
}
if( length(dispersion)==1 ) {
genewise.disp <- FALSE
}
}
# Can't get any results if there are no counts for an exon, so remove all-zero exons
keep <- rowSums(y.mat) > 0
exons <- y.mat[keep,]
rownames(exons) <- geneID[keep]
uniqIDs <- unique(geneID[keep])
na.vec <- rep_len(NA_real_, length(uniqIDs))
if(genewise.disp)
dispersion <- dispersion[names(dispersion) %in% uniqIDs]
else {
dispersion <- rep_len(dispersion, length(uniqIDs))
names(dispersion) <- uniqIDs
}
# We want to know how many exons each gene has
dummy <- rowsum(rep_len(1, nrow(exons)), rownames(exons))
nexons <- as.vector(dummy)
names(nexons) <- rownames(dummy)
mm <- match(uniqIDs,names(nexons))
nexons <- nexons[mm]
if(trace)
cat("Max number exons: ",max(nexons),"\n")
# Genes with the same number of exons have the same design matrix, allowing some parallelization of computations
splicevars.out <- data.frame(logFC= na.vec, logCPM = na.vec, LR = na.vec, PValue = na.vec)
rownames(splicevars.out) <- uniqIDs
abundance <- na.vec
# For loop iterates over number of exons for genes, starting at 2 (can't have splice variants if only one exon!)
for(i.exons in sort(unique(nexons))) {
# Select the genes with the right number of exons for this iteration
this.genes <- nexons==i.exons
full.index <- rownames(exons) %in% uniqIDs[this.genes]
if( any(this.genes) ) {
gene.counts.mat <- matrix(t(exons[full.index,]), nrow=sum(this.genes), ncol=ncol(exons)*i.exons, byrow=TRUE)
expanded.lib.size <- rep.int(lib.size, i.exons)
if(i.exons==1) {
abundance[this.genes] <- aveLogCPM(gene.counts.mat, lib.size=expanded.lib.size)
splicevars.out$LR[this.genes] <- 0
splicevars.out$PValue[this.genes] <- 1
}
else {
exon.this <- factor(rep(1:i.exons, each=ncol(exons)))
group.this <- as.factor(rep.int(group, i.exons))
# Define design matrices for this group of genes
X.full <- model.matrix(~ exon.this + group.this + exon.this:group.this )
X.null <- model.matrix(~ exon.this + group.this )
coef <- (ncol(X.null)+1):ncol(X.full)
# Fit NB GLMs to these genes
fit.this <- glmFit(gene.counts.mat, X.full, dispersion[this.genes], offset=0, prior.count=0)
abundance[this.genes] <- aveLogCPM(gene.counts.mat, lib.size=expanded.lib.size)
results.this <- glmLRT(fit.this, coef=coef)
if(sum(this.genes)==1) {
splicevars.out$LR[this.genes] <- results.this$table$LR[1]
splicevars.out$PValue[this.genes] <- results.this$table$PValue[1]
}
else {
splicevars.out$LR[this.genes] <- results.this$table$LR
splicevars.out$PValue[this.genes] <- results.this$table$PValue
}
}
}
}
splicevars.out$logCPM <- abundance
# Create a list with the exons divided up neatly by geneID (a bit slow using in-built fn)
# Not really necessary, so leave out for the time being
# exon.list <- split(as.data.frame(exons), rownames(exons))
# names(exon.list) <- uniqIDs
if(!genewise.disp)
dispersion <- dispersion[1]
new("DGEExact",list(table=splicevars.out, comparison=NULL, genes=data.frame(GeneID=uniqIDs), dispersion=dispersion))
}
estimateExonGenewiseDisp <- function(y, geneID, group=NULL)
# Function to estimate a common dispersion from exon count data
# Created by Davis McCarthy, 29 July 2011.
# Last modified 7 Aug 2019.
{
# Check objects coming in
if( is(y, "DGEList") ) {
y.mat <- y$counts
if( is.null(group) )
group <- y$samples$group
}
else {
y.mat <- as.matrix(y)
if( is.null(group) )
stop("y is a matrix and no group argument has been supplied. Please supply group argument\n")
}
geneID <- as.vector(unlist(geneID))
# Cannot maintain order of the argument y, so order on geneID so that we have some sensible organisation
o <- order(geneID)
geneID <- geneID[o]
y.mat <- y.mat[o,]
# Sum counts from all exons for each gene to get gene-level counts and form DGEList object
gene.counts <- rowsum(y.mat, geneID)
genewise.disp <- rep_len(NA_real_, nrow(gene.counts))
names(genewise.disp) <- rownames(gene.counts)
gene.data <- DGEList(counts=gene.counts, group=group)
# Cannot properly compute dispersion if there are no counts for the gene
used <- rowSums(gene.data$counts) > 0
# Need to first estimate a common dispersion
gene.data <- estimateCommonDisp(gene.data[used,])
# Next estimate tagwise dispersion for each gene, with trend. Default prop.used=2/3, grid.length=200
gene.data <- estimateTagwiseDisp(gene.data, trend="movingave")
# For those gene which have sufficient (>0) counts, assign estimated dispersion
genewise.disp[used] <- gene.data$tagwise.dispersion
# For those genes with zero counts, assign maximum estimated dispersion value
genewise.disp[!used] <- max(gene.data$tagwise.dispersion)
genewise.disp
}
plotExonUsage <- function(y, geneID, group=NULL, transform="none", counts.per.million=TRUE, legend.coords=NULL, ...)
# Plots exon usage from a matrix, DGEList or list of exon counts
# Created by Davis McCarthy, 30 July 2011.
# Last modified 2 Aug 2011.
{
if( is(y,"DGEList") ) {
ind <- rownames(y$counts) %in% geneID
counts <- y$counts
if(counts.per.million)
counts <- cpm(counts)
exon.mat <- counts[ind,]
group <- y$samples$group
}
else {
if( is(y,"list") ) {
exon.mat <- y[[geneID]]
if(counts.per.million)
stop("Counts per million cannot easily be computed when y is a list. Please use either a DGEList object or a matrix of counts.\n")
if(is.null(group))
stop("Group argument must be supplied.\n")
}
else {
if(is.null(group))
stop("Group argument must be supplied.\n")
if(counts.per.million)
y <- cpm(y)
ind <- rownames(y) %in% geneID
exon.mat <- y[ind,]
}
}
transform <- match.arg(transform, c("none", "log2", "sqrt"))
if(transform=="none") {
if(counts.per.million)
ylab <- "Counts per million"
else
ylab <- "Read counts"
}
if(transform=="log2") {
exon.mat <- log2(exon.mat + 0.5)
if(counts.per.million)
ylab <- "log2( Counts per million )"
else
ylab <- "log2( Read counts )"
}
if(transform=="sqrt") {
exon.mat <- sqrt(exon.mat)
if(counts.per.million)
ylab <- "sqrt( Counts per million )"
else
ylab <- "sqrt( Read counts )"
}
if(length(group)!=ncol(exon.mat))
stop("Length of group vector does not match number of samples (columns of exon matrix).\n")
cols <- 1:nlevels(group)
plot(exon.mat[,1], type="b", col=cols[group[1]], panel.first=grid(), ylab=ylab, xlab="Exon", main=paste("GeneID: ",geneID), ylim=c(0, max(exon.mat)), ...)
for( i in 2:ncol(exon.mat) )
lines(exon.mat[,i], type="b", col=cols[group[i]], ...)
if(is.null(legend.coords)) {
legend.coords <- rep_len(NA_real_,2L)
legend.coords[1] <- 0.8*nrow(exon.mat)
legend.coords[2] <- 0.9*max(exon.mat)
}
legend(legend.coords[1], legend.coords[2], levels(group), col=cols, lwd=2)
invisible(exon.mat)
}
|