1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
|
voomLmFit <- function(
counts, design=NULL, block=NULL, prior.weights=NULL,
sample.weights=FALSE, var.design=NULL, var.group=NULL,
lib.size=NULL, normalize.method="none",
span=0.5, plot=FALSE, save.plot=FALSE, keep.EList=TRUE
)
# limma+lmFit pipeline for counts taking into account of structural zeros
# Creates an MArrayLM object for entry to eBayes() etc in the limma pipeline.
# Depends on edgeR as well as limma
# Gordon Smyth
# Created 21 Jan 2020. Last modified 16 Mar 2022.
{
Block <- !is.null(block)
PriorWeights <- !is.null(prior.weights)
SampleWeights <- sample.weights || !is.null(var.design) || !is.null(var.group)
# Can't specify prior weights and ask for sample weights to be estimated as well
if(PriorWeights && SampleWeights) stop("Can't specify prior.weights and estimate sample weights")
# Create output object
out <- list()
# Extract counts from known data objects
if(is(counts,"SummarizedExperiment")) counts <- SE2DGEList(counts)
if(is(counts,"DGEList")) {
out$genes <- counts$genes
out$targets <- counts$samples
if(is.null(design) && diff(range(as.numeric(counts$sample$group)))>0) design <- model.matrix(~group,data=counts$samples)
if(is.null(lib.size)) lib.size <- effectiveLibSizes(counts)
counts <- counts$counts
} else {
if(is(counts,"eSet")) {
if(!requireNamespace("Biobase",quietly=TRUE))
stop("Biobase package required but is not installed (or can't be loaded)")
if(length(Biobase::fData(counts))) out$genes <- Biobase::fData(counts)
if(length(Biobase::pData(counts))) out$targets <- Biobase::pData(counts)
counts <- get("counts",Biobase::assayData(counts))
} else {
counts <- as.matrix(counts)
}
}
# Check counts
n <- nrow(counts)
if(n < 2L) stop("Need at least two genes to fit a mean-variance trend")
m <- min(counts)
if(is.na(m)) stop("NA counts not allowed")
if(m < 0) stop("Negative counts not allowed")
# Check design
if(is.null(design)) {
design <- matrix(1,ncol(counts),1)
rownames(design) <- colnames(counts)
colnames(design) <- "GrandMean"
}
# Check lib.size
if(is.null(lib.size)) lib.size <- colSums(counts)
# Expand prior.weights if necessary
if(!is.null(prior.weights)) prior.weights <- asMatrixWeights(prior.weights,dim(counts))
# log2-counts-per-million
y <- t(log2(t(counts+0.5)/(lib.size+1)*1e6))
# Microarray-style normalization
y <- normalizeBetweenArrays(y,method=normalize.method)
# Fit linear model
fit <- lmFit(y,design,weights=prior.weights)
# Find largest leverage value of design matrix
if(is.null(fit$qr))
h <- hat(design,intercept=FALSE)
else
h <- hat(fit$qr)
MinGroupSize <- 1/max(h)
# Identify fitted values that are exactly zero and should not contribute to the genewise variances
# Note that a single zero is never a problem
eps <- 1e-4
RowHasZero <- which(rowSums(counts < eps) > (max(2,MinGroupSize)-eps))
AnyZeroRows <- as.logical(length(RowHasZero))
if(AnyZeroRows) {
countsZero <- counts[RowHasZero,,drop=FALSE]
PoissonFit <- glmFit(countsZero,design=design,lib.size=lib.size,dispersion=0,prior.count=0)
IsZero <- (PoissonFit$fitted.values < eps & countsZero < eps)
RowHasExactZero <- which(rowSums(IsZero) > eps)
# If any exact zero fits, then rerun the linear model for those rows with NAs
if(length(RowHasExactZero)) {
RowHasZero <- RowHasZero[RowHasExactZero]
IsZero <- IsZero[RowHasExactZero,,drop=FALSE]
yNAshort <- y[RowHasZero,,drop=FALSE]
yNAshort[IsZero] <- NA
fitNA <- suppressWarnings(lmFit(yNAshort,design,weights=prior.weights[RowHasZero,,drop=FALSE]))
fit$df.residual[RowHasZero] <- fitNA$df.residual
fit$sigma[RowHasZero] <- fitNA$sigma
# If blocking or sample weights are present, then we will later on need a full length copy of y with NAs inserted
if(Block || SampleWeights) {
yNAfull <- y
yNAfull[RowHasZero,] <- yNAshort
}
} else {
AnyZeroRows <- FALSE
}
}
# If no replication found, assume all weights are 1 and return fit already computed
HasRep <- (fit$df.residual > 0L)
NWithReps <- sum(HasRep)
if(NWithReps < 2L) {
if(NWithReps == 0L) warning("The experimental design has no replication. Setting weights to 1.")
if(NWithReps == 1L) warning("Only one gene with any replication. Setting weights to 1.")
fit$genes <- out$genes
return(fit)
}
# Fit lowess trend to sqrt-standard-deviations by log-count-size
Amean <- Amean2 <- rowMeans(y)
if(AnyZeroRows) Amean2[RowHasZero] <- rowMeans(yNAshort,na.rm=TRUE)
sx <- Amean2[HasRep]+mean(log2(lib.size+1))-log2(1e6)
sy <- sqrt(fit$sigma[HasRep])
if(AnyZeroRows)
l <- weightedLowess(sx,sy,span=span,weights=fit$df.residual[HasRep],output.style="lowess")
else
l <- lowess(sx,sy,f=span)
if(plot) {
plot(sx,sy,xlab="log2( count size + 0.5 )",ylab="Sqrt( standard deviation )",pch=16,cex=0.25)
title("voom: Mean-variance trend")
lty <- ifelse(Block || SampleWeights,2,1)
lines(l,col="red",lty=lty)
}
# Make interpolating rule
f <- approxfun(l, rule=2, ties=list("ordered",mean))
# Find individual quarter-root fitted counts
if(fit$rank < ncol(design)) {
j <- fit$pivot[1:fit$rank]
fitted.values <- fit$coefficients[,j,drop=FALSE] %*% t(fit$design[,j,drop=FALSE])
} else {
fitted.values <- fit$coefficients %*% t(fit$design)
}
fitted.cpm <- 2^fitted.values
fitted.count <- 1e-6 * t(t(fitted.cpm)*(lib.size+1))
fitted.logcount <- log2(fitted.count)
# Apply trend to individual observations to get voom weights
w <- 1/f(fitted.logcount)^4
dim(w) <- dim(fitted.logcount)
# Add voom weights to prior weights
if(PriorWeights)
weights <- w * prior.weights
else
weights <- w
# Estimate sample weights?
if(SampleWeights) {
if(AnyZeroRows) {
sw <- arrayWeights(yNAfull,design,weights=weights,var.design=var.design,var.group=var.group)
} else {
sw <- arrayWeights(y,design,weights=weights,var.design=var.design,var.group=var.group)
}
message("First sample weights (min/max) ", paste(format(range(sw)),collapse="/") )
if(Block) weights <- t(sw * t(weights))
}
# Estimate correlation?
if(Block) {
if(AnyZeroRows) {
dc <- suppressWarnings(duplicateCorrelation(yNAfull,design,block=block,weights=weights))
} else {
dc <- suppressWarnings(duplicateCorrelation(y,design,block=block,weights=weights))
}
correlation <- dc$consensus.correlation
if(is.na(correlation)) correlation <- 0
message("First intra-block correlation ",format(correlation))
} else {
correlation <- NULL
}
# Seond iteration to refine intra-block correlation or sample weights
if(Block || SampleWeights) {
# Rerun voom weights with new correlation and sample weights
if(SampleWeights)
weights <- asMatrixWeights(sw,dim(y))
else
weights <- prior.weights
fit <- lmFit(y,design,block=block,correlation=correlation,weights=weights)
if(AnyZeroRows) {
fitNA <- suppressWarnings(lmFit(yNAshort,design,block=block,correlation=correlation,weights=weights[RowHasZero,,drop=FALSE]))
fit$df.residual[RowHasZero] <- fitNA$df.residual
fit$sigma[RowHasZero] <- fitNA$sigma
}
sy <- sqrt(fit$sigma[HasRep])
if(AnyZeroRows)
l <- weightedLowess(sx,sy,span=span,weights=fit$df.residual[HasRep],output.style="lowess")
else
l <- lowess(sx,sy,f=span)
if(plot) {
lines(l,col="red")
legend("topright",lty=c(2,1),col="red",legend=c("First","Final"))
}
f <- approxfun(l, rule=2, ties=list("ordered",mean))
if(fit$rank < ncol(design)) {
j <- fit$pivot[1:fit$rank]
fitted.values <- fit$coefficients[,j,drop=FALSE] %*% t(fit$design[,j,drop=FALSE])
} else {
fitted.values <- fit$coefficients %*% t(fit$design)
}
fitted.cpm <- 2^fitted.values
fitted.count <- 1e-6 * t(t(fitted.cpm)*(lib.size+1))
fitted.logcount <- log2(fitted.count)
w <- 1/f(fitted.logcount)^4
dim(w) <- dim(fitted.logcount)
if(PriorWeights)
weights <- w * prior.weights
else
weights <- w
if(SampleWeights) {
if(AnyZeroRows) {
sw <- arrayWeights(yNAfull,design,weights=weights,var.design=var.design,var.group=var.group)
} else {
sw <- arrayWeights(y,design,weights=weights,var.design=var.design,var.group=var.group)
}
message("Final sample weights (min/max) ", paste(format(range(sw)),collapse="/") )
weights <- t(sw * t(weights))
}
if(Block) {
if(AnyZeroRows) {
dc <- suppressWarnings(duplicateCorrelation(yNAfull,design,block=block,weights=weights))
} else {
dc <- suppressWarnings(duplicateCorrelation(y,design,block=block,weights=weights))
}
correlation <- dc$consensus.correlation
if(is.na(correlation)) correlation <- 0
message("Final intra-block correlation ",format(correlation))
}
}
# Final linear model fit with voom weights
fit <- lmFit(y,design,block=block,correlation=correlation,weights=weights)
if(is.null(fit$Amean)) fit$Amean <- Amean
if(AnyZeroRows) {
fitNA <- suppressWarnings(lmFit(yNAshort,design,block=block,correlation=correlation,weights=weights[RowHasZero,,drop=FALSE]))
fit$df.residual[RowHasZero] <- fitNA$df.residual
fit$sigma[RowHasZero] <- fitNA$sigma
}
# Output
fit$genes <- out$genes
fit$targets <- out$targets
if(is.null(fit$targets)) {
fit$targets <- data.frame(lib.size=lib.size)
row.names(fit$targets) <- colnames(y)
}
if(SampleWeights) fit$targets$sample.weight <- sw
if(save.plot) {
fit$voom.xy <- list(x=sx,y=sy,xlab="log2( count size + 0.5 )",ylab="Sqrt( standard deviation )")
fit$voom.line <- l
}
if(keep.EList) {
fit$EList <- new("EList",list(E=y,weights=weights,genes=out$genes))
}
fit
}
|