1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
|
#################################################################
## FUNCTIONS FOR FALSE DISCOVERY RATE (FDR) ESTIMATION ##
## USING SEMI-PARAMETRIC EM-LIKE ALGORITHMS ##
## D. CHAUVEAU ##
## mixtools 1.0 addition ##
#################################################################
#################################################################
## EM-like algorithm for a nonparametric univariate mixture model
## - Component 1 known = N(0,1)
## = the pdf of a probit transform of pvalue under H0
## - component 2 symmetric shifted from a location parameter mu
## NB: stochastic=TRUE not implemented, parameter removed here
spEMsymlocN01 <- function(x, mu0=2, bw = bw.nrd0(x), h=bw, eps = 1e-8,
maxiter=100, verbose = FALSE, plotf=FALSE){
bw <- h # h is alternative bandwidth argument, for backward compatibility
n <- length(x)
# if (length(mu0)>1) m <- length(mu0) else m <- mu0
m <- 2 # fixed number of components in this model
z.hat <- matrix(0, nrow=n, ncol=m)
fkernel <- matrix(0, nrow=n, ncol=m)
tt0 <- proc.time()
o <- order(x) # for plotting fhat's if requested
kmeans <- kmeans(x, mu0) # is this a good init for probit data?
for(j in 1:m) {
z.hat[kmeans$cluster==j, j] <- 1
}
iter <- 0
finished <- FALSE
lambda <- matrix(0,maxiter,m)
mu <- rep(0,maxiter) # only component 2 mean used in this case
while (!finished) {
#while (max(abs(change)) > eps & iter < maxiter) {
iter <- iter + 1
t0 <- proc.time()
## M-Step
lambda[iter,] <- colMeans(z.hat)
# mu[iter,] <- apply(sweep(z.hat, 1, x, "*"), 2, mean)/lambda[iter,]
mu[iter] <- sum(x*z.hat[,2])/(n*lambda[iter,2]) #
## second component density estimation step evaluated at x_i-mu's
ans <- .C(C_KDEsymloc1comp, n=as.integer(n),
mu=as.double(mu[iter]), lbd2=as.double(lambda[iter,2]),
x=as.double(x), bw=as.double(bw),
z=as.double(z.hat), f = double(n))
#add PACKAGE="mixtools" option when integrated
# successive plots of fhat's (for debugging mostly)
if (plotf) {
if (iter==1) plotfunc <- plot else plotfunc <- lines
plotfunc(x[o],ans$f[o],type="l", col=iter)
}
# version lambda_j f_j(xi) specific for component one = N(0,1)
# NB: this is the only place where the known component pdf is used
lambda.f <- cbind(lambda[iter,1]*dnorm(x), lambda[iter,2]*ans$f)
## E-step (for next iteration)
z.hat <- lambda.f/rowSums(lambda.f)
finished <- iter >= maxiter
if (iter>1) { # This convergence criterion is too simplistic:
change <- c(lambda[iter,] - lambda[iter-1,], mu[iter]-mu[iter-1])
finished <- finished | (max(abs(change)) < eps)
}
if (verbose) {
t1 <- proc.time()
cat("iteration ", iter, " lambda ", round(lambda[iter,], 4),
" mu ", round(mu[iter], 4))
cat(" time", (t1 - t0)[3], "\n")
}
}
if (verbose) {
tt1 <- proc.time()
cat("lambda ", round(lambda[iter,], 4))
cat(", total time", (tt1 - tt0)[3], "s\n")
}
return(structure(list(data=x, posteriors=z.hat, lambda=lambda[1:iter,],
bandwidth=bw, lambdahat=lambda[iter,],
mu = mu[1:iter], muhat = mu[iter], symmetric=TRUE),
class="spEMN01"))
}
#######################################################
# plot mixture pdf for the semiparametric mixture model
# with component 1 pdf passed in the knownpdf parameter
# uses a weighted kernel density estimate of the nonparametric component 2
# a = object of class "spEMN01" as returned by spEMsymlocNorm
plot.spEMN01 <- function(x, bw=x$bandwidth, knownpdf=dnorm, add.plot=FALSE, ...) {
t <- seq(min(x$data), max(x$data), len=200)
f1 <- x$lambdahat[1]*knownpdf(t)
f2 <- x$lambdahat[2]*wkde(x$data-x$muhat, u=t-x$muhat, w=x$post[,2], bw=bw, sym=TRUE)
f <- f1+f2
if (!add.plot) plot(t,f1+f2, type="l", ...) else lines(t,f1+f2, ...)
lines(t,f1, col=2); lines(t,f2, col=3)
}
####################################################
# plot and compare FDR for 1 or 2 EM-like strategies
# for mixtools 1.0
# post NEEDS to be sorted by p
plotFDR <- function(post1, post2=NULL, lg1="FDR 1", lg2=NULL, title=NULL,
compH0=1, alpha=0.1, complete.data =NULL, pctfdr=0.3) {
n <- dim(post1)[1]
cs1 <- cumsum(post1[,compH0]) # local FDR(p_i)'s
fdr1 <- cs1/(1:n) # FDR(p_i)'s
if (is.null(title)) title <- paste("FDR estimate(s), n=",n)
if (!is.null(post2)) {
cs2 <- cumsum(post2[,compH0]) # local FDR(p_i)'s
fdr2 <- cs2/(1:n)
if (is.null(lg2)) lg2 <- "FDR 2"
}
i1 <- sum(fdr1<pctfdr) # plot up to pctfdr % FDR
if (i1 == 0) i1 <- n # for very bad fit, fdr[1] > pctfdr
# cat("index",i1)
plot(fdr1[1:i1], type="l", main=title, col=1, ylim=c(0,fdr1[i1]),
xlab="index", ylab="probability")
if (!is.null(post2)) lines(fdr2[1:i1], col=2)
abline(alpha, 0, lty=3)
if (!is.null(complete.data)) { # true complete data available
V <- cumsum(complete.data[,1]==1) # cumulative nb of items under H0
trueFDR <- V/(1:n)
lines(trueFDR[1:i1], lty=2, col=3)
if (!is.null(post2))
legend("topleft", c(lg1,lg2,"True FDR"),col=1:3, lty=c(1,1,2))
if (is.null(post2))
legend("topleft", c(lg1,"True FDR"),col=c(1,3), lty=c(1,2))
} else {
if (!is.null(post2))
legend("topleft", c(lg1,lg2), col=1:2, lty=c(1,1))
if (is.null(post2))
legend("topleft", lg1, col=1, lty=1)
}
}
|