1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212
|
######################################
## code for sample size simulations ##
## Nicholas Reich
## August 2010
######################################
## this is the central user function
## N = overall sample size
## med = median of log normal distribution
## disp = dispersion of log normal distribution
## percentile = what percentile(s) should be simulated
## nsim = how many simulations to run
## exact data = T/F should simulation be done with exact data
## pct.type.A = percent type A data, will be rounded up to nearest integer
## exp.win.dat = a vector of exposure window lengths to sample from
## verb = whether to print 10 iteration counts during the course of the sim
##' @name precision.simulation
##' @aliases precision.simulation.exact
##' @aliases precision.simulation.coarse
##' @aliases generate.coarse.data
##'
##' @title Simulate incubation period analyses with coarse data
##'
##' @description These functions simulate coarse incubation period data sets and
##' analyze them. The goal is for these simulations to provide evidence for
##' how much information a given dataset contains about a characteristic of
##' the incubation period distribution.
##'
##' @param N Overall sample size for the datasets to be simulated.
##' @param med Median for the assumed log normal distribution of the incubation
##' periods.
##' @param disp Dispersion for the assumed log normal distribution of the
##' incubation periods.
##' @param percentile Percentile of the incubation period distribution which we
##' want to estimate.
##' @param nsim Number of datasets to analyze in the simulation.
##' @param exact.data Either TRUE/FALSE. Incidates whether the data generated
##' should be coarsened at all. If TRUE, pct.type.A and exp.win.dat are
##' ignored.
##' @param pct.type.A Percent of the N observations that are assumed to be type
##' A data. If N*pct.type.A is not an integer, it will be rounded to the
##' nearest integer.
##' @param exp.win.dat A vector of exposure window lengths. Defaults to the
##' observed window lengths from Lessler et al. (see below).
##' @param verb If TRUE, a message with the system time and iteration number
##' will be printed ten times during the simulation run.
##'
##'
##' @rdname precision.simulation
##' @return The \code{precision.simulation} functions return a matrix with four
##' columns and nsim rows. The "ests" column gives the estimated percentiles
##' for the incubation period distribution. The "SE" column gives the
##' standard error for the estimate. The "conv" column is 1 if the doubly
##' interval-censored likelihood maximization converged. Otherwise, it is 0.
##' The "bias" column gives the estimated percentile - true percentile. The
##' \code{generate.coarse.data} function returns a matrix with data suitable
##' for analysis by the \code{dic.fit} function.
##' @export
precision.simulation <- function(N,
med=2,
disp=1.3,
percentile=.5,
nsim=100,
exact.data=FALSE,
pct.type.A=.5,
exp.win.dat=NULL,
verb=FALSE) {
## logic check
if(percentile <= 0 | percentile >=1)
stop("percentile must be between 0 and 1.")
if(pct.type.A < 0 | pct.type.A >1)
stop("% of data that is type A must be between 0 and 1.")
if(is.null(exp.win.dat)){
if(verb) message("NYC exposure window data used")
exp.win.dat <- get(data(exp.win.lengths, envir = environment()))
}
## TODO: add default exp.win.dat to package and load if NULL
if(exact.data) {
out <- precision.simulation.exact(N=N,
med=med,
disp=disp,
percentile=percentile,
nsim=nsim,
verb=verb)
} else {
out <- precision.simulation.coarse(N=N,
med=med,
disp=disp,
percentile=percentile,
nsim=nsim,
pct.type.A=pct.type.A,
exp.win.dat=exp.win.dat,
verb=verb)
}
target <- qlnorm(percentile, log(med), log(disp))
bias <- out[,"ests"]-target
out <- cbind(out, bias)
return(out)
}
##' @rdname precision.simulation
##' @export
precision.simulation.exact <- function(N,
med,
disp,
percentile,
nsim,
verb) {
storage <- matrix(NA, ncol=3, nrow=nsim)
colnames(storage) <- c("ests", "SE", "conv")
data <- matrix(0, ncol=6, nrow=nsim*N)
colnames(data) <- c("dataset.id", "EL", "ER", "SL", "SR", "type")
data[,"dataset.id"] <- rep(1:nsim, each=N)
data[,"SL"] <- rlnorm(N*nsim, meanlog=log(med), sdlog=log(disp))
data[,"SR"] <- data[,"SL"]
data[,"type"] <- 2
for(i in 1:nsim){
tmp.dat <- data[which(data[,"dataset.id"]==i),]
tmp.fit <- dic.fit(tmp.dat, ptiles=percentile)
if(tmp.fit$conv==1){
row.name <- paste("p", round(percentile*100), sep="")
which.row <-
which(rownames(tmp.fit$ests)==row.name)[1]
storage[i,c("ests", "SE")] <-
tmp.fit$ests[which.row, c("est", "StdErr")]
} else {
storage[i,c("ests", "SE")] <- NA
}
storage[i,"conv"] <- tmp.fit$conv
if(verb & i%%(round(nsim/10))==0)
print(paste("iteration",i,"complete ::", Sys.time()))
}
return(storage)
}
##' @rdname precision.simulation
##' @export
precision.simulation.coarse <- function(N,
med,
disp,
percentile,
nsim,
pct.type.A,
exp.win.dat,
verb) {
## create storage
storage <- matrix(NA, ncol=3, nrow=nsim)
colnames(storage) <- c("ests", "SE", "conv")
for(i in 1:nsim){
tmp.dat <- generate.coarse.data(N=N,
med=med,
disp=disp,
pct.type.A=pct.type.A,
exp.win.dat=exp.win.dat)
tmp.fit <- dic.fit(tmp.dat, ptiles=percentile)
if(tmp.fit$conv==1){
row.name <- paste("p", round(percentile*100), sep="")
which.row <-
which(rownames(tmp.fit$ests)==row.name)[1]
storage[i,c("ests", "SE")] <-
tmp.fit$ests[which.row, c("est", "StdErr")]
} else {
storage[i,c("ests", "SE")] <- NA
}
storage[i,"conv"] <- tmp.fit$conv
if(verb & i%%(round(nsim/10))==0)
print(paste("iteration",i,"complete ::", Sys.time()))
}
return(storage)
}
##' @rdname precision.simulation
##' @export
generate.coarse.data <- function(N, med, disp, pct.type.A, exp.win.dat) {
n.type.A <- round(N*pct.type.A)
n.type.B <- N-n.type.A
E <- runif(N, 10, 11)
T <- rlnorm(N, log(med), log(disp))
S <- T + E
SR <- ceiling(S)
SL <- floor(S)
## generate window types
## 0 = short with bounded ER = type A
## 1 = "long" with no ER = type B
win.type <- rep(0:1, times=c(n.type.A, n.type.B))
## generate window lengths,
potential.lengths <- sample(exp.win.dat, size=N, replace=TRUE)
win.length <- 1*(win.type==0) + potential.lengths*(win.type>0)
## fix data
ER <- ceiling(E)*(win.type==0) + SR*(win.type==1)
EL <- pmin(ER-win.length, floor(E))
## return the data
cbind(EL, E, ER, SL, S, SR, win.length, win.type, type=0)
}
|