File: CFR_estimation.R

package info (click to toggle)
r-cran-coarsedatatools 0.6-6%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 328 kB
  • sloc: makefile: 2
file content (416 lines) | stat: -rw-r--r-- 15,536 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
### function defining the EM algorithm for the CFR estimates

##' A function to estimate the relative case fatality ratio when reporting rates
##' are time-varying and deaths are lagged because of survival time.
##' 
##' This function implements an EM algorithm to estimate the relative case
##' fatality ratio between two groups when reporting rates are time-varying and
##' deaths are lagged because of survival time.
##' @usage EMforCFR(assumed.nu, alpha.start.values, full.data, max.iter = 50, 
##'   verb = FALSE, tol = 1e-10, SEM.var = TRUE)
##'   
##' @param assumed.nu  a vector of probabilities corresponding to the survival
##'   distribution, i.e. nu[i]=Pr(surviving i days | fatal case)
##' @param alpha.start.values a vector starting values for the reporting rate
##'   parameter of the GLM model. This must have length which corresponds to one
##'   less than the number of unique integer values of full.dat[,"new.times"].
##' @param full.data  A matrix of observed data. See description below.
##' @param max.iter The maximum number of iterations for the EM algorithm and
##'   the accompanying SEM algorithm (if used).
##' @param verb An indicator for whether the function should print results as it
##'   runs.
##' @param tol A tolerance to use to test for convergence of the EM algorithm.
##' @param SEM.var If TRUE, the SEM algorithm will be run in addition to the EM
##'   algorithm to calculate the variance of the parameter estimates.
##'   
##' @details The data matrix full.data must have the following columns: 
##'   \describe{ \item{grp}{a 1 or a 2 indicating which of the two groups, j, 
##'   the observation is for.} \item{new.times}{an integer value representing
##'   the time, t, of observation.} \item{R}{the count of recovered cases with
##'   onset at time t in group j.} \item{D}{the count of deaths which occurred at
##'   time t in groupo j (note that these deaths did not have disease onset at
##'   time t but rather died at time t).} \item{N}{the total cases at t, j, or
##'   the sum of R and D columns.} }
##'   
##' @return A list with the following elements \describe{ \item{naive.rel.cfr
##'   }{the naive estimate of the relative case fatality ratio} 
##'   \item{glm.rel.cfr }{the reporting-rate-adjusted estimate of the relative
##'   case fatality ratio} \item{EM.rel.cfr }{the lag-adjusted estimate of the
##'   relative case fatality ratio} \item{EM.re.cfr.var }{the variance for the
##'   log-scale lag-adjusted estimator taken from the final M-step} 
##'   \item{EM.rel.cfr.var.SEM }{ the Supplemented EM algorithm variance for the
##'   log-scale lag-adjusted estimator} \item{EM.rel.cfr.chain }{a vector of the
##'   EM algorithm iterates of the lag-adjusted relative CFR estimates} 
##'   \item{EMiter}{the number of iterations needed for the EM algorithm to
##'   converge} \item{EMconv}{indicator for convergence of the EM algorithm.  0
##'   indicates all parameters converged within max.iter iterations.  1 indicates
##'   that the estimate of the relative case fatality ratio converged but other
##'   did not.  2 indicates that the relative case fatality ratio did not
##'   converge.} \item{SEMconv}{indicator for convergence of SEM algorithm. 
##'   Same scheme as EMconv.} \item{ests}{ the coefficient estimates for the
##'   model } \item{ests.chain}{ a matrix with all of the coefficient estimates,
##'   at each EM iteration} \item{DM}{the DM matrix from the SEM algorithm} 
##'   \item{DMiter}{a vector showing how many iterations it took for the
##'   variance component to converge in the SEM algorithm} }
##' @examples        
##'     ## This is code from the CFR vignette provided in the documentation.
##'        
##' data(simulated.outbreak.deaths)
##' min.cases <- 10 
##' N.1 <- simulated.outbreak.deaths[1:60, "N"] 
##' N.2 <- simulated.outbreak.deaths[61:120, "N"] 
##' first.t <- min(which(N.1 > min.cases & N.2 > min.cases)) 
##' last.t <- max(which(N.1 > min.cases & N.2 > min.cases)) 
##' idx.for.Estep <- first.t:last.t 
##' new.times <- 1:length(idx.for.Estep) 
##' simulated.outbreak.deaths <- cbind(simulated.outbreak.deaths, new.times = NA) 
##' simulated.outbreak.deaths[c(idx.for.Estep, idx.for.Estep + 60), "new.times"] <- rep(new.times, + 2)
##' assumed.nu = c(0, 0.3, 0.4, 0.3)
##' alpha.start <- rep(0, 22)
##'        
##' ## caution! this next line may take several minutes (5-10, depanding on 
##' ##    the speed of your machine) to run.
##' \dontrun{cfr.ests <- EMforCFR(assumed.nu = assumed.nu, 
##'                               alpha.start.values = alpha.start, 
##'                               full.data = simulated.outbreak.deaths, 
##'                               verb = FALSE, 
##'                               SEM.var = TRUE, 
##'                               max.iter = 500, 
##'                               tol = 1e-05)}
##' @keywords coarse data
##' @keywords incomplete data
##' @keywords case fatality ratio
##' @keywords infectious disease
##' @export
EMforCFR <- function(assumed.nu, alpha.start.values, full.data,
		     max.iter=50, verb=FALSE, tol=1e-10, SEM.var=TRUE){
	## full.data is the data output from the observe.epidemic() function
	##    with a column specifying the rows to be used for analysis

	#######################################
	## calculate naive and GLM estimates ##
	#######################################

	D.1 <- sum(full.data[full.data[,"grp"]==1&!is.na(full.data[,"new.times"]),"D"])
	D.2 <- sum(full.data[full.data[,"grp"]==2&!is.na(full.data[,"new.times"]),"D"])
	N.1 <- sum(full.data[full.data[,"grp"]==1&!is.na(full.data[,"new.times"]),"N"])
	N.2 <- sum(full.data[full.data[,"grp"]==2&!is.na(full.data[,"new.times"]),"N"])
	naive.rel.cfr <- sum(D.2)/sum(N.2)/(sum(D.1)/sum(N.1))

	dat <- data.frame(full.data)
	unadj.glm.fit <- glm(dat[,"D"] ~ factor(dat[,"new.times"]) + factor(dat[,"grp"]),
			     offset=log(dat[,"N"]), family=poisson)
	glm.rel.cfr <- exp(coef(unadj.glm.fit))[length(coef(unadj.glm.fit))]

	#########################
	## set up EM algorithm ##
	#########################
	proposed.rel.cfr.chain <- rep(0, max.iter)
	n.params <- max(dat[,"new.times"], na.rm=TRUE)+1
	phi.chain <- matrix(nrow=n.params, ncol=max.iter)
	nlag <- length(assumed.nu)

	## convergence codes:
	## 0 = all parameters converge
	## 1 = rCFR converges, but some other parameters do not
	## 2 = rCFR does not converge
	EMconv <- SEMconv <- 0

	## looping variables
	eps <- 1
	j <- 0
	alpha <- alpha.start.values
	while(eps>tol){
		j <- j+1
		############
		## E STEP ##
		############
		dat <- run.Estep(alpha,full.data=full.data,
				 nlag=nlag, assumed.nu=assumed.nu)

		############
		## M STEP ##
		############
		maxLik <- run.Mstep(dat)
		phi.chain[,j] <- maxLik$phi
		alpha <- phi.chain[2:(n.params-1),j]
		proposed.rel.cfr.chain[j] <- exp(phi.chain[n.params,j])

		## check convergence
		if(j>1)	eps <- max((phi.chain[,j]-phi.chain[,j-1])^2)
		if(j==max.iter) {
			## message("*** WARNING: EM agorithm didn't converge ***")
			if((phi.chain[n.params,j]-phi.chain[n.params,j-1])^2 < eps){
				EMconv <- 1
			} else {
				EMconv <- 2
			}
			break
		}
	}

	var.joint <- maxLik$Var

	phi <- phi.chain[,j]
	proposed.rel.cfr <- proposed.rel.cfr.chain[j]
	EMiter <- j
	if(verb) {
		print(paste("naive estimator =", round(naive.rel.cfr, 3)))
		print(paste("GLM estimate unadjusted for lags =", round(glm.rel.cfr, 3)))
		print(paste(EMiter, "iterations for EM convergence"))
	}

	if(SEM.var){
		DM.out <- SEM.variance(full.data=full.data, dat, phi, max.iter, tol, nlag=nlag, alpha.start.values, assumed.nu=assumed.nu)
		DM <- DM.out$DM
		DMiter <- DM.out$DMiter
		if(length(DM.out$loop.idx)>0) {
			loop.idx <- DM.out$loop.idx
			var.joint <- var.joint[-loop.idx,-loop.idx]
		}
		## set SEM convergence codes
		if(any(DMiter==0)) {
			if(DMiter[length(DMiter)]==0) {
				SEMconv <- 2
			} else { SEMconv <- 1 }

		}

		## calculate CFR variance if it converged
		if(SEMconv<2){
			variance.SEM <- var.joint +
				var.joint %*% DM %*% solve(diag(1, ncol(DM))-DM)
			proposed.rel.cfr.var.SEM <- variance.SEM[nrow(DM), nrow(DM)]
			if(proposed.rel.cfr.var.SEM<0) {
				proposed.rel.cfr.var.SEM <- NA
				SEMconv <- 2
			}
		} else { proposed.rel.cfr.var.SEM <- NA }

		if(verb){
			print("Estimate with SEM")
			print(paste("proposed CFR =", round(proposed.rel.cfr, 3),
				    "; 95% CI (",
				    round(exp(log(proposed.rel.cfr)
					      -2*sqrt(proposed.rel.cfr.var.SEM)),3),
				    ",",
				    round(exp(log(proposed.rel.cfr)
					      +2*sqrt(proposed.rel.cfr.var.SEM)),3),
				    ")"))
			if(any(DM.out$DMiter==0)) {
				print("non-convergent DM indices:")
				print(which(DM.out$DMiter==0))
			}
		}
	} else {
		SEMconv <- NA
		DM <- NULL
		DMiter <- NULL
		proposed.rel.cfr.var.SEM <- NULL
	}

	## ##############
	## OUTPUT LIST ##
	## ##############
	out <- list(naive.rel.cfr=naive.rel.cfr,
		    glm.rel.cfr=glm.rel.cfr,
		    EM.rel.cfr=proposed.rel.cfr,
		    EM.rel.cfr.var=var.joint[nrow(var.joint), nrow(var.joint)],
		    EM.rel.cfr.var.SEM = proposed.rel.cfr.var.SEM,
		    EM.rel.cfr.chain=proposed.rel.cfr.chain,
		    EMiter=EMiter,
		    EMconv=EMconv,
		    SEMconv=SEMconv,
		    ests=phi,
		    ests.chain.EM=phi.chain,
		    DM=DM,
		    DMiter=DMiter)

	if(verb) {
		print("Estimate with just GLM variance")
		print(paste("proposed CFR =", round(proposed.rel.cfr, 3),
			    "; 95% CI (",
			    round(exp(log(proposed.rel.cfr)
				      -2*sqrt(out$EM.rel.cfr.var)),3),
			    ",",
			    round(exp(log(proposed.rel.cfr)
				      +2*sqrt(out$EM.rel.cfr.var)),3),
			    ")"))
	}

	return(out)

}


## This function is meant to be run only through the function EMforCFR() 
##   and is used to calculate the variance via the Supplemented EM 
##   algorithm (see Meng and Rubin, 1991)

## params:
##   \item{full.data}{ A matrix of observed data. See description in EMforCFR helpfile.}
##   \item{dat}{A data frame.}
##   \item{phi}{ A vector of fitted parameters from the final EM iteration.}
##   \item{max.iter}{ The maximum number of iterations for SEM algorithm. }
##   \item{tol}{ A tolerance to use to test for convergence of the EM algorithm. }
##   \item{nlag}{ The number of time units for lagged data.  Corresponds to length(assumed.nu).}  
##   \item{alpha.start.values}{ a vector starting values for the reporting rate parameter of the GLM model. This must have length which corresponds to one less than the number of unique integer values of full.dat[,"new.times"].}
##   \item{assumed.nu}{ a vector of probabilities corresponding to the survival distribution, i.e. nu[i]=Pr(surviving i days | fatal case) }

## returned list:
##    \item{DM}{The estimate of the variance-covariance matrix for the model parameters.  Only converged rows are returned.}
##    \item{DMiter}{A vector whose ith entry is the number of iterations needed for convergence of the ith row of the DM matrix.}
##    \item{loop.idx}{If not NULL, the values correspond to the original indices of DM which have been omitted because of lack of convergence.}
SEM.variance <- function(full.data, dat, phi, max.iter, tol, nlag,
			 alpha.start.values, assumed.nu){
	## algorithm parameters
	eps <- 1
	iter <- 0
	phi.hat <- phi ## using phi from notation in Bayesian Data Analysis (Gelman, p323)
	phi0 <- c(1, alpha.start.values, 1) ## the starting values for the parameters: c(beta0, alpha2-T, gamma2)
	n.params <- length(phi0)

	## sequence of matrices which will converge to DM
	Rt <- array(dim=c(n.params, n.params, max.iter))
	DM <- matrix(nrow=n.params, ncol=n.params)
	DMiter <- rep(0, n.params)

	## sequence of coefficient estimates
	phi.t <- matrix(nrow=n.params, ncol=max.iter+1)
	alpha.t <- alpha.start.values
	phi.t[,1] <- phi0
	loop.idx <- 1:n.params
	while(length(loop.idx)>0) {
		iter <- iter + 1

		##THIS EM CALCULATES PHI^(T+1)
		## E step
		dat <- run.Estep(alpha.t, nlag=nlag,
				 full.data=full.data, assumed.nu=assumed.nu)
		## M step
		phi.t[,iter+1] <- run.Mstep(dat)$phi

		## generate Rt given phi.t
		for(i in loop.idx){
			## defining phi.tmp here as phi^t(i) from Gelman
			phi.t.i <- phi.hat
			phi.t.i[i] <- phi.t[i,iter]
			alpha.t.i <- phi.t.i[2:(n.params-1)]

			## E step
			dat <- run.Estep(alpha.t.i, nlag=nlag,
					 full.data=full.data, assumed.nu=assumed.nu)
			## M step
			phi.tplus1.i <- run.Mstep(dat)$phi

			## fis the ith row of the current Rt matrix
			Rt[i,,iter] <- (phi.tplus1.i-phi.hat)/(phi.t[i,iter]-phi.hat[i])
			if(iter==1) next
			if(all((Rt[i,,iter]-Rt[i,,iter-1])^2 < sqrt(tol))){
				loop.idx <- loop.idx[-which(loop.idx==i)]
				DM[i,] <- Rt[i,,iter]
				DMiter[i] <- iter
			}

		}

		## renew alpha.t
		alpha.tplus1 <- phi.t[2:(n.params-1), iter+1]
		alpha.t <- alpha.tplus1

		if(iter == max.iter) break("maximum iterations reached")

	}

	if(length(loop.idx)>0)	DM <- DM[-loop.idx, -loop.idx]

	DM.out <- list(DM=DM, DMiter=DMiter, loop.idx=loop.idx)
	
	return(DM.out)
}


## E-step of the EM algorithm.

# \arguments{
#  \item{alpha}{Current estimates of the alpha parameters from the GLM model.}
#  \item{full.data}{ A matrix of observed data. See description in EMforCFR helpfile.}
#  \item{nlag}{ The number of time units for lagged data.  Corresponds to length(assumed.nu).}  
#  \item{assumed.nu}{ a vector of probabilities corresponding to the survival distribution, i.e. nu[i]=Pr(surviving i days | fatal case) }
# }

# \value{ A data matrix with the same format as full.data from the EMforCFR() documentation.
# }

run.Estep <- function(alpha, full.data, nlag, assumed.nu){

	## storage for reconstructed deaths
	n.times <- nrow(full.data)/2
	idx.to.reconstruct <- which(full.data[,"grp"]==1 &!is.na(full.data[,"new.times"]))
	times.to.reconstruct <- full.data[idx.to.reconstruct,"time"]
	Dhat.1 <- Dhat.2 <- rep(0, n.times)

	## define the data
	N.1 <- full.data[1:n.times,"N"]
	N.2 <- full.data[(n.times+1):(2*n.times),"N"]
	D.1 <- full.data[1:n.times,"D"]
	D.2 <- full.data[(n.times+1):(2*n.times),"D"]

	## set the alpha coefficients
	alpha.long <- rep(0, n.times)
	alpha.long[times.to.reconstruct[-1]] <- alpha
	alpha.long[(max(times.to.reconstruct)+1):n.times] <- alpha[length(alpha)]

	## ########
	## loops for calculating expected value of the deaths

	for(t in times.to.reconstruct) {
		denom.1 <- denom.2 <- rep(0, nlag)
		for(i in 1:nlag){
			N.idx <- (t+i-1):(t+i-nlag)
			nu.idx <- 1:length(N.idx)
			## group 1
			denom.1[i] <- sum(assumed.nu[nu.idx]*N.1[N.idx]*exp(alpha.long[N.idx]))
			## group 2
			denom.2[i] <- sum(assumed.nu[nu.idx]*N.2[N.idx]*exp(alpha.long[N.idx]))
		}

		## exclude 0s from denominator
		denom.1[denom.1==0] <- NA
		denom.2[denom.2==0] <- NA

		## sum them up
		D.idx <- (t+1):(t+nlag)
		Dhat.1[t] <- N.1[t] * exp(alpha.long[t]) * sum(D.1[D.idx] * assumed.nu[nu.idx]/denom.1, na.rm=TRUE)
		Dhat.2[t] <- N.2[t] * exp(alpha.long[t]) * sum(D.2[D.idx] * assumed.nu[nu.idx]/denom.2, na.rm=TRUE)
	}

	dat <- cbind(Dhat=c(Dhat.1, Dhat.2),
		     new.times=full.data[,"new.times"],
		     grp=full.data[,"grp"],
		     N=full.data[,"N"])
	return(dat)
}


## the M-step

# \arguments{
#  \item{dat}{data matrix passed from EMforCFR().}
# }

# \value{ A list with two components
#         \item{phi }{fitted vector of parameters}
#         \item{Var }{variance-covariance matrix from the fitted model}
# }

run.Mstep <- function(dat){
	subset <- which(dat[,"N"]>0)
	fit <- glm(dat[,"Dhat"] ~ factor(dat[,"new.times"]) + factor(dat[,"grp"]),
		   offset=log(dat[,"N"]),family=poisson, subset=subset)

	phi <- coef(fit)

	out <- list(phi=phi, Var=vcov(fit))
	return(out)
}