1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
|
## Bias-reduced Poisson and multinomial log-linear models
## using maximum penalized likelihood as in Firth (1993) Biometrika.
##
## Author: David Firth, d.firth@warwick.ac.uk, 2 Sep 2010
##
## NOT A POLISHED PIECE OF PUBLIC-USE SOFTWARE! Provided "as is",
## licensed under GPL2. NO WARRANTY OF FITNESS FOR ANY PURPOSE!
##
## This code still has lots of rough edges, some of which are
## indicated by the embedded comments.
##
## The intention is to include a more polished implementation as part
## of the CRAN package "brglm", in due course.
## Arguments are as for a Poisson log-linear glm,
## with one new argument:
## -- "fixed.totals", a factor indexing groups of observations
## whose response totals are fixed (for example, this might be
## the rows of a table).
## If fixed.totals is NULL (the default), pure Poisson sampling (with
## no totals fixed) is assumed.
brpr <- function(formula,
data,
subset,
na.action,
offset = NULL,
control = glm.control(...),
contrasts = NULL,
fixed.totals = NULL,
weights = NULL,
...) {
## The "weights" and "offset" arguments here can only be NULL -- anything
## else is an error (for now, at least)
if (!is.null(weights)) stop("The weights argument here can only be NULL")
if (!is.null(offset)) stop("The offset argument here can only be NULL")
## work needed here, to allow the use of offset as an argument?
## Initial setup as in glm (except that `prior weights' are not used here,
## and an offset (if any) must be specified through the formula rather
## than (as is also allowed by glm) as a separate argument:
call <- match.call()
theFormula <- formula
if (missing(data))
data <- environment(formula)
if (is.null(call$epsilon)) control$epsilon <- 1e-8
## Is 1e-8 too stringent? Work needed here!
fixed.totals <- eval(substitute(fixed.totals), envir = data)
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data", "subset", "na.action",
"offset", "fixed.totals"), names(mf), 0)
mf <- mf[c(1, m)]
mf$drop.unused.levels <- TRUE
mf[[1]] <- as.name("model.frame")
mf <- eval(mf, parent.frame())
mt <- attr(mf, "terms")
Y <- model.response(mf, "numeric")
## Correct the fixed.totals variable for subsetting, NA treatment, etc.
if (!is.null(fixed.totals)) {
fixed.totals <- mf$"(fixed.totals)"
if (!is.factor(fixed.totals)) stop("fixed.totals must be a factor")
rowTotals <- as.vector(tapply(Y, fixed.totals, sum))[fixed.totals]
}
## Do an initial fit of the model, with constant-adjusted counts
adj <- 0.5
X <- model.matrix(formula, data = mf)
offset <- model.offset(mf)
nFixed <- if (is.null(fixed.totals)) 0 else nlevels(fixed.totals)
mf$y.adj <- Y + adj * (ncol(X) - nFixed)/nrow(X)
formula <- update(formula, y.adj ~ .)
op <- options(warn = -1)
fit <- glm.fit(X, mf$y.adj, family = poisson(), offset = offset,
control = glm.control(maxit = 1))
options(op)
## Update the model iteratively, refining the adjustment at each iteration
criterion <- 1
iter <- 1
coefs <- coef(fit)
Xwork <- X[ , !is.na(coefs), drop = FALSE]
muAdj <- fitted(fit)
coefs <- na.omit(coefs)
while (criterion > control$epsilon && iter < control$maxit) {
iter <- iter + 1
if (!is.null(fixed.totals)){
muTotals <- as.vector(tapply(muAdj, fixed.totals,
sum))[fixed.totals]
mu <- muAdj * rowTotals / muTotals
} else { ## case fixed.totals is NULL
mu <- muAdj
}
W.X <- sqrt(mu) * Xwork
XWXinv <- chol2inv(chol(crossprod(W.X)))
coef.se <- sqrt(diag(XWXinv))
h <- diag(Xwork %*% XWXinv %*% t(mu * Xwork))
mf$y.adj <- Y + h * adj
z <- log(muAdj) + (mf$y.adj - muAdj)/muAdj
lsfit <- lm.wfit(Xwork, z, muAdj, offset = offset)
criterion <- max((abs(lsfit$coefficients - coefs))/coef.se)
if (control$trace) cat("Iteration ", iter,
": largest (scaled) coefficient change is ",
criterion, "\n", sep = "")
coefs <- lsfit$coefficients
muAdj <- exp(lsfit$fitted.values)
}
## The object `lsfit' uses *adjusted* counts, fitted values, etc. --
## so we need to correct various things before returning (so that, for example,
## deviance and standard errors are correct).
fit$coefficients[!is.na(fit$coefficients)] <- coefs
fit$y <- Y
fit$fitted.values <- fit$weights <- mu
fit$linear.predictors <- log(mu)
dev.resids <- poisson()$dev.resids
wt <- fit$prior.weights
fit$deviance <- sum(dev.resids(Y, mu, wt))
fit$null.deviance <- sum(dev.resids(Y, mean(Y), wt))
fit$aic <- NA # sum((poisson()$aic)(Y, 1, mu, wt, 0)) ?? work needed!
fit$residuals <- Y/mu - 1
## Next bit is straight from glm -- not sure it applies here (work needed!)
if (any(offset) && attr(mt, "intercept") > 0) {
fit$null.deviance <- glm.fit(x = X[, "(Intercept)", drop = FALSE],
y = Y, weights = rep(1, nrow(mf)),
offset = offset, family = poisson(),
intercept = TRUE)$deviance
}
fit$iter <- iter
fit$model <- mf
fit$na.action <- attr(mf, "na.action")
fit$x <- X
fit$qr <- qr(sqrt(mu) * X)
fit$R <- qr.R(fit$qr, complete = TRUE)
## The "effects" component of the fit object is almost certainly not
## correct as is -- but where does it actually get used? (work needed!)
## rownames(fit$R) <- colnames(fit$R) ## FIX THIS!
fit <- c(fit, list(call = call, formula = theFormula, terms = mt,
data = data, offset = offset, method = "brpr",
contrasts = attr(X, "contrasts"),
xlevels = .getXlevels(mt, mf)))
class(fit) <- c("glm", "lm")
return(fit)
}
|