1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194
|
#
# Copyright 2007-2018 by the individuals mentioned in the source code history
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script (by Rob K.) demonstrates how to conduct a generalized-estimating-equations analysis in OpenMx.
# The response variables are overdispersed counts, assessed longitudinally over four waves of assessment. The time
# variable is wave of assessment (0, 1, 2, and 3), and there are three time-invariant covariates. Robust standard
# errors for the regression coefficients are obtained via the "sandwich estimator."
# GEE is a "semi-parametric" regression method. A GEE model consists of three things: (1) a link function,
# which relates the conditional mean of the response variable to a linear composite of the regressors;
# (2) a variance function, which relates the residual variance to the conditional mean; and (3) a correlation
# structure that approximates the residual dependence among clustered/repeated observations. Usually, GEE is
# intended for inference about the fixed effects (i.e., the regression coefficients) when the random effects
# (variances and correlations) are not of interest. GEE does not rely upon any particular distributional
# assumptions
require(OpenMx)
#Load and inspect data:
data(LongitudinalOverdispersedCounts)
str(longData)
str(wideData)
#Let's try ordinary least-squares (OLS) regression:
olsmod <- lm(y~tiem+x1+x2+x3, data=longData)
#We will see in the diagnostic plots that the residuals are poorly approximated by normality, and are
#heteroskedastic. We also know that the residuals are not independent of one another, because we have
#repeated-measures data:
plot(olsmod)
#In the summary, it looks like all of the regression coefficients are significantly different from zero, but we
#know that because the assumptions of OLS regression are violated that we should not trust its results:
summary(olsmod)
#Let's try a generalized linear model (GLM). We'll use the quasi-Poisson quasilikelihood function to see
#how well the y variable is approximated by a Poisson distribution (conditional on time and covariates):
glm.mod <- glm(y~tiem+x1+x2+x3, data=longData, family="quasipoisson")
#The estimate of the dispersion parameter should be about 1.0 if the data are conditionally Poisson.
#We can see that it is actually greater than 2, indicating overdispersion:
summary(glm.mod) #<--We can get good start values for the regression coefficients from here.
#Start MxModel: ###
#MxMatrix to contain time-invariant covariates, as defintion variables:
Xdef <- mxMatrix(type="Full",nrow=1, ncol=3, free=F, labels=c("data.x1","data.x2","data.x3"), name="Xdef")
#MxMatrix to contain times (assessment waves), as definition variables:
timeVec <- mxMatrix(type="Full",nrow=4,ncol=1,free=F,labels=c("data.t0","data.t1","data.t2","data.t3"),
name="timeVec")
#Note that the time variable is NA for participants who are missing their y score for that assessment.
#OpenMx will work around the NA's on the endogenous variable, y, but it does not tolerate NA's among the
#definition variables. We need to set the NA's on the time variable to a "pseudo-missing" value--something
#extreme that will throw off the results to alert us that the invalid definition-variable score was used.
wideData[,"t1"][is.na(wideData[,"y1"])] <- -999
wideData[,"t2"][is.na(wideData[,"y2"])] <- -999
wideData[,"t3"][is.na(wideData[,"y3"])] <- -999
#We're now ready to make our mydata object:
mydat <- mxData(observed = wideData, type="raw")
#a unit MxMatrix:
U <- mxMatrix(type="Unit",nrow=4,ncol=1,name="U")
#an MxMatrix of regression coefficients:
Beta <- mxMatrix(type="Full",nrow=5,ncol=1,free=T,
values=c(0.5,0.3,0.2,-0.1,0.8), #<--Start values from the GLM above
labels=c("intrcpt","betax1","betax2","betax3","betatime"),
name="Beta")
if(mxOption(NULL,"Default optimizer")=="SLSQP"){
Beta$lbound[1:5,1] <- -20
Beta$ubound[1:5,1] <- 20
}
#This MxMatrix contains the dispersion parameter. In our case, the conditional residual variance will be
#modeled as the dispersion parameter times the conditional mean of the response variable.
Disper <- mxMatrix(type="Full",nrow=1,ncol=1,free=T,values=2,lbound=0.0001,labels="dispersion",name="Disper")
#This MxMatrix will serve as the residual correlation matrix. We will use an unstructured correlation matrix,
#with reasonable box constraints on the parameters:
R <- mxMatrix(type="Stand",nrow=4,ncol=4,free=T,values=0.1,#labels="r",
labels=c("r12","r13","r14","r23","r24","r34"),
lbound=0,ubound=0.9999,name="R")
#For each participant, this MxAlgebra puts the regression constant, the time-invariant covariates, and the time
#variable together in a "long-format" matrix:
rowX <- mxAlgebra(cbind(U,(U%*%Xdef),timeVec), name="rowX")
#This MxAlgebra yields the conditional mean response, given the covariates and wave of assessment.
#Notice that we are using a loglinear link: the conditional mean equals an exponentiated linear composite of
#regressors, or equivalently, the log of the conditional mean equals a linear composite of predictors:
yhatAlg <- mxAlgebra( exp(t(rowX%*%Beta)), name="yhat")
#Vector of conditional standard deviations:
S <- mxAlgebra(vec2diag(sqrt(yhat%x%Disper)), name="S")
#Conditional covariance matrix:
V <- mxAlgebra(S %*% R %*% S, name="V")
#We will use the multivariate-normal fitfunction and expectation out of mere convenience, and not because we
#actually think multivariate normality reasonably describes the data:
expec <- mxExpectationNormal(covariance="V", means="yhat", dimnames=c("y0","y1","y2","y3"))
fitfunc <- mxFitFunctionML()
#Put everything together into our MxModel:
mygeemodel <- mxModel(
"GEE",
Beta,Disper,expec,fitfunc,mydat,R,rowX,S,timeVec,U,V,Xdef,yhatAlg
)
#We do not actually want OpenMx's Hessian and standard errors, because they are based on a likelihood
#(multivariate normal) that we do not actually think is realistic. Instead, we will later obtain standard
#errors from the "sandwich estimator":
mygeemodel <- mxOption(mygeemodel,"Calculate Hessian","No")
mygeemodel <- mxOption(mygeemodel,"Standard Errors","No")
#Run our model:
mygeerun <- mxRun(mygeemodel)
#Model summary:
summary(mygeerun)
#OpenMx developer tests--compare results to those from package 'gee': ###
#Regression coefficients:
omxCheckCloseEnough(
mygeerun$output$estimate[c("intrcpt","betax1","betax2","betax3","betatime")],
c(0.5084808,0.3078977,0.1914246,-0.1113108,0.8389867),
epsilon=0.05)
#Dispersion parameter:
omxCheckCloseEnough(mygeerun$output$estimate["dispersion"],2.178853,epsilon=0.1)
#Correlations:
omxCheckCloseEnough(
mygeerun$output$estimate[c("r12","r13","r14","r23","r24","r34")],
c(0.3226068,0.2133836,0.1502026,0.1834196,0.1224658,0.05675176),
epsilon=0.05)
#Now, we obtain the sandwich estimator of the repeated-sampling covariance matrix of the regression coefficients:
Bread <- matrix(0,5,5)#<--The "bread" of the sandwich.
Meat <- matrix(0,5,5)#<--The "meat" of the sandwich
#We loop through the 1000 participants, and calculate the cumulative sums of the contributions of each to the
#Bread and Meat:
for(i in 1:nrow(wideData)){
ycurr <- mydat$observed[i,5:8] #<--response observations (y variables) for row i.
pres <- which(!is.na(ycurr))#<--A "presence vector," indicating which y's are non-missing in row i.
ycurr <- matrix(ycurr[pres],nrow=1) #<--Filter out any NAs among the y's.
yhatcurr <- matrix(mxEval(yhat,mygeerun,T,defvar.row=i)[,pres],nrow=1)#<--Contional mean for row i.
rowXcurr <- mxEval(rowX,mygeerun,T,defvar.row=i)[pres,]#<--value of MxAlgebra 'rowX' for row i.
#D is the derivative of the conditional mean 'yhat' with respect to the regression coefficients. It has
#the form it has here because we are using a loglinear link function. If we were using a different link
#function, it would have a different form:
D <- vec2diag(yhatcurr) %*% rowXcurr
Vinvcurr <- solve(mxEval(V,mygeerun,T,defvar.row=i)[pres,pres]) #<--Inverse covariance matrix for row i.
Bread <- Bread + t(D)%*%Vinvcurr%*%D
Meat <- Meat + t(D)%*%Vinvcurr%*%t(ycurr-yhatcurr)%*%(ycurr-yhatcurr)%*%Vinvcurr%*%D
}
#Finally, the sandwich:
( sammich <- solve(Bread)%*%Meat%*%solve(Bread) )
dimnames(sammich) <- list(c("intrcpt","betax1","betax2","betax3","betatime"),
c("intrcpt","betax1","betax2","betax3","betatime"))
#Regression coefficients with standard errors:
cbind(b=mygeerun$output$estimate[c("intrcpt","betax1","betax2","betax3","betatime")],SE=sqrt(diag(sammich)))
#Again, we do not actually think the multivariate-normal likelihood is actually reasonable for the data, so we
#do not use it for a likelihood-ratio test. Intead, we'll do a Wald chi-square test, for the significance
#of the non-intercept regression coefficients:
nonIntrcptLabels <- c("betax1","betax2","betax3","betatime")
Waldchi2 <- mygeerun$output$estimate[nonIntrcptLabels] %*% solve(sammich[nonIntrcptLabels,nonIntrcptLabels]) %*%
mygeerun$output$estimate[nonIntrcptLabels]
pchisq(Waldchi2,df=4,lower.tail=F)#<--p-value
#SEs of regression coefficients:
omxCheckCloseEnough(
sqrt(diag(sammich)),
c(0.026086291,0.008078779,0.008583959,0.008247869,0.009411659),
epsilon=0.005)
#Test imxRobustSE:
mygeemodel2 <- mxOption(mygeemodel,"Calculate Hessian","Yes")
mygeemodel2 <- mxOption(mygeemodel2,"Standard Errors","Yes")
mygeerun2 <- mxRun(mygeemodel2)
rse <- imxRobustSE(mygeerun2)
omxCheckCloseEnough(sqrt(diag(sammich)),rse[1:5],epsilon=0.005)
|