File: calibration.Snw

package info (click to toggle)
r-cran-sampling 2.10-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,336 kB
  • sloc: ansic: 21; makefile: 2
file content (398 lines) | stat: -rw-r--r-- 15,852 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
\documentclass[a4paper]{article}
\usepackage{pdfpages}
%\VignetteIndexEntry{calibration and adjustment for nonresponse}
%\VignettePackage{sampling}
\newcommand{\sampling}{{\tt sampling}}
\newcommand{\R}{{\tt R}}
\setlength{\parindent}{0in}
\setlength{\parskip}{.1in}
\setlength{\textwidth}{140mm}
\setlength{\oddsidemargin}{10mm}
\title{Calibration and generalized calibration}
\author{}
\usepackage{Sweave} 
\usepackage[latin1]{inputenc}
\usepackage{amsmath}


\begin{document}
\maketitle

<<echo=FALSE, results=hide>>=
library(sampling)
ps.options(pointsize=12)
options(width=60)
@

\section{Example 1} 

Example of using the \verb@calib@ function for calibration and nonresponse adjustment (with response homogeneity groups). 

@
\noindent
We create the following population data frame (the population size is $N=250$): 
\begin{itemize}
\item there are four variables: \verb@state@, \verb@region@, \verb@income@ and \verb@sex@; 
\item the \verb@state@ variable has 2 categories: 'A' and 'B'; the \verb@region@ variable has 3 categories: 1, 2, 3 (regions within states);
\item the \verb@income@ and \verb@sex@ variables are randomly generated using the uniform distribution.
\end{itemize}

<<calib1>>=
data = rbind(matrix(rep("A", 150), 150, 1, byrow = TRUE), 
matrix(rep("B", 100), 100, 1, byrow = TRUE))
data = cbind.data.frame(data, c(rep(1, 60), rep(2,50), rep(3, 60), rep(1, 40), rep(2, 40)),
 1000 * runif(250))
sex = runif(nrow(data))
for (i in 1:length(sex)) if (sex[i] < 0.3) sex[i] = 1 else sex[i] = 2
data = cbind.data.frame(data, sex)
names(data) = c("state", "region", "income", "sex")
summary(data)
@
\noindent
We compute the population stratum sizes:

<<calib2>>=
table(data$state)
@

We select a stratified sample. The \verb@state@ variable is used as a stratification variable.
The sample stratum sizes are 25 and 20, respectively. The method is 'srswor' (equal probability, without replacement).

<<calib3, results=hide>>=
s=strata(data,c("state"),size=c(25,20), method="srswor")
@

We obtain the observed data:

<<calib31, results=hide>>=
s=getdata(data,s)
@ 

The \verb@status@ variable is used in the \verb@rhg_strata@ function.
The \verb@status@ column is added to $s$ (1 - sample respondent, 0 otherwise); it is randomly generated using the uniform distribution U(0,1). 
The response probability for all units is 0.3.

<<calib32, results=hide>>=
status=runif(nrow(s))
for(i in 1:length(status))
 if(status[i]<0.3) status[i]=0 else status[i]=1
s=cbind.data.frame(s,status) 
@

We compute the response homeogeneity groups using the \verb@region@ variable:

<<calib4, results=hide>>=
s=rhg_strata(s,selection="region")
@

We select only the sample respondents:

<<calib5, results=hide>>=
sr=s[s$status==1,]
@

We create the population data frame of sex and region indicators:
<<calib6, results=hide>>=
X=cbind(disjunctive(data$sex),disjunctive(data$region))
@

We compute the population totals for each sex and region:
<<calib7, results=hide>>=
total=c(t(rep(1,nrow(data)))%*%X)
@ 

The first method consists in calibrating with all strata. 
The respondent data frame of \verb@sex@ and \verb@region@ indicators is created.
The initial weights using the inclusion prob. and the response probabilities are computed.

<<calib8>>=
Xs = X[sr$ID_unit,]
d = 1/(sr$Prob * sr$prob_resp)
summary(d)
@

We compute the g-weights using the linear method:
<<calib9>>=
g = calib(Xs, d, total, method = "linear")
summary(g)
@

The final weights are:
<<calib10>>=
w=d*g
summary(w)
@

We check the calibration:
<<calib11>>=
checkcalibration(Xs, d, total, g)
@


The second method consists in calibrating in each stratum. 
The respondent data frame of \verb@sex@ and \verb@region@ indicators is created in each stratum.
The initial weights using the inclusion prob. and response probabilities are computed in each stratum.

<<calib12>>=
cat("stratum 1\n")
data1=data[data$state=='A',]
X1=X[data$state=='A',]
total1=c(t(rep(1, nrow(data1))) %*% X1)
sr1=sr[sr$Stratum==1,]
Xs1=X[sr1$ID_unit,]
d1 = 1/(sr1$Prob * sr1$prob_resp)
g1=calib(Xs1, d1, total1, method = "linear")
checkcalibration(Xs1, d1, total1, g1)
cat("stratum 2\n")
data2=data[data$state=='B',]
X2=X[data$state=='B',]
total2=c(t(rep(1, nrow(data2))) %*% X2)
sr2=sr[sr$Stratum==2,]
Xs2=X[sr2$ID_unit,]
d2 = 1/(sr2$Prob * sr2$prob_resp)
g2=calib(Xs2, d2, total2, method = "linear")
checkcalibration(Xs2, d2, total2, g2)
@
 
<<eval=FALSE, echo=FALSE>>=
<<calib1>>
<<calib2>>
<<calib3>>
<<calib31>>
<<calib32>>
<<calib4>>
<<calib5>>
<<calib6>>
<<calib7>>
<<calib8>>
<<calib9>>
<<calib10>>
<<calib11>>
<<calib12>>



sampling.newpage()

@

\section{Example 2} 
This is an example for 
\begin{itemize}
\item variance estimation of the calibration estimator (using the \verb@calibev@ and \verb@varest@ functions),  
\item variance estimator of the Horvitz-Thompson estimator (using the \verb@varest@ and \verb@varHT@ functions).
\end{itemize}

We generate an artificial population and use Till\'e sampling. The population size is 100, and the sample size is 20. 
There are three auxiliary variables (two categorical and one continuous; the matrix $X$). 
The vector $Z=(150, 151, \dots, 249)'$ is used to compute the first-order inclusion probabilities. The variable of interest $Y$ is computed using
the model $Y_j=5*Z_j*(\varepsilon_j+\sum_{i=1}^{100} X_{ij}), \varepsilon_j\sim N(0,1/3), iid, j=1,\dots, 100.$ The calibration estimator uses the 
linear method. Simulations are conducted to estimate the MSE of the two variance estimators of the calibration estimator.  
Since the linear method is used in calibration, the calibration estimator corresponds to the generalized regression estimator. For the latter an approximate variance can be computed on the population level
and used in the bias estimation of the variance estimators. For the Horvitz-Thompson estimator, the variance can be computed on the population level and compared with the simulations' result. 
Use 10000 simulation runs to obtain accurate results (for time consuming reason, in the following program, the number of runs is only 10).

<<ex1>>=
X=cbind(c(rep(1,50),rep(0,50)),c(rep(0,50),rep(1,50)),1:100)
# vector of population totals
total=apply(X,2,"sum")
Z=150:249 
# the variable of interest
Y=5*Z*(rnorm(100,0,sqrt(1/3))+apply(X,1,"sum"))
# inclusion probabilities
pik=inclusionprobabilities(Z,20)
# joint inclusion probabilities 
pikl=UPtillepi2(pik)
# number of runs; let nsim=10000 for an accurate result
nsim=10
c1=c2=c3=c4=c5=c6=numeric(nsim)
for(i in 1:nsim)
{
# draws a sample
s=UPtille(pik)
# computes the inclusion prob. for the sample
piks=pik[s==1]
# the sample matrix of auxiliary information
Xs=X[s==1,]
# computes the g-weights
g=calib(Xs,d=1/piks,total,method="linear")
# computes the variable of interest in the sample
Ys=Y[s==1]
# computes the joint inclusion prob. for the sample
pikls=pikl[s==1,s==1]
# computes the calibration estimator and its variance estimation
cc=calibev(Ys,Xs,total,pikls,d=1/piks,g,with=FALSE,EPS=1e-6)
c1[i]=cc$calest
c2[i]=cc$evar
# computes the variance estimator of the calibration estimator (second method)
c3[i]=varest(Ys,Xs,pik=piks,w=g/piks)
# computes the variance estimator of the HT estimator using varest()
c4[i]=varest(Ys,pik=piks) 
# computes the variance estimator of the HT estimator using varHT()
c5[i]=varHT(Ys,pikls,2) 
# computes the Horvitz-Thompson estimator
c6[i]=HTestimator(Ys,piks)
}
cat("the population total:",sum(Y),"\n")
cat("the calibration estimator under simulations:", mean(c1),"\n")
N=length(Y)
delta=matrix(0,N,N)
for(k in 1:(N-1))
  for(l in (k+1):N)
   delta[k,l]=delta[l,k]=pikl[k,l]-pik[k]*pik[l]
diag(delta)=pik*(1-pik)
var_HT=0
var_asym=0
e=lm(Y~X)$resid
for(k in 1:N)
  for(l in 1:N) {var_HT=var_HT+Y[k]*Y[l]*delta[k,l]/(pik[k]*pik[l])
                 var_asym=var_asym+e[k]*e[l]*delta[k,l]/(pik[k]*pik[l])} 
cat("the approximate variance of the calibration estimator:",var_asym,"\n")
cat("first variance estimator of the calibration est. using calibev function:\n")
cat("MSE of the first variance estimator:", var(c2)+(mean(c2)-var_asym)^2,"\n")
cat("second variance estimator of the calibration est. using varest function:\n")
cat("MSE of the second variance estimator:", var(c3)+(mean(c3)-var_asym)^2,"\n")
cat("the Horvitz-Thompson estimator under simulations:", mean(c6),"\n")
cat("the variance of the HT estimator:", var_HT, "\n")
cat("the variance estimator of the HT estimator under simulations:", mean(c4),"\n")
cat("MSE of the variance estimator 1 of HT estimator:", var(c4)+(mean(c4)-var_HT)^2,"\n")
cat("MSE of the variance estimator 2 of HT estimator:", var(c5)+(mean(c5)-var_HT)^2,"\n")
@


<<eval=FALSE, echo=FALSE>>=
<<ex1>>

sampling.newpage()

@



\section{Example 3} 
This is an example of generalized calibration used to handle unit nonresponse with different forms of response probabilities. 


Consider the population $U$, the sample $s$ and the set of respondents $r$ with $r\subseteq s \subseteq U.$ 

The response mechanism
is given by the distribution $q(r|s)$ such that for every fixed
$s$ we have
$$q(r|s)\geq 0, \mbox{ for all } r\in \mathcal{R}_s \mbox{ and } \sum_{s\in {\mathcal R}_s} q(r|s)=1,$$
where ${\mathcal R}_s=\{r | r \subseteq s\}.$ The variable of
interest $y_k$ is known only for $k\in r.$ Under unit
nonresponse we define the response indicator $R_k=1$ if unit $k\in
r$ and 0 otherwise and the response probabilities $p_k=Pr(R_k=1| k\in s).$ It is assumed
that $R_k$ are independent Bernoulli variables with expected value
equal to $p_k.$  We assume that the units respond
independently of each other and of $s$ and so
$$q(r|s)=\prod_{k\in r} p_k \prod_{k \in \bar{r}} (1-p_k).$$

The nonresponse model can be rewritten as
$$q(r|s, \boldsymbol{\gamma})=\prod_{k\in r} F_k^{-1}(\boldsymbol{\gamma}) \prod_{k
\in \bar{r}} (1-F^{-1}_k(\boldsymbol{\gamma})).$$ In calibration method it is assumed that
$$\sum_{k\in r} \mathbf{x}_kd_kF_k(\boldsymbol{\gamma})=\sum_{k\in r} \mathbf{x}_kd_kF(\boldsymbol{\gamma}^T\mathbf{x}_k)=\sum_{k\in U} \mathbf{x}_k,$$
where $F_k(\boldsymbol{\gamma})=F(\boldsymbol{\gamma}^T\mathbf{x}_k), p_k=F_k(\boldsymbol{\gamma})^{-1},$ and $d_k$ are the initial weigths.

In generalized calibration a different equation is used
$$\sum_{k\in r} \mathbf{x}_kd_kF(\boldsymbol{\gamma}^T\mathbf{z}_k)=\sum_{k\in U}    
\mathbf{x}_k,$$ where $\mathbf{z}_k$ is not necessary equal to                       
$\mathbf{x}_k,$ but $\mathbf{z}_k$ and $\mathbf{x}_k$ have to be                     
highly correlated. $\mathbf{z}_k$ should be known only                    
for $k\in r.$ The components of $\mathbf{z}_k$                                               
that are not also components of $\mathbf{x}_k$ are often known as                    
\emph{instrumental variables}. Let $w_k$ be the final weights (obtained after applying generalized calibration). 

It is possible to assume different forms of response probabilities:
\begin{itemize}
\item Linear weight adjustment (it can be implemented by using the argument \texttt{method="linear"} in gencalib() function or \texttt{method="truncated"} if bounds are allowed): $p_k=1/(1+
{\boldsymbol\gamma}^T\mathbf{z}_k)$ and $w_k=d_k(1+\mathbf{h}^T\mathbf{z}_k),$ where $\mathbf{h}$ is a consistent estimate of ${\boldsymbol\gamma}.$ \item Raking weight
adjustment (it can be implemented by using the argument \texttt{method="raking"} in gencalib()): $p_k=1/\exp(\boldsymbol{\gamma}^T\mathbf{z}_k)$ and $w_k=d_k \exp(\mathbf{h}^T\mathbf{z}_k).$ \item Logistic
weight adjustment (it can be implemented by using the argument \texttt{method="raking"} in gencalib()): $p_k=1/(1+\exp(\boldsymbol{\gamma}^T\mathbf{z}_k)), w_k=d_k (1+\exp(\mathbf{h}^T\mathbf{z}_k)),$
 but we calibrate
on $\sum_{k\in U} \mathbf{x}_k-\sum_{k\in r} \mathbf{x}_k d_k$
instead of $\sum_{k\in U} \mathbf{x}_k.$\item Generalized
exponential weight adjustment (Folsom and Singh, 2000; it can be implemented by using the argument \texttt{method="logit"} in gencalib()):
$$p_k=1/F(\boldsymbol{\gamma}^T\mathbf{z}_k), w_k=d_kF(\mathbf{h}^T\mathbf{z}_k),$$ $$F(\mathbf{h}^T\mathbf{z}_k)=\frac{L(U-C)+U(C-L)\exp(A\mathbf{h}^T\mathbf{z}_k)}{(U-C)+(C-L)\exp(A\mathbf{h}^T\mathbf{z}_k)}\in (L, U),$$
where $A=(U-L)/((C-L)(U-C))$ and $L\geq 0,1<U\leq \infty, U>C>L,$
($C=1$ in the paper of Deville and Sarndal, 1992). The g-weights are centered around of $C.$ For $L=1, C=2$
and $U=\infty, F(\mathbf{h}^T\mathbf{z}_k)$ approaches
$1+\exp(\mathbf{h}^T\mathbf{z}_k)$ and for $C=1, L=0, U=\infty,$
$\exp(\mathbf{h}^T\mathbf{z}_k).$
\end{itemize}

We exemplify the last form of response probabilities (generalized
exponential weight adjustment) using artificial data. We generate a population of size $N=400$ and consider the auxiliary information $X$ following a Gamma distribution
with parameters 3 and 4. The instrumental variable $Z$ is generated using the model $Z=2X+\varepsilon,$ where $\varepsilon\sim U(0,1).$ The variable of interest is
$Y$ generated using the model $Y=3X+\varepsilon_1,$ where $\varepsilon_1\sim N(0,1).$ We consider here that the nonresponse is not missing at random and the response
probabilities $p$ depend on the variable of interest $y$ which may be missing. We draw a simple random sampling without replecement of size $n=100$ and generate the set of respondents $r$ using
Poisson sampling with the probabilties $p.$ The bounds are fixed to 1 and 5, and the constant $C=1.5.$ Three estimators are computed: 
\begin{itemize}
\item the generalized calibration estimator using $Z$ as instrumental variable,
\item the generalized calibration estimator using $Y$ as instrumental variable,
\item the generalized calibration estimator using $X$ as instrumental variable, which is the same with the calibration estimator, but the g-weights are centered around $C$.
\end{itemize}
The convergence of the method is not guaranteed due to the bounds. Thus $g1, g2, g3$ can be null. If it the case, repeat the code (considering another $s$ and $r$).

<<gen1>>=
N=400
n=100
X=rgamma(N,3,4)
total=sum(X)
Z=2*X+runif(N)
Y=3*X+rnorm(N)
print(cor(X,Y))
print(cor(X,Z))
L=1 
U=5
C=1.5  
A=(U-L)/((C-L)*(U-C))
p=((U-C)+(C-L)*exp(A*Y*0.3))/(L*(U-C)+U*(C-L)*exp(A*Y*0.3))
summary(p)
bounds=c(L,U)
s=srswor(n,N)
r=numeric(n)
for(j in 1:n) if(runif(1)<p[s==1][j]) r[j]=1  
print("Size of r is:")
nr=sum(r)
print(nr)
Xr=X[s==1][r==1]
Yr=Y[s==1][r==1]
Zr=Z[s==1][r==1]
pikr=rep(n/N,times=nr)
d=1/(pikr)
g1=gencalib(Xr,Zr,d,total,method="logit",bounds=bounds,C=C)
g2=gencalib(Xr,Yr,d,total,method="logit",bounds=bounds,C=C)
g3=gencalib(Xr,Xr,d,total,method="logit",bounds=bounds,C=C)
if(is.null(g1))
print("g1 is null") else
if(checkcalibration(Xr,d,total,g1)$result)
{print("the gen.calibration estimator using Zs as instrumental variable")
print(sum(Yr*g1*d))
}
if(is.null(g2))
print("g2 is null") else
if(checkcalibration(Xr,d,total,g2)$result)
{
print("the gen.calibration estimator using Ys as instrumental variable")
print(sum(Yr*g2*d))
}
if(is.null(g3))
print("g3 is null") else
if(checkcalibration(Xr,d,total,g3)$result)
{
print("the calibration estimator")
print(sum(Yr*g3*d))
}
cat("The population total is:", sum(Y),"\n")
@                                
                                 
                                
<<eval=FALSE, echo=FALSE>>=      
<<gen1>>                          
                                 
sampling.newpage()               
                                 
@                                

\end{document}