# File: ad.test.combined.R

ad.test.combined <-
function (..., data = NULL, method = c("asymptotic","simulated","exact"),
	dist = FALSE, Nsim = 10000) 
{
#############################################################################
# This function ad.test.combined combines several Anderson-Darling 
# K-sample test statistics AD.m, m = 1,...,M, into one overall test 
# statistic AD.combined as suggested in Section 8 of Scholz F.W. and 
# Stephens M.A. (1987), K-sample Anderson-Darling Tests,
# Journal of the American Statistical Association, Vol 82, No. 399, 
# pp. 918-924.
# See also the documentation of ad.test for the comparison of a single 
# set of K samples.
# Each application of the Anderson-Darling K-sample test can be 
# based on a different K > 1. This combined version tests the hypothesis 
# that all the hypotheses underlying the individual K-sample tests are 
# true simultaneously.
# The individual K-sample test hypothesis is that all samples from 
# the m-th group come from a common population. However, that population 
# may be different from one individual K-sample situation to the next. 
# Such a combined test is useful in 
#
# 1) examining intra-laboratory measurement equivalence, when samples from 
#    the same material or batch are compared for M laboratories and such
#    comparisons are made for samples from several different materials or
#    batches and one assessment over all materials/batches is desired.
#
# 2) analyzing treatment effects in randomized complete or incomplete 
#    block designs.
#
# When there are NA's among the sample values they are removed,
# with a warning message indicating the number of NA's.
# It is up to the user to judge whether such removals make sense.
#
# Input: ...  
#        can take the form of several lists, 
#        say L.1,...,L.M, where list L.i contains 
#        K.i sample vectors of respective sizes n.i[1], ..., n.i[K.i] 
#        (n.i[j] > 4 is recommended)
#
#        or a single list of such lists
#
#        or a data frame with 3 columns, the first column representing 
#        the responses y, the second column a factor g whose levels are 
#        used to indicate the samples within each block, and 
#        the third column a factor b indicating the block
#
#        or a formula y ~ g | b with y, g, b as in the previous situation,
#        where y, g, b may be variable names in a provided data frame, say dat,
#        supplied via data = dat,
#
#        or just the three vectors y, g, b in this order with the same meaning.
#
# data 
#       an optional data frame containing the variables used in formula input;
#       default is NULL, in which case the variables must exist in the calling
#       environment.
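#
#       As an illustration of the formula form (a sketch for orientation only;
#       dat, y, g, b are hypothetical names, not objects created in this file):
#       dat <- data.frame(y = rnorm(22),
#                         g = factor(rep(c(1,2,1,2), c(5,6,6,5))),
#                         b = factor(rep(c(1,2), c(11,11))))
#       ad.test.combined(y ~ g | b, data = dat, method = "simulated", Nsim = 1000)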
#
# method 
#       can take one of three values, "asymptotic", "simulated",
#       or "exact", which determines the mode of P-value calculation.
#       The asymptotic P-value is always returned.
#       The simulated P-value is obtained by randomly splitting the pooled
#       samples in the i-th list L.i into K.i samples of sizes 
#       n.i[1], ..., n.i[K.i], computing the corresponding AD statistic AD.i 
#       (both versions), doing this independently for i = 1,...,M and adding 
#       the AD.i to get the combined statistic AD.comb. This is repeated Nsim 
#       times and the simulated P-value is the proportion of these 
#       values that are >= the observed combined value.
#       The exact P-value should only be attempted for small M and small
#       sample sizes and requires that Nsim be set to >= the total number
#       of AD.comb enumerations. Otherwise Nsim simulations are run
#       to get a simulated P-value, as described above.
#       As example consider: M=2 with K.1 = 2, n.1[1] = 5, n.1[2] = 6,
#		K.2 = 2, n.2[1] = 5, n.2[2] = 7, then we would have
#       choose(11,5) = 462 splits of the first set of pooled samples
#       and choose(12,5) = 792 splits of the second set of pooled samples
#       and thus 462 * 792 = 365904 combinations of AD.1+AD.2 = AD.comb.
#       Thus we would need to set Nsim >= 365904 to enable exact
#       enumeration evaluation of the P-value. Since these enumerated
# 		values of AD.comb need to be held inside R in a single vector,
#  		we are limited by the object size in R. In simulations the length
#       of the simulated vector of AD.comb is only Nsim and is manageable.
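#       (For the example above the feasibility bound can be computed directly
#       in R: choose(11,5) * choose(12,5) evaluates to 365904, the smallest
#       Nsim for which method = "exact" is honored.)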
#
# dist
#       takes values FALSE (default) or TRUE, where TRUE enables the 
#		return of the simulated or exact vectors of generated values
#       for both versions of AD.comb.
#       Otherwise NULL is returned for both versions
#
# Nsim  = 10000 (default), number of simulations as discussed above.
#       
#       
#
# An example:
# x1 <- c(1, 3, 2, 5, 7), x2 <- c(2, 8, 1, 6, 9, 4), and 
# x3 <- c(12,  5,  7,  9, 11)
# y1 <- c(51, 43, 31, 53, 21, 75) and y2 <- c(23, 45, 61, 17, 60)
# then 
# set.seed(2627)
# ad.test.combined(list(x1,x2,x3),list(y1,y2),method="simulated",Nsim=100000) 
# or equivalently ad.test.combined(list(list(x1,x2,x3),list(y1,y2)),
#	method="simulated",Nsim=100000)
# produces the outcome below.
##########################################################################
# 
# Combination of Anderson-Darling K-Sample Tests.
# 
# Number of data sets = 2 
# 
# Sample sizes within each data set:
# Data set 1 :  5 6 5
# Data set 2 :  6 5
# Total sample size per data set: 16 11 
# Number of unique values per data set: 11 11 
# 
# AD.i = Anderson-Darling Criterion for i-th data set
# Means: 2 1 
# Standard deviations: 0.92837 0.64816 
# 
# T.i = (AD.i - mean.i)/sigma.i
#  
# Null Hypothesis:
# All samples within a data set come from a common distribution.
# The common distribution may change between data sets.
# 
# Based on Nsim = 1e+05 simulations
# for data set 1 we get
#               AD   T.AD  asympt. P-value  sim. P-value
# version 1: 3.316 1.4176         0.088063       0.09868
# version 2: 3.510 1.6286         0.070278       0.09115
# 
# for data set 2 we get
#                 AD     T.AD  asympt. P-value  sim. P-value
# version 1: 0.37267 -0.96786          0.96305       0.94529
# version 2: 0.33300 -1.02930          0.98520       0.93668
# 
#
# Combined Anderson-Darling Criterion: AD.comb = AD.1+AD.2 
# Mean = 3    Standard deviation = 1.13225 
# 
# T.comb = (AD.comb - mean)/sigma
# 
# Based on Nsim = 1e+05 simulations
#            AD.comb    T.comb  asympt. P-value  sim. P-value
# version 1: 3.68867 0.6082333        0.2205062       0.23733
# version 2: 3.84300 0.7445375        0.1902867       0.21825
#
#
###############################################################################
# For out.ad.combined <- ad.test.combined(list(x1,x2,x3),list(y1,y2))
# or out.ad.combined <- ad.test.combined(list(list(x1,x2,x3),list(y1,y2)))
# we get the object out.ad.combined of class kSamples with the following 
# components
# > names(out.ad.combined)
#  [1] "test.name"  "M"          "n.samples"  "nt"         "n.ties"    
#  [6] "ad.list"    "mu"         "sig"        "ad.c"       "mu.c"      
# [11] "sig.c"      "warning"    "null.dist1" "null.dist2" "method"    
# [16] "Nsim" 
# where 
# test.name = "Anderson-Darling"
# M = number of sets of samples being compared
# n.samples = a list of the vectors giving the sample sizes for each 
#             set of samples being compared
# nt = vector of total sample sizes involved in each of the M comparisons
# n.ties = vector of length M giving the number of ties in each comparison group
# ad.list = list of the M data frames of test results, one for each 
#           of the M blocks (data sets)
#
# mu = vector of means of the M AD statistics
# sig = vector of standard deviations of the M AD statistics
# ad.c = 2 x 3 (or 2 x 4) matrix containing the AD statistics, 
#        the standardized AD statistics, their asymptotic P-values 
#        (and their exact or simulated P-values), for version 1 in the first 
#        row and for version 2 in the second row.
# mu.c = mean of the combined AD statistic
# sig.c = standard deviation of the combined AD statistic
# warning = logical indicator, warning = TRUE when at least one of 
#           the sample sizes is < 5.
# null.dist1 = enumerated or simulated values of the combined AD statistic, version 1
# null.dist2 = enumerated or simulated values of the combined AD statistic, version 2
# method = the method that was used: "asymptotic", "simulated", or "exact".
# Nsim = the number of simulations that were used.
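#
# For example, out.ad.combined$ad.c then holds the 2 x 3 (or 2 x 4) matrix 
# described under ad.c above, and out.ad.combined$ad.list[[1]] the test 
# results for the first data set (block).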
#
# Fritz Scholz, August 2012
#####################################################################################
convvec <- function(x1,x2){
#----------------------------------------------------
# R routine for calling convvec in conv.c
# created 02.05.2012  Fritz Scholz
#----------------------------------------------------
n1 <- length(x1)
n2 <- length(x2)
n <- n1*n2
x <- numeric(n)
 out <- .C("convvec",  x1=as.double(x1), n1=as.integer(n1), 
					x2=as.double(x2),n2=as.integer(n2), 
					x=as.double(x),n=as.integer(n), PACKAGE= "kSamples")
out$x
}
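
# A pure-R sketch of what convvec computes: the vector of all pairwise sums 
# of x1 and x2, i.e. the exact convolution of the two enumerated sets of 
# null-distribution values. It is kept here only to document the semantics 
# of the C routine (the element order may differ from conv.c) and is not 
# called below.
convvec.R <- function(x1, x2){
	# pair every element of x1 with every element of x2 and add
	rep(x1, each = length(x2)) + rep(x2, times = length(x1))
}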


# the following converts individual data sets into a list of such,
# if not already in this form.

data.sets <- io2(...,data=data)
data.sets <- test.list(data.sets)

# end of data.sets list conversion

method <- match.arg(method)
n.sizes <- NULL
M <- length(data.sets) # number of data sets (blocks)

n.data <- sapply(data.sets, length) 
		# gets vector of number of samples in each component of data.sets
n.samples <- list() # intended to contain vectors of sample sizes 
                    # for each respective data set.
na.t <- 0 # intended to tally total of NA cases
ncomb <- 1 # intended to hold the total number of evaluations 
           # of the full convolution distribution, to check
           # whether exact computation is reasonable. 

for(i in 1:M){
	out <- na.remove(data.sets[i])
	na.t<- na.t + out$na.total
	data.sets[i] <- out$x.new # replace data set i with the one that has
                              # NA's removed
	n.sample <- sapply(data.sets[[i]], length) 
				# contains sample sizes for i-th data set
	n.sizes <- c(n.sizes, n.sample) 
				# accumulates all sample sizes, for warning purposes only
	if(any(n.sample==0))
		stop(paste("one or more samples in data set", i,
                       "has no observations"))
	n.samples[[i]] <- n.sample 
	N <- sum(n.sample) # total observations in i-th data set
    	k <- length(n.sample) # number of samples in i-th data set

    # updating ncomb
   	ncomb <- ncomb * choose(N,n.sample[1])
	for(j in 2:k){
		N <- N-n.sample[j-1]
		ncomb <- ncomb * choose(N,n.sample[j])
	} # end of ncomb update for data set i
}
Nsim <- min(Nsim,1e7) 
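# the check below falls back to simulation when exact enumeration is not 
# feasible within Nsim, as explained in the header comments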

if(ncomb > Nsim & method == "exact") method <- "simulated"	

if( na.t > 1) cat(paste("\n",na.t," NAs were removed!\n\n"))
if( na.t == 1) cat(paste("\n",na.t," NA was removed!\n\n"))

warning <- min(n.sizes) < 5 # set warning flag for caution on
                            # trusting asymptotic p-values


# Initializing output objects
AD <- 0
sig <- NULL
n.ties <- NULL
nt <- NULL
mu <- NULL
ad.list <- list()
mu.c <- 0
dist1 <- NULL
dist2 <- NULL
if(method == "asymptotic"){dist0 <- FALSE}else{dist0 <- TRUE}
# the following loop aggregates the (estimated or exact)
# convolution distribution of the combined AD statistic versions
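# In the simulated case the Nsim values generated for each data set are 
# independent across data sets, so element-wise addition of the per-set 
# vectors yields Nsim draws of the combined statistic; in the exact case 
# the full convolution of the enumerated per-set distributions is formed 
# via convvec.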
for(i in 1:M){
	out <- ad.test(data.sets[[i]],method=method,dist=dist0,Nsim=Nsim)
	if(dist0==TRUE){
	    if(i == 1){
			dist1 <- out$null.dist1
			dist2 <- out$null.dist2
		}else{
			if(method=="simulated"){
				dist1 <- dist1+out$null.dist1
				dist2 <- dist2+out$null.dist2
			}else{
				dist1 <- convvec(dist1,out$null.dist1)
				dist2 <- convvec(dist2,out$null.dist2)
			}
		}
	}
	ad.list[[i]] <- out$ad
	sig.i <- out$sig
	mu <- c(mu, length(data.sets[[i]])-1)
	AD.i <- out$ad[,1]
	sig <- c(sig, sig.i) # accumulated stand. dev.'s of AD stats
	AD <- AD+AD.i # aggregates combined AD stats (version 1 and 2)
	mu.c <- mu.c + length(data.sets[[i]]) - 1 # aggregates mean of combined AD stats
	n.ties <- c(n.ties, out$n.ties)
	nt <- c(nt, sum(out$ns)) # accumulates total observations in data sets
}
AD <- as.numeric(AD)
# get exact or simulated P-value
if(dist0 == TRUE){
	nrow <- length(dist1)
	ad.pval1.dist <- sum(dist1 >= AD[1])/nrow
	ad.pval2.dist <- sum(dist2 >= AD[2])/nrow
}
sig.c <- sqrt(sum(sig^2)) # standard deviation of combined AD stats
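# (with the header example this is sqrt(0.92837^2 + 0.64816^2) = 1.13225, 
#  the "Standard deviation" reported for AD.comb in the sample output above)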
if(sig.c > 0){
	tc.obs <- (AD - mu.c)/sig.c # standardized values of AD stats
}else{
	tc.obs <- NA
}

# get asymptotic P-value
if(sig.c > 0){
	ad.pval1 <- ad.pval(tc.obs[1], mu.c, 1)
	ad.pval2 <- ad.pval(tc.obs[2], mu.c, 2)
}else{
	ad.pval1 <- 1
	ad.pval2 <- 1
}

    if(method == "asymptotic"){
		ad.c <- matrix(c(signif(AD[1],7),signif(tc.obs[1],7),round(ad.pval1,7),
           signif(AD[2],7),signif(tc.obs[2],7), round(ad.pval2,7)),
           byrow=TRUE, ncol=3)
		dimnames(ad.c) <- list(c("version 1:","version 2:"),
						c("AD.comb","T.comb"," asympt. P-value"))
    }
    if(method == "exact"){
		ad.c <- matrix(c(signif(AD[1],7),signif(tc.obs[1],7),round(ad.pval1,7),
		   round(ad.pval1.dist,7),
           signif(AD[2],7),signif(tc.obs[2],7), round(ad.pval2,7),
		   round(ad.pval2.dist,7)),byrow=TRUE, ncol=4)
		dimnames(ad.c) <- list(c("version 1:","version 2:"),
						c("AD.comb","T.comb"," asympt. P-value"," exact P-value"))
    }
    if(method == "simulated"){
		ad.c <- matrix(c(signif(AD[1],7),signif(tc.obs[1],7),round(ad.pval1,7),
		   round(ad.pval1.dist,7),
           signif(AD[2],7),signif(tc.obs[2],7), round(ad.pval2,7),
		   round(ad.pval2.dist,7)),byrow=TRUE, ncol=4)

		dimnames(ad.c) <- list(c("version 1:","version 2:"),
						c("AD.comb","T.comb"," asympt. P-value"," sim. P-value"))
    }

if(dist==FALSE){
	dist1 <- NULL
	dist2 <- NULL
}


object <- list(test.name ="Anderson-Darling",
				M=M, n.samples=n.samples, nt=nt, n.ties=n.ties, ad.list=ad.list,
           		mu=mu, sig=sig, ad.c = ad.c, mu.c=mu.c,
           		sig.c=round(sig.c,5), warning=warning,null.dist1=dist1,
				null.dist2=dist2,method=method,Nsim=Nsim)
class(object) <-  "kSamples"
object
}