File: simClustDesign.R

package info (click to toggle)
r-cran-clustergeneration 1.3.8-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 368 kB
  • sloc: makefile: 2
file content (434 lines) | stat: -rw-r--r-- 18,736 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434

# Generating data sets via a factorial design (Qiu and Joe, 2006), 
# which has factors degree of separation, number of clusters, 
# number of non-noisy variables, number of noisy variables, 
# number of outliers, number of replicates, etc. 
# The separation between any cluster and its nearest neighboring clusters 
# can be set to a specified value. The covariance matrices of clusters 
# can have arbitrary diameters, shapes and orientations.

#   Qiu, W.-L. and Joe, H. (2006)
#   Generation of Random Clusters with Specified Degree of Separaion.
#   \emph{Journal of Classification}, \bold{23}(2),  315--334.
#
#  Factors                          Levels
#---------------------------------------------
# Number of clusters                3, 6, 9
# Degree of separation              close, separated, well-separated
# Number of non-noisy variables     4, 8, 20
# Number of noisy variables         1, 0.5p1, p1
#---------------------------------------------
# p1 is the number of non-noisy variables
#
# sepVal -- Levels of the degree of separation. 
#          It has been "calibrated" against two univariate 
#          clusters generated from N(0, 1) and N(0, A), respectively,  
#          With A=4, sepVal=0.01 indicates a close cluster structure. 
#          With A=6, sepVal=0.21 indicates a separate cluster structure. 
#          With A=8, sepVal=0.34 indicates a well-separated cluster structure.
#      'close' -- 'sepVal=0.01'
#      'separated' -- 'sepVal=0.21'
#      'well-separated' -- 'sepVal=0.342'
# sepLabels -- labels for 'close', 'separated', and 'well-separated' 
# numClust -- levels of numbers of clusters
# numNonNoisy -- levels of numbers of non-noisy variables
# numNoisy -- levels of numbers of noisy variables
# numOutlier -- number/ratio of outliers. If numOutlier is a positive integer,
#   then numOutlier means the number of outliers. If numOutlier is a real
#   number in (0,1), then numOutlier means the ratio of outliers, i.e. 
#   the number of outliers is equal to round(numOutlier*N), 
#   where N is the total number of non-outliers.
# numReplicate -- the number of data sets to generate for each combination.
#   the default value is 3 as in the design in Milligan (1985) 
#   (An algorithm for generating artificial test clusters,
#   Psychometrika, 50, 123-127)
# fileName -- the output data file names have the format 
#       [fileName]J[j]G[g]v[p1]nv[p2]out[numOutlier]_[numReplicate].[extension] 
#   where
#      'extension' can be 'dat', 'log', 'mem', or 'noisy',
#      'J' means separation index, 'G' means number of clusters,
#      'v' means the number of non-noisy variables,
#      'nv' means the number of noisy variables,
#      'out' means the number of outliers
# clustszind -- cluster size indicator. 
#      clustszind=1 indicates that all cluster have equal size.  
#         The size is specified by clustSizeEq.
#      clustszind=2 indicates that the cluster sizes are randomly 
#         generated from the range rangeN.
#      clustszind=3 indicates that the cluster sizes are specified
#         via the vector clustSizes.
#      The default value is 2 so that the generated clusters are more realistic.
# clustSizeEq -- if clustszind=1, then this is the constant cluster size 
#      The default value 100 is a reasonable cluster size.
# rangeN -- if clustszind=2, then the cluster sizes are randomly generaged
#         from the range 'rangeN'. The default range is [50, 200] which
#         can produce reasonable variability of cluster sizes.
# clustSizes -- if clustszind=3, then the cluster sizes are specified via
#         clustSizes. An input is required with
#         the default value of 'clustSizes' set as 'NULL'.
# covMethod -- methods to generate random covariance matrices. 
#      'eigen' method first generates the eigenvalues of the covariance matrix,
#          then generate eigenvectors to construct the covariance matrix. 
#      'unifcorrmat' method first generates a correlation 
#          matrix via the method proposed by 
#           Joe H (2006). Generating random correlation matrices based on 
#           partial correlations. J. Mult. Anal. Vol. 97, 2177--2189
#          Then, it generate variances from the range 'rangeVar' to
#          construct the covariance matrix. 
#
#      'onion' method
# extension of onion method, using Cholesky instead of msqrt
#Ghosh, S., Henderson, S. G. (2003). Behavior of the NORTA method for
#correlated random vector generation as the dimension increases.
#ACM Transactions on Modeling and Computer Simulation (TOMACS)
#v 13 (3), 276-294.
#
#      'c-vine' method
#      # Random correlation with the C-vine (different order of partial
# correlations). Reference for vines: Kurowicka and Cooke, 2006,
# Uncertainty Analysis with High Dimensional Dependence Modelling,
# Wiley, 2006.
#
#      The default method is 'eigen' so that the user can directly 
#      specify the range of the 'diameters' of clusters.
# rangeVar -- if 'covMethod="unifcorrmat"', then to generate a covariance 
#      matrix, we first generate a correlation matrix, then randomly generate
#      variances from the range specified by 'rangeVar'.
#      The default range is [1, 10] which can generate reasonable
#      variability of variances.
# lambdaLow -- if 'covMethod="eigen"', when generating a covariance matrix, 
#      we first generate its eigenvalues. The eigenvalues are randomly 
#      generated from the interval [lambdaLow, lambdaLow*ratioLambda]. 
#      Note that lambdaLow should be positive.
#      In our experience, the range [lambdaLow=1, ratioLambda=10]
#      can give reasonable variability of the diameters of clusters.
# ratioLambda -- if 'covMethod="eigen"', lambdaUpp=lambdaLow*ratioLambda, 
#      is the upper bound of the eigenvalues of a covariance matrix
#      and lambdaLow is the lower bound.
#      In our experience, the range [lambdaLow=1, lambdaUpp=10]
#      can give reasonable variability of the diameters of clusters.
# alphad -- parameter for c-vine and onion method to generate random correlation matrix
#       eta=1 for uniform. eta should be > 0
# eta -- parameter for c-vine and onion method to generate random correlation matrix
#       eta=1 for uniform. eta should be > 0
# rotateind-- if rotateind =TRUE, then rotate data so that we may not detect the
#      full cluster structure from scatterplots, otherwise do not rotate.
#      By default, 'rotateind=TRUE' to generate more realistic data sets.
# iniProjDirMethod -- indicating the method to get initial projection direction 
#      By default, the sample version of the Su and Liu (SL) projection 
#      direction ((n1-1)*S1+(n2-1)*S2)^{-1}(mu2-mu1) is used,
#      where mui and Si are the mean vector and covariance
#      matrix of cluster i. Alternatively, the naive projection
#      direction (mu2-mu1) is used. 
#      Su and Liu (1993) JASA 1993, vol. 88 1350-1355.
# projDirMethod -- takes values "fixedpoint" or "newton"
#      "fixedpoint" means that we get optimal projection direction
#        by solving the equation d J(w) / d w = 0, where
#        J(w) is the separation index with projection direction 'w'
#      "newton" method means that we get optimal projection driection
#        by the method proposed in the appendix of Qiu and Joe (2006) 
#        "Generation of random clusters with specified degree of separation",
#        Journal of Classification Vol 23(2), 315-334.
# alpha -- tuning parameter for separation index to indicating the percentage 
#      of data points to downweight. We set 'alpha=0.05' like we set
#      the significance level in hypothesis testing as 0.05.
# ITMAX -- when calculating the projection direction, we need iterations. 
#      ITMAX gives the maximum iteration allowed.
#      The actual #iterations is usually much less than the default value 20.
# eps -- A small positive number to check if a quantitiy \eqn{q} is 
#      equal to zero. If \eqn{|q|<}\code{eps}, then we regard \eqn{q} 
#      as equal to zero.  \code{eps} is used to check if an algorithm 
#      converges. The default value is \eqn{1.0e-10}.
# quiet -- a flag to switch on/off the outputs of intermediate results.
#      The default value is 'TRUE'.
# outputEmpirical -- indicates if empirical projection directions
#      and separation indices should be output to files.
#      These information usually are useful to check the cluster
#      structures. Hence, by default, 'outputEmpirical=TRUE'.
# outputInfo -- indicates if separation information dataframe should be output.
#      The file name extension is .log
simClustDesign<-function(numClust = c(3,6,9), 
                         sepVal = c(0.01, 0.21, 0.342), 
                         sepLabels = c("L", "M", "H"), 
                         numNonNoisy = c(4,8,20), 
                         numNoisy = NULL, 
                         numOutlier = 0, 
                         numReplicate = 3, 
                         fileName = "test", 
                         clustszind = 2, 
                         clustSizeEq = 50, 
                         rangeN = c(50,200), 
                         clustSizes = NULL,
                         covMethod = c("eigen", "onion", "c-vine", "unifcorrmat"), 
			 eigenvalue = NULL,
                         rangeVar = c(1, 10), 
                         lambdaLow = 1, 
                         ratioLambda = 10, 
                         alphad = 1, 
                         eta = 1, 
                         rotateind = TRUE, 
                         iniProjDirMethod = c("SL", "naive"), 
                         projDirMethod = c("newton", "fixedpoint"), 
                         alpha = 0.05, 
                         ITMAX = 20, 
                         eps = 1.0e-10, 
                         quiet = TRUE, 
                         outputDatFlag = TRUE,
                         outputLogFlag = TRUE,
                         outputEmpirical = TRUE, 
                         outputInfo = TRUE)
{
  iniProjDirMethod<-match.arg(arg=iniProjDirMethod, choices=c("SL", "naive"))
  projDirMethod<-match.arg(arg=projDirMethod, choices=c("newton", "fixedpoint"))
  covMethod<-match.arg(arg=covMethod, choices=c("eigen", "onion", "c-vine", "unifcorrmat"))
  numClust<-as.integer(numClust)

  # checks for valid inputs
  if(prod(numClust<1, na.rm=TRUE) || !is.integer(numClust)) 
  { 
    stop("The number 'numClust' of clusters should be a positive integer!\n")
  }
  numNonNoisy<-as.integer(numNonNoisy)
  if(prod(numNonNoisy<2) || !is.integer(numNonNoisy)) 
  { 
    stop("The number 'numNonNoisy' of non-noisy variables should be an integer greater than 1!\n") 
  }
  if(prod(sepVal<= -0.999, na.rm=TRUE) || prod(sepVal >= 0.999, na.rm=TRUE)) 
  { 
    stop("The desired separation index 'sepVal' should be in the range (-0.999, 0.999)!\n")
  }
  numReplicate<-as.integer(numReplicate)
  if(numReplicate<1 || !is.integer(numReplicate))
  { 
    stop("The number 'numReplicate' should be a positive integer!\n")
  }
  if(!is.null(numNoisy))
  {
    numNoisy<-as.integer(numNoisy)
    if(numNoisy<0 || !is.integer(numNoisy)) 
    { 
      stop("The number 'numNoisy' of noisy variables should be a non-negative integer!\n") 
    }
  }
  if(numOutlier<0)
  {
    stop("'numOutlier' should be positive!\n")
  }
  if(!is.element(clustszind, c(1,2,3))) 
  { 
    stop("Cluster size indicator 'clustszind' should be 1, 2, or 3!\n")
  }
  clustSizeEq<-as.integer(clustSizeEq)
  if(clustSizeEq<2 || !is.integer(clustSizeEq)) 
  { 
    stop("Cluster size 'clustSizeEq' should be an integer greater than 1!\n") 
  }
  rangeN<-as.integer(rangeN)
  if(length(rangeN)!=2) 
  { 
    stop("The range 'rangeN' for cluster sizes should be a numeric vector of length 2!\n") 
  }
  if(rangeN[1]>rangeN[2]) 
  { 
    stop("First element of 'rangeN' should be smaller than second!\n") 
  }
  if(rangeN[1]<2 || !is.integer(rangeN[1])) 
  { 
    stop("The lower bound 'rangeN[1]' for the range of cluster sizes should be an integer greater than 1!\n") 
  }
  if(!is.integer(rangeN[2])) 
  { 
    stop("The upper bound 'rangeN[2]' for the range of cluster sizes should be integer!\n") 
  }
  if(clustszind==3)
  {
    len<-length(clustSizes)
    if(len!=numClust || is.null(clustSizes)) 
    {
      stop("The number of elements in 'clustSizes' should be equal the number of elements in 'numClust' when the value of 'clustszind' is equal to 3!\n")
    }
    clustSizes<-as.integer(clustSizes)
    for(i in 1:len)
    { if(clustSizes[i]<1 || !is.integer(clustSizes[i]))
      { stop(paste("The cluster size for the ", i, "-th cluster should be an integer greater than 1!\n", sep=""))
      }
    }
  }
  if(rangeVar[1]>rangeVar[2]) 
  { 
    stop("First element of 'rangeVar' should be smaller than second!\n") 
  }
  if(rangeVar[1]<0) 
  { 
    stop("The lower bound 'rangeVar[1]' for the range of variances should be positive!\n") 
  }
  if(lambdaLow<0) 
  { 
    stop("The lower bound 'lambdaLow' of eigenvalues of cluster covariance matrices should be greater than zero!\n") 
  }
  if(ratioLambda<1) 
  { 
    stop("The ratio 'ratioLambda' of the upper bound of the eigenvalues to the lower bound of the eigenvalues of cluster covariance matrices should be greater than 1!\n")
  }
  alphad<-as.numeric(alphad)
  if(alphad<0)
  {
    stop("'alphad' should be positive!\n")
  }
  eta<-as.numeric(eta)
  if(eta<0)
  {
    stop("'eta' should be positive!\n")
  }

  if(!is.logical(rotateind))
  {
    stop("The value of the rotation indicator 'rotateind' should be logical, i.e., either 'TRUE' or 'FALSE'!\n")
  }
  if(alpha<=0 || alpha>0.5)
  {
    stop("The tuning parameter 'alpha' should be in the range (0, 0.5]!\n")
  }
  ITMAX<-as.integer(ITMAX)
  if(ITMAX<=0 || !is.integer(ITMAX))
  {
    stop("The maximum iteration number allowed 'ITMAX' should be a positive integer!\n")
  }
  if(eps<=0 || eps > 0.01)
  {
    stop("The convergence threshold 'eps' should be between (0, 0.01]!\n")
  }
  if(!is.logical(quiet))
  {
    stop("The value of the quiet indicator 'quiet' should be logical, i.e., either 'TRUE' or 'FALSE'!\n")
  }
  if(!is.logical(outputDatFlag))
  {
    stop("The value of the indicator 'outputDatFlag' should be logical, i.e., either 'TRUE' or 'FALSE'!\n")
  }
  if(!is.logical(outputLogFlag))
  {
    stop("The value of the indicator 'outputLogFlag' should be logical, i.e., either 'TRUE' or 'FALSE'!\n")
  }
  if(!is.logical(outputEmpirical))
  {
    stop("The value of the indicator 'outputEmpirical' should be logical, i.e., either 'TRUE' or 'FALSE'!\n")
  }
  if(!is.logical(outputInfo))
  {
    stop("The value of the indicator 'outputInfo' should be logical, i.e., either 'TRUE' or 'FALSE'!\n")
  }
  # end of checks of valid inputs, loop begins

  loop<-0
  cat("Generating data sets. Please wait ...\n")
   
  for(j in 1:length(sepVal))
  { for(g in numClust)
    { for(p1 in numNonNoisy)
      { if(is.null(numNoisy))
        { numNoisy<-c(1, round(p1/2), p1) } 
        for(p2 in numNoisy)
        { p<-p1+p2 # total number of variables
          loop<-loop+1
          tmpfileName<-paste(fileName,"J", sepLabels[j],"G",g,
            "v", p1, "nv",p2, "out", numOutlier, sep="")
          res<-genRandomClust(
			      numClust = g, 
			      numNonNoisy = p1, 
			      sepVal = sepVal[j], 
                              numNoisy = p2, 
			      numReplicate = numReplicate, 
			      numOutlier = numOutlier, 
                              fileName = tmpfileName,  
			      clustszind = clustszind, 
                              clustSizeEq = clustSizeEq, 
			      rangeN = rangeN, 
                              clustSizes = clustSizes, 
			      covMethod = covMethod, 
			      eigenvalue = eigenvalue,
                              rangeVar = rangeVar, 
			      lambdaLow = lambdaLow, 
                              ratioLambda = ratioLambda,  
			      rotateind = rotateind, 
                              iniProjDirMethod = iniProjDirMethod, 
                              projDirMethod = projDirMethod, 
			      alpha = alpha, 
                              ITMAX = ITMAX, 
			      eps = eps, 
			      quiet = quiet, 
                              outputDatFlag = outputDatFlag,
                              outputLogFlag = outputLogFlag,
                              outputEmpirical = outputEmpirical, 
                              outputInfo = FALSE)

          if(loop>1) 
          { infoFrameTheory<-rbind(infoFrameTheory, res$infoFrameTheory)
            if(outputEmpirical)
            { infoFrameData<-rbind(infoFrameData, res$infoFrameData) }
            else { infoFrameData<-NULL }
            datList[[loop]]<-res$datList
            memList[[loop]]<-res$memList
            noisyList[[loop]]<-res$noisyList
          } else {
            infoFrameTheory<-res$infoFrameTheory
            if(outputEmpirical)
            { infoFrameData<-res$infoFrameData }
            else { infoFrameData<-NULL }
           
            datList<-list(res$datList)
            memList<-list(res$memList)
            noisyList<-list(res$noisyList)
          } 
 
        }
      }
    }
  }

  infoFrameTheory<-data.frame(infoFrameTheory)
  nr<-nrow(infoFrameTheory)
  rownames(infoFrameTheory)<-1:nr
  
  if(outputEmpirical)
  { infoFrameData<-data.frame(infoFrameData) 
    nr<-nrow(infoFrameData)
    rownames(infoFrameData)<-1:nr
  }
  else { infoFrameData<-NULL }

  names(datList)<-1:loop
  names(memList)<-1:loop
  names(noisyList)<-1:loop

  if(outputInfo)
  { fileNameInfo<-paste(fileName, "_info.log", sep="")
    msg<-"Theoretical separation information data frame>>>>>>>>>>>\n"
    write.table(msg, append=FALSE, file=fileNameInfo, quote=FALSE,
                row.names=FALSE, col.names=FALSE)
    msg<-colnames(infoFrameTheory)
    write(msg, append=TRUE, file=fileNameInfo, ncolumns=7, sep=" ")
    tt<-infoFrameTheory
    tt[,1:6]<-round(tt[,1:6], 3)
    write.table(tt, file=fileNameInfo, quote=FALSE, sep="\t",
                row.names=FALSE, col.names=FALSE, append=TRUE)

    if(outputEmpirical)
    { msg<-"\nEmpirical separation information data frame>>>>>>>>>>>\n"
      write.table(msg, append=TRUE, file=fileNameInfo, quote=FALSE,
                  row.names=FALSE, col.names=FALSE)
      msg<-colnames(infoFrameData)
      write(msg, append=TRUE, file=fileNameInfo, ncolumns=7, sep=" ")
      tt<-infoFrameData
      tt[,1:6]<-round(tt[,1:6], 3)
      write.table(tt, file=fileNameInfo, quote=FALSE, sep="\t",
                  row.names=FALSE, col.names=FALSE, append=TRUE)
    }
  }

  cat("The process is completed successfully!\n")
  invisible(list(infoFrameTheory=infoFrameTheory, 
              infoFrameData=infoFrameData, 
              datList=datList, memList=memList, noisyList=noisyList))
}