File: clusGapGen.R

package info (click to toggle)
cluster 2.0.7-1-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 1,496 kB
  • sloc: ansic: 2,981; fortran: 123; sh: 18; makefile: 2
file content (141 lines) | stat: -rw-r--r-- 5,796 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
### From Master Thesis of Emmanuel Profumo (w/ M.Maechler) Autumn 2016--March 2017
### "Generalized clusGap()" :  We cannot be 100% compatible to clusGap()

#' @param x the data, can be a data frame or a matrix
#' @param algo, a clustering algorithm function taking the prepared data and
#' a number of clusters as arguments
#' @param index, a function taking a clustering vector and the prepared data
#' which returns the value of a validity index. Index can also be a list of such
#' functions to obtain results for different indices.
#' For coherence with the originally proposed Gap Statistic in Tibshirani et al's
#' a LOWER value of the validity index implies a better clustering quality, so
#' indices such as average silhouette width should be added a minus sign.
#' This can be changed by setting the argument low=FALSE.
#' @param modelH0, a function which takes as argument at least the data x,
#' parameters estimated from the data, and further arguments in ...
#' @param K.max, number of different clusters for which the index should be
#' evaluated.
#' @param B, the number of bootstraps sample.
#' @param transformData, a function which takes the data x as argument and
#' processed it for clustering
#' @param modelH0Param, a function which takes as argument the data x and returns
#' a list of modelH0 parameters with matching names.
#' @param low, logical, if FALSE a HIGHER value of the index or the indices in the
#' user provided list implies a better clustering quality
#'
#' @return if index is just one function a list with components:
#' Indks, the bootstrap validity plots
#' Ind, the validity plot corresponding to the data
#' E.Ind, the sample mean of the bootstrap validity plots
#' gap, the calibrated validity plot, difference between E.Ind and Ind
#' gapHen, the gap divided by the standard deviation of the bootstraps validity plots
#' SE.sim, the sample standard error of the bootstrap validity plots with a correction
#' term for bootstrap estimation
#' SE, the sample standard error
#' if index is a list of index then each of the components above are lists with values
#' for each index
clusGapGen <- function(x, algo, index, modelH0, K.max, B = 100,
                       transformData = identity,
                       modelH0Param = function(y) list(),
                       low=TRUE, verbose = interactive(), ...)
{
  ind.isList <- is.list(index)
  if (is.function(index))
    index <- list(index)
  else if (!ind.isList || !all(vapply(index, is.function, NA)))
    stop("index has to be a function or a list of function")

  Ind <- E.Ind <- SE.sim <- SE <- index
  for (i in seq_along(index)) Ind[[i]] <- E.Ind[[i]] <- SE.sim[[i]] <- numeric(K.max)

  if(verbose) cat("Clustering k = 1,2,..., K.max (= ",K.max,"): .. ", sep='')
  xt <- transformData(x)
  for(k in 1:K.max){
    cls <- algo(xt,k)
    for (i in seq_along(index))
      Ind[[i]][k] <- index[[i]](cls,xt)
  }
  if(verbose) cat("done\n")
  Indks <- index
  for (i in seq_along(index)) Indks[[i]] <- matrix(0, B, K.max)

  param <- modelH0Param(x)
  if(verbose) cat("Bootstrapping, b = 1,2,..., B (= ", B,
                  ")  [one \".\" per sample]:\n", sep="")
  for (b in 1:B) {
    z <- do.call(modelH0,c(list(x=x),param,list(...)))
    zt <- transformData(z)
    for(k in 1:K.max) {
      cls <- algo(zt,k)
      for (i in seq_along(index))
        Indks[[i]][b,k] <- index[[i]](cls,zt)
    }
    if(verbose) cat(".", if(b %% 50 == 0) paste(b,"\n"))
  }
  if(verbose && (B %% 50 != 0)) cat("",B,"\n")
  gap <- gapHen <- index
  for (i in seq_along(index)){
    E.Ind[[i]] <- colMeans(Indks[[i]])
    var.i <- apply(Indks[[i]], 2, var)
    SE[[i]] <- sqrt(var.i)
    SE.sim[[i]] <- sqrt((1 + 1/B) * var.i)
    gap[[i]] <- gap.i <- E.Ind[[i]] - Ind[[i]]
    gapHen[[i]] <- gap.i/SE[[i]]
    if (!low) {
      gap[[i]] <- -gap[[i]]
      gapHen[[i]] <- -gapHen[[i]]
    }
  }
  ## TODO: really? make distinction of *list* of indices vs 1 index?
  ## ---   well maybe, keep it: *The* usual case = _one_ index (or not?)
  if (ind.isList) {
    list(Indks=Indks,Ind=Ind, E.Ind=E.Ind, gap = gap,
         gapHen = gapHen ,SE.sim=SE.sim,SE=SE)
  }
  else {list(Indks=Indks[[1]],Ind=Ind[[1]], E.Ind=E.Ind[[1]], gap = gap[[1]],
             gapHen = gapHen[[1]] ,SE.sim=SE.sim[[1]],SE=SE[[1]])
  }
}


#' @param clusGapRes, a list returned by a call of function clusGapGen
#' @param main, the main title to the plots
#' @param divBySd, logical, if TRUE plot for the standardize version of the gap

clusGapGen.plot <- function(clusGapRes,divBySd=FALSE,main=""){


  if (!is.list(clusGapRes$Ind)) clusGapRes <- lapply(clusGapRes,
                                                     function(el) list(el))
  for (i in seq_along(clusGapRes$Ind)){
    B <- nrow(clusGapRes$Indks[[i]])
    K.max <- ncol(clusGapRes$Indks[[i]])
    std <- t(replicate(B,rep(1,K.max)))
    ylm <- range(rbind(clusGapRes$Indks[[i]],clusGapRes$Ind[[i]]),na.rm=TRUE)
    ylb <- names(clusGapRes$Ind[i])
    if (is.null(ylb)) ylb <- paste("Index",as.character(i))
    gp <- "gap"
    namegap <- paste(gp,ylb)
    if (divBySd) {
      std <- t(replicate(B,clusGapRes$SE.sim[[i]]))
      gp <- "gapHen"
      namegap <- paste(gp,ylb)
    }
    matplot(replicate(B,1:K.max),t(clusGapRes$Indks[[i]]),
            pch = "-",xlab = "k",ylab = ylb,
            type="l",ylim=ylm,main=main
    )

    lines(1:K.max,clusGapRes$E.Ind[[i]],type="l",col="white",lwd=2)
    lines(1:K.max,clusGapRes$Ind[[i]],lwd=2)

    boxplot((clusGapRes$Indks[[i]]-t(replicate(B,clusGapRes$Ind[[i]])))/std,
            pch = "*",xlab = "k", ylab = namegap,type="l",col=c("light blue"),
            notch=TRUE, border="grey",main=main
    )

    lines(1:K.max,clusGapRes[[gp]][[i]],type="l",xlab="k",ylab="",
          col="orangered",lwd=1.5)

  }
}