1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
|
### From Master Thesis of Emmanuel Profumo (w/ M.Maechler) Autumn 2016--March 2017
### "Generalized clusGap()" : We cannot be 100% compatible to clusGap()
#' @param x the data, can be a data frame or a matrix
#' @param algo, a clustering algorithm function taking the prepared data and
#' a number of clusters as arguments
#' @param index, a function taking a clustering vector and the prepared data
#' which returns the value of a validity index. Index can also be a list of such
#' functions to obtain results for different indices.
#' For coherence with the originally proposed Gap Statistic in Tibshirani et al's
#' a LOWER value of the validity index implies a better clustering quality, so
#' indices such as average silhouette width should be added a minus sign.
#' This can be changed by setting the argument low=FALSE.
#' @param modelH0, a function which takes as argument at least the data x,
#' parameters estimated from the data, and further arguments in ...
#' @param K.max, number of different clusters for which the index should be
#' evaluated.
#' @param B, the number of bootstraps sample.
#' @param transformData, a function which takes the data x as argument and
#' processed it for clustering
#' @param modelH0Param, a function which takes as argument the data x and returns
#' a list of modelH0 parameters with matching names.
#' @param low, logical, if FALSE a HIGHER value of the index or the indices in the
#' user provided list implies a better clustering quality
#'
#' @return if index is just one function a list with components:
#' Indks, the bootstrap validity plots
#' Ind, the validity plot corresponding to the data
#' E.Ind, the sample mean of the bootstrap validity plots
#' gap, the calibrated validity plot, difference between E.Ind and Ind
#' gapHen, the gap divided by the standard deviation of the bootstraps validity plots
#' SE.sim, the sample standard error of the bootstrap validity plots with a correction
#' term for bootstrap estimation
#' SE, the sample standard error
#' if index is a list of index then each of the components above are lists with values
#' for each index
clusGapGen <- function(x, algo, index, modelH0, K.max, B = 100,
transformData = identity,
modelH0Param = function(y) list(),
low=TRUE, verbose = interactive(), ...)
{
ind.isList <- is.list(index)
if (is.function(index))
index <- list(index)
else if (!ind.isList || !all(vapply(index, is.function, NA)))
stop("index has to be a function or a list of function")
Ind <- E.Ind <- SE.sim <- SE <- index
for (i in seq_along(index)) Ind[[i]] <- E.Ind[[i]] <- SE.sim[[i]] <- numeric(K.max)
if(verbose) cat("Clustering k = 1,2,..., K.max (= ",K.max,"): .. ", sep='')
xt <- transformData(x)
for(k in 1:K.max){
cls <- algo(xt,k)
for (i in seq_along(index))
Ind[[i]][k] <- index[[i]](cls,xt)
}
if(verbose) cat("done\n")
Indks <- index
for (i in seq_along(index)) Indks[[i]] <- matrix(0, B, K.max)
param <- modelH0Param(x)
if(verbose) cat("Bootstrapping, b = 1,2,..., B (= ", B,
") [one \".\" per sample]:\n", sep="")
for (b in 1:B) {
z <- do.call(modelH0,c(list(x=x),param,list(...)))
zt <- transformData(z)
for(k in 1:K.max) {
cls <- algo(zt,k)
for (i in seq_along(index))
Indks[[i]][b,k] <- index[[i]](cls,zt)
}
if(verbose) cat(".", if(b %% 50 == 0) paste(b,"\n"))
}
if(verbose && (B %% 50 != 0)) cat("",B,"\n")
gap <- gapHen <- index
for (i in seq_along(index)){
E.Ind[[i]] <- colMeans(Indks[[i]])
var.i <- apply(Indks[[i]], 2, var)
SE[[i]] <- sqrt(var.i)
SE.sim[[i]] <- sqrt((1 + 1/B) * var.i)
gap[[i]] <- gap.i <- E.Ind[[i]] - Ind[[i]]
gapHen[[i]] <- gap.i/SE[[i]]
if (!low) {
gap[[i]] <- -gap[[i]]
gapHen[[i]] <- -gapHen[[i]]
}
}
## TODO: really? make distinction of *list* of indices vs 1 index?
## --- well maybe, keep it: *The* usual case = _one_ index (or not?)
if (ind.isList) {
list(Indks=Indks,Ind=Ind, E.Ind=E.Ind, gap = gap,
gapHen = gapHen ,SE.sim=SE.sim,SE=SE)
}
else {list(Indks=Indks[[1]],Ind=Ind[[1]], E.Ind=E.Ind[[1]], gap = gap[[1]],
gapHen = gapHen[[1]] ,SE.sim=SE.sim[[1]],SE=SE[[1]])
}
}
#' @param clusGapRes, a list returned by a call of function clusGapGen
#' @param main, the main title to the plots
#' @param divBySd, logical, if TRUE plot for the standardize version of the gap
clusGapGen.plot <- function(clusGapRes,divBySd=FALSE,main=""){
if (!is.list(clusGapRes$Ind)) clusGapRes <- lapply(clusGapRes,
function(el) list(el))
for (i in seq_along(clusGapRes$Ind)){
B <- nrow(clusGapRes$Indks[[i]])
K.max <- ncol(clusGapRes$Indks[[i]])
std <- t(replicate(B,rep(1,K.max)))
ylm <- range(rbind(clusGapRes$Indks[[i]],clusGapRes$Ind[[i]]),na.rm=TRUE)
ylb <- names(clusGapRes$Ind[i])
if (is.null(ylb)) ylb <- paste("Index",as.character(i))
gp <- "gap"
namegap <- paste(gp,ylb)
if (divBySd) {
std <- t(replicate(B,clusGapRes$SE.sim[[i]]))
gp <- "gapHen"
namegap <- paste(gp,ylb)
}
matplot(replicate(B,1:K.max),t(clusGapRes$Indks[[i]]),
pch = "-",xlab = "k",ylab = ylb,
type="l",ylim=ylm,main=main
)
lines(1:K.max,clusGapRes$E.Ind[[i]],type="l",col="white",lwd=2)
lines(1:K.max,clusGapRes$Ind[[i]],lwd=2)
boxplot((clusGapRes$Indks[[i]]-t(replicate(B,clusGapRes$Ind[[i]])))/std,
pch = "*",xlab = "k", ylab = namegap,type="l",col=c("light blue"),
notch=TRUE, border="grey",main=main
)
lines(1:K.max,clusGapRes[[gp]][[i]],type="l",xlab="k",ylab="",
col="orangered",lwd=1.5)
}
}
|