% clusGap.Rd -- Gap statistic documentation (cluster package)
\name{clusGap}
\title{Gap Statistic for Estimating the Number of Clusters}
\alias{clusGap}
\alias{maxSE}
\alias{print.clusGap}
\alias{plot.clusGap}
\description{
\code{clusGap()} calculates a goodness of clustering measure, the
\dQuote{gap} statistic. For each number of clusters \eqn{k}, it
compares \eqn{\log(W(k))}{log(W(k))} with
\eqn{E^*[\log(W(k))]}{E*[log(W(k))]} where the latter is defined via
bootstrapping, i.e., simulating from a reference (\eqn{H_0})
distribution, a uniform distribution on the hypercube determined by
the ranges of \code{x}, after first centering, and then
\code{\link{svd}} (aka \sQuote{PCA})-rotating them when (as by
default) \code{spaceH0 = "scaledPCA"}.
\code{maxSE(f, SE.f)} determines the location of the \bold{maximum}
of \code{f}, taking a \dQuote{1-SE rule} into account for the
\code{*SE*} methods. The default method \code{"firstSEmax"} looks for
the smallest \eqn{k} such that its value \eqn{f(k)} is not more than 1
standard error away from the first local maximum.
This is similar but not the same as \code{"Tibs2001SEmax"}, Tibshirani
et al.'s recommendation of determining the number of clusters from the
gap statistics and their standard deviations.
}
\usage{
clusGap(x, FUNcluster, K.max, B = 100, d.power = 1,
spaceH0 = c("scaledPCA", "original"),
verbose = interactive(), \dots)
maxSE(f, SE.f,
method = c("firstSEmax", "Tibs2001SEmax", "globalSEmax",
"firstmax", "globalmax"),
SE.factor = 1)
\S3method{print}{clusGap}(x, method = "firstSEmax", SE.factor = 1, \dots)
\S3method{plot}{clusGap}(x, type = "b", xlab = "k", ylab = expression(Gap[k]),
main = NULL, do.arrows = TRUE,
arrowArgs = list(col="red3", length=1/16, angle=90, code=3), \dots)
}
\arguments{
\item{x}{numeric matrix or \code{\link{data.frame}}.}
\item{FUNcluster}{a \code{\link{function}} which accepts as first
argument a (data) matrix like \code{x}, second argument, say
\eqn{k, k\geq 2}{k, k >= 2}, the number of clusters desired,
and returns a \code{\link{list}} with a component named (or shortened to)
\code{cluster} which is a vector of length \code{n = nrow(x)} of
integers in \code{1:k} determining the clustering or grouping of the
\code{n} observations.}
\item{K.max}{the maximum number of clusters to consider, must be at
least two.}
\item{B}{integer, number of Monte Carlo (\dQuote{bootstrap}) samples.}
\item{d.power}{a positive integer specifying the power \eqn{p} which
is applied to the euclidean distances (\code{\link{dist}}) before
they are summed up to give \eqn{W(k)}. The default, \code{d.power = 1},
corresponds to the \dQuote{historical} \R implementation, whereas
\code{d.power = 2} corresponds to what Tibshirani et al.\ had
proposed. This was found by Juan Gonzalez, in 2016-02.% Feb. 2016
\item{spaceH0}{a \code{\link{character}} string specifying the
space of the \eqn{H_0} distribution (of \emph{no} cluster). Both
\code{"scaledPCA"} and \code{"original"} use a uniform distribution
in a hypercube and had been mentioned in the reference;
\code{"original"} has been added after a proposal (including code) by
Juan Gonzalez.}
\item{verbose}{integer or logical, determining if \dQuote{progress}
output should be printed. The default prints one bit per bootstrap
sample.}
\item{\dots}{(for \code{clusGap()}:) optionally further arguments for
\code{FUNcluster()}, see \code{kmeans} example below.}
\item{f}{numeric vector of \sQuote{function values}, of length
\eqn{K}, whose (\dQuote{1 SE respected}) maximum we want.}
\item{SE.f}{numeric vector of length \eqn{K} of standard errors of \code{f}.}
\item{method}{character string indicating how the \dQuote{optimal}
number of clusters, \eqn{\hat k}{k^}, is computed from the gap
statistics (and their standard deviations), or more generally how
the location \eqn{\hat k}{k^} of the maximum of \eqn{f_k}{f[k]}
should be determined.
%% -> ../R/clusGap.R
\describe{
\item{\code{"globalmax"}:}{simply corresponds to the global maximum,
i.e., is \code{which.max(f)}}
\item{\code{"firstmax"}:}{gives the location of the first \emph{local}
maximum.}
\item{\code{"Tibs2001SEmax"}:}{uses the criterion, Tibshirani et
al.\ (2001) proposed: \dQuote{the smallest \eqn{k} such that \eqn{f(k)
\ge f(k+1) - s_{k+1}}}. Note that this chooses \eqn{k = 1}
when all standard deviations are larger than the differences
\eqn{f(k+1) - f(k)}.}
\item{\code{"firstSEmax"}:}{location of the first \eqn{f()} value
which is not larger than the first \emph{local} maximum minus
\code{SE.factor * SE.f[]}, i.e., within an \dQuote{f S.E.} range
of that maximum (see also \code{SE.factor}).
This, the default, has been proposed by Martin Maechler in 2012,
when adding \code{clusGap()} to the \pkg{cluster} package, after
having seen the \code{"globalSEmax"} proposal (in code) and read
the \code{"Tibs2001SEmax"} proposal.}
\item{\code{"globalSEmax"}:}{(used in Dudoit and Fridlyand (2002),
supposedly following Tibshirani's proposition):
location of the first \eqn{f()} value which is not larger than
the \emph{global} maximum minus \code{SE.factor * SE.f[]}, i.e.,
within an \dQuote{f S.E.} range of that maximum (see also
\code{SE.factor}).}
}
See the examples for a comparison in a simple case.
}
\item{SE.factor}{[When \code{method} contains \code{"SE"}] Determining
the optimal number of clusters, Tibshirani et al. proposed the
\dQuote{1 S.E.}-rule. Using an \code{SE.factor} \eqn{f}, the
\dQuote{f S.E.}-rule is used, more generally.}
%% plot():
\item{type, xlab, ylab, main}{arguments with the same meaning as in
\code{\link{plot.default}()}, with different default.}
\item{do.arrows}{logical indicating if (1 SE -)\dQuote{error bars}
should be drawn, via \code{\link{arrows}()}.}
\item{arrowArgs}{a list of arguments passed to \code{\link{arrows}()};
the default, notably \code{angle} and \code{code}, provide a style
matching usual error bars.}
}
\details{
The main result \code{<res>$Tab[,"gap"]} of course is from
bootstrapping aka Monte Carlo simulation and hence random, or
equivalently, depending on the initial random seed (see
\code{\link{set.seed}()}).
On the other hand, in our experience, using \code{B = 500} gives
quite precise results such that the gap plot is basically unchanged
after another run.
}
\value{
\code{clusGap(..)} returns an object of S3 class \code{"clusGap"},
basically a list with components
\item{Tab}{a matrix with \code{K.max} rows and 4 columns, named
"logW", "E.logW", "gap", and "SE.sim",
where \code{gap = E.logW - logW}, and \code{SE.sim} corresponds to
the standard error of \code{gap}, \code{SE.sim[k]=}\eqn{s_k}{s[k]},
where \eqn{s_k := \sqrt{1 + 1/B} sd^*(gap_j)}{s[k] := sqrt(1 + 1/B)
sd^*(gap[])}, and \eqn{sd^*()} is the standard deviation of the
simulated (\dQuote{bootstrapped}) gap values.
}
\item{call}{the \code{clusGap(..)} \code{\link{call}}.}
\item{spaceH0}{the \code{spaceH0} argument (\code{\link{match.arg}()}ed).}
\item{n}{number of observations, i.e., \code{nrow(x)}.}
\item{B}{input \code{B}}
\item{FUNcluster}{input function \code{FUNcluster}}
}
\references{
Tibshirani, R., Walther, G. and Hastie, T. (2001).
Estimating the number of data clusters via the Gap statistic.
\emph{Journal of the Royal Statistical Society B}, \bold{63}, 411--423.
Tibshirani, R., Walther, G. and Hastie, T. (2000).
Estimating the number of clusters in a dataset via the Gap statistic.
Technical Report. Stanford.
Per Broberg (2006). SAGx: Statistical Analysis of the GeneChip.
R package version 1.9.7.% moved to Bioconductor sometime after 2006
\url{http://home.swipnet.se/pibroberg/expression_hemsida1.html}
}
\author{
This function is originally based on the functions \code{gap} of
(Bioconductor) package \pkg{SAGx} by Per Broberg,
\code{gapStat()} from former package \pkg{SLmisc} by Matthias Kohl
and ideas from \code{gap()} and its methods of package \pkg{lga} by
Justin Harrington.
The current implementation is by Martin Maechler.
The implementation of \code{spaceH0 = "original"} is based on code
proposed by Juan Gonzalez.
}
\seealso{
\code{\link{silhouette}} for a much simpler less sophisticated
goodness of clustering measure.
\code{\link[fpc]{cluster.stats}()} in package \pkg{fpc} for
alternative measures.
%\code{\link[SGAx]{gap}} in Bioconductor package \pkg{SGAx}.
}
\examples{
### --- maxSE() methods -------------------------------------------
(mets <- eval(formals(maxSE)$method))
fk <- c(2,3,5,4,7,8,5,4)
sk <- c(1,1,2,1,1,3,1,1)/2
## use plot.clusGap():
plot(structure(class="clusGap", list(Tab = cbind(gap=fk, SE.sim=sk))))
## Note that 'firstmax' and 'globalmax' are always at 3 and 6 :
sapply(c(1/4, 1,2,4), function(SEf)
sapply(mets, function(M) maxSE(fk, sk, method = M, SE.factor = SEf)))
### --- clusGap() -------------------------------------------------
## ridiculously nicely separated clusters in 3 D :
x <- rbind(matrix(rnorm(150, sd = 0.1), ncol = 3),
matrix(rnorm(150, mean = 1, sd = 0.1), ncol = 3),
matrix(rnorm(150, mean = 2, sd = 0.1), ncol = 3),
matrix(rnorm(150, mean = 3, sd = 0.1), ncol = 3))
## Slightly faster way to use pam (see below)
pam1 <- function(x,k) list(cluster = pam(x,k, cluster.only=TRUE))
## We do not recommend using hier.clustering here, but if you want,
## there is factoextra::hcut() or a cheap version of it
hclusCut <- function(x, k, d.meth = "euclidean", ...)
list(cluster = cutree(hclust(dist(x, method=d.meth), ...), k=k))
## You can manually set it before running this : doExtras <- TRUE # or FALSE
if(!(exists("doExtras") && is.logical(doExtras)))
doExtras <- cluster:::doExtras()
if(doExtras) {
## Note we use B = 60 in the following examples to keep them "speedy".
## ---- rather keep the default B = 500 for your analysis!
## note we can pass 'nstart = 20' to kmeans() :
gskmn <- clusGap(x, FUN = kmeans, nstart = 20, K.max = 8, B = 60)
gskmn #-> its print() method
plot(gskmn, main = "clusGap(., FUN = kmeans, n.start=20, B= 60)")
set.seed(12); system.time(
gsPam0 <- clusGap(x, FUN = pam, K.max = 8, B = 60)
)
set.seed(12); system.time(
gsPam1 <- clusGap(x, FUN = pam1, K.max = 8, B = 60)
)
## and show that it gives the "same":
not.eq <- c("call", "FUNcluster"); n <- names(gsPam0)
eq <- n[!(n \%in\% not.eq)]
stopifnot(identical(gsPam1[eq], gsPam0[eq]))
print(gsPam1, method="globalSEmax")
print(gsPam1, method="globalmax")
print(gsHc <- clusGap(x, FUN = hclusCut, K.max = 8, B = 60))
}# end {doExtras}
gs.pam.RU <- clusGap(ruspini, FUN = pam1, K.max = 8, B = 60)
gs.pam.RU
plot(gs.pam.RU, main = "Gap statistic for the 'ruspini' data")
mtext("k = 4 is best .. and k = 5 pretty close")
\donttest{## This takes a minute..
## No clustering ==> k = 1 ("one cluster") should be optimal:
Z <- matrix(rnorm(256*3), 256,3)
gsP.Z <- clusGap(Z, FUN = pam1, K.max = 8, B = 200)
plot(gsP.Z, main = "clusGap(<iid_rnorm_p=3>) ==> k = 1 cluster is optimal")
gsP.Z
}%end{dont..}
}
\keyword{cluster}