File: clusGap.Rd

package info (click to toggle)
cluster 2.0.7-1-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 1,496 kB
  • sloc: ansic: 2,981; fortran: 123; sh: 18; makefile: 2
file content (252 lines) | stat: -rw-r--r-- 11,345 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
\name{clusGap}
\title{Gap Statistic for Estimating the Number of Clusters}
\alias{clusGap}
\alias{maxSE}
\alias{print.clusGap}
\alias{plot.clusGap}
\description{
  \code{clusGap()} calculates a goodness of clustering measure, the
  \dQuote{gap} statistic.  For each number of clusters \eqn{k}, it
  compares \eqn{\log(W(k))}{log(W(k))} with
  \eqn{E^*[\log(W(k))]}{E*[log(W(k))]} where the latter is defined via
  bootstrapping, i.e., simulating from a reference (\eqn{H_0})
  distribution, a uniform distribution on the hypercube determined by
  the ranges of \code{x}, after first centering, and then
  \code{\link{svd}} (aka \sQuote{PCA})-rotating them when (as by
  default) \code{spaceH0 = "scaledPCA"}.

  \code{maxSE(f, SE.f)} determines the location of the \bold{maximum}
  of \code{f}, taking a \dQuote{1-SE rule} into account for the
  \code{*SE*} methods.  The default method \code{"firstSEmax"} looks for
  the smallest \eqn{k} such that its value \eqn{f(k)} is not more than 1
  standard error away from the first local maximum.
  This is similar but not the same as \code{"Tibs2001SEmax"}, Tibshirani
  et al's recommendation of determining the number of clusters from the
  gap statistics and their standard deviations.
}
\usage{
clusGap(x, FUNcluster, K.max, B = 100, d.power = 1,
        spaceH0 = c("scaledPCA", "original"),
        verbose = interactive(), \dots)

maxSE(f, SE.f,
      method = c("firstSEmax", "Tibs2001SEmax", "globalSEmax",
                 "firstmax", "globalmax"),
      SE.factor = 1)

\S3method{print}{clusGap}(x, method = "firstSEmax", SE.factor = 1, \dots)

\S3method{plot}{clusGap}(x, type = "b", xlab = "k", ylab = expression(Gap[k]),
     main = NULL, do.arrows = TRUE,
     arrowArgs = list(col="red3", length=1/16, angle=90, code=3), \dots)
}
\arguments{
  \item{x}{numeric matrix or \code{\link{data.frame}}.}
  \item{FUNcluster}{a \code{\link{function}} which accepts as first
    argument a (data) matrix like \code{x}, second argument, say
    \eqn{k, k\geq 2}{k, k >= 2}, the number of clusters desired,
    and returns a \code{\link{list}} with a component named (or shortened to)
    \code{cluster} which is a vector of length \code{n = nrow(x)} of
    integers in \code{1:k} determining the clustering or grouping of the
    \code{n} observations.}
  \item{K.max}{the maximum number of clusters to consider, must be at
    least two.}
  \item{B}{integer, number of Monte Carlo (\dQuote{bootstrap}) samples.}
  \item{d.power}{a positive integer specifying the power \eqn{p} which
    is applied to the euclidean distances (\code{\link{dist}}) before
    they are summed up to give \eqn{W(k)}.  The default, \code{d.power = 1},
    corresponds to the \dQuote{historical} \R implementation, whereas
    \code{d.power = 2} corresponds to what Tibshirani et al had
    proposed.  This was found by Juan Gonzalez, in 2016-02.}%Feb.\sspace{}2016.}
  \item{spaceH0}{a \code{\link{character}} string specifying the
    space of the \eqn{H_0} distribution (of \emph{no} cluster).  Both
     \code{"scaledPCA"} and \code{"original"} use a uniform distribution
     in a hyper cube and had been mentioned in the reference;
     \code{"original"} been added after a proposal (including code) by
     Juan Gonzalez.}
  \item{verbose}{integer or logical, determining if \dQuote{progress}
    output should be printed.  The default prints one bit per bootstrap
    sample.}
  \item{\dots}{(for \code{clusGap()}:) optionally further arguments for
    \code{FUNcluster()}, see \code{kmeans} example below.}
  \item{f}{numeric vector of \sQuote{function values}, of length
    \eqn{K}, whose (\dQuote{1 SE respected}) maximum we want.}
  \item{SE.f}{numeric vector of length \eqn{K} of standard errors of \code{f}.}
  \item{method}{character string indicating how the \dQuote{optimal}
    number of clusters, \eqn{\hat k}{k^}, is computed from the gap
    statistics (and their standard deviations), or more generally how
    the location \eqn{\hat k}{k^} of the maximum of \eqn{f_k}{f[k]}
    should be determined.

    %% -> ../R/clusGap.R
    \describe{
      \item{\code{"globalmax"}:}{simply corresponds to the global maximum,
	i.e., is \code{which.max(f)}}
      \item{\code{"firstmax"}:}{gives the location of the first \emph{local}
	maximum.}
      \item{\code{"Tibs2001SEmax"}:}{uses the criterion, Tibshirani et
	al (2001) proposed: \dQuote{the smallest \eqn{k} such that \eqn{f(k)
	    \ge f(k+1) - s_{k+1}}}.  Note that this chooses \eqn{k = 1}
	when all standard deviations are larger than the differences
	\eqn{f(k+1) - f(k)}.}
      \item{\code{"firstSEmax"}:}{location of the first \eqn{f()} value
	which is not larger than the first \emph{local} maximum minus
	\code{SE.factor * SE.f[]}, i.e, within an \dQuote{f S.E.} range
	of that maximum (see also \code{SE.factor}).

	This, the default, has been proposed by Martin Maechler in 2012,
	when adding \code{clusGap()} to the \pkg{cluster} package, after
	having seen the \code{"globalSEmax"} proposal (in code) and read
	the \code{"Tibs2001SEmax"} proposal.}

      \item{\code{"globalSEmax"}:}{(used in Dudoit and Fridlyand (2002),
	supposedly following Tibshirani's proposition):
	location of the first \eqn{f()} value which is not larger than
	the \emph{global} maximum minus \code{SE.factor * SE.f[]}, i.e,
	within an \dQuote{f S.E.} range of that maximum (see also
	\code{SE.factor}).}
    }
    See the examples for a comparison in a simple case.
  }
  \item{SE.factor}{[When \code{method} contains \code{"SE"}] Determining
    the optimal number of clusters, Tibshirani et al. proposed the
    \dQuote{1 S.E.}-rule.  Using an \code{SE.factor} \eqn{f}, the
    \dQuote{f S.E.}-rule is used, more generally.}
  %% plot():
  \item{type, xlab, ylab, main}{arguments with the same meaning as in
    \code{\link{plot.default}()}, with different default.}
  \item{do.arrows}{logical indicating if (1 SE -)\dQuote{error bars}
    should be drawn, via \code{\link{arrows}()}.}
  \item{arrowArgs}{a list of arguments passed to \code{\link{arrows}()};
    the default, notably \code{angle} and \code{code}, provide a style
    matching usual error bars.}
}
\details{
  The main result \code{<res>$Tab[,"gap"]} of course is from
  bootstrapping aka Monte Carlo simulation and hence random, or
  equivalently, depending on the initial random seed (see
  \code{\link{set.seed}()}).
  On the other hand, in our experience, using \code{B = 500} gives
  quite precise results such that the gap plot is basically unchanged
  after an another run.
}
\value{
  \code{clusGap(..)} returns an object of S3 class \code{"clusGap"},
  basically a list with components
  \item{Tab}{a matrix with \code{K.max} rows and 4 columns, named
    "logW", "E.logW", "gap", and "SE.sim",
    where \code{gap = E.logW - logW}, and \code{SE.sim} corresponds to
    the standard error of \code{gap}, \code{SE.sim[k]=}\eqn{s_k}{s[k]},
    where \eqn{s_k := \sqrt{1 + 1/B} sd^*(gap_j)}{s[k] := sqrt(1 + 1/B)
    sd^*(gap[])}, and \eqn{sd^*()} is the standard deviation of the
    simulated (\dQuote{bootstrapped}) gap values.
  }
  \item{call}{the \code{clusGap(..)} \code{\link{call}}.}
  \item{spaceH0}{the \code{spaceH0} argument (\code{\link{match.arg}()}ed).}
  \item{n}{number of observations, i.e., \code{nrow(x)}.}
  \item{B}{input \code{B}}
  \item{FUNcluster}{input function \code{FUNcluster}}
}
\references{
  Tibshirani, R., Walther, G. and Hastie, T. (2001).
  Estimating the number of data clusters via the Gap statistic.
  \emph{Journal of the Royal Statistical Society B}, \bold{63}, 411--423.

  Tibshirani, R., Walther, G. and Hastie, T. (2000).
  Estimating the number of clusters in a dataset via the Gap statistic.
  Technical Report. Stanford.

  Per Broberg (2006). SAGx: Statistical Analysis of the GeneChip.
  R package version 1.9.7.% moved to Bioconductor sometime after 2006
  \url{http://home.swipnet.se/pibroberg/expression_hemsida1.html}
}
\author{
  This function is originally based on the functions \code{gap} of
  (Bioconductor) package \pkg{SAGx} by Per Broberg,
  \code{gapStat()} from former package \pkg{SLmisc} by Matthias Kohl
  and ideas from \code{gap()} and its methods of package \pkg{lga} by
  Justin Harrington.

  The current implementation is by Martin Maechler.

  The implementation of \code{spaceH0 = "original"} is based on code
  proposed by Juan Gonzalez.
}
\seealso{
  \code{\link{silhouette}} for a much simpler less sophisticated
  goodness of clustering measure.

  \code{\link[fpc]{cluster.stats}()} in package \pkg{fpc} for
  alternative measures.

  %\code{\link[SGAx]{gap}} in Bioconductor package \pkg{SGAx}.
}
\examples{
### --- maxSE() methods -------------------------------------------
(mets <- eval(formals(maxSE)$method))
fk <- c(2,3,5,4,7,8,5,4)
sk <- c(1,1,2,1,1,3,1,1)/2
## use plot.clusGap():
plot(structure(class="clusGap", list(Tab = cbind(gap=fk, SE.sim=sk))))
## Note that 'firstmax' and 'globalmax' are always at 3 and 6 :
sapply(c(1/4, 1,2,4), function(SEf)
        sapply(mets, function(M) maxSE(fk, sk, method = M, SE.factor = SEf)))

### --- clusGap() -------------------------------------------------
## ridiculously nicely separated clusters in 3 D :
x <- rbind(matrix(rnorm(150,           sd = 0.1), ncol = 3),
           matrix(rnorm(150, mean = 1, sd = 0.1), ncol = 3),
           matrix(rnorm(150, mean = 2, sd = 0.1), ncol = 3),
           matrix(rnorm(150, mean = 3, sd = 0.1), ncol = 3))

## Slightly faster way to use pam (see below)
pam1 <- function(x,k) list(cluster = pam(x,k, cluster.only=TRUE))

## We do not recommend using hier.clustering here, but if you want,
## there is  factoextra::hcut () or a cheap version of it
hclusCut <- function(x, k, d.meth = "euclidean", ...)
   list(cluster = cutree(hclust(dist(x, method=d.meth), ...), k=k))

## You can manually set it before running this :    doExtras <- TRUE  # or  FALSE
if(!(exists("doExtras") && is.logical(doExtras)))
  doExtras <- cluster:::doExtras()

if(doExtras) {
  ## Note we use  B = 60 in the following examples to keep them "speedy".
  ## ---- rather keep the default B = 500 for your analysis!

  ## note we can  pass 'nstart = 20' to kmeans() :
  gskmn <- clusGap(x, FUN = kmeans, nstart = 20, K.max = 8, B = 60)
  gskmn #-> its print() method
  plot(gskmn, main = "clusGap(., FUN = kmeans, n.start=20, B= 60)")
  set.seed(12); system.time(
    gsPam0 <- clusGap(x, FUN = pam, K.max = 8, B = 60)
  )
  set.seed(12); system.time(
    gsPam1 <- clusGap(x, FUN = pam1, K.max = 8, B = 60)
  )
  ## and show that it gives the "same":
  not.eq <- c("call", "FUNcluster"); n <- names(gsPam0)
  eq <- n[!(n \%in\% not.eq)]
  stopifnot(identical(gsPam1[eq], gsPam0[eq]))
  print(gsPam1, method="globalSEmax")
  print(gsPam1, method="globalmax")

  print(gsHc <- clusGap(x, FUN = hclusCut, K.max = 8, B = 60))

}# end {doExtras}

gs.pam.RU <- clusGap(ruspini, FUN = pam1, K.max = 8, B = 60)
gs.pam.RU
plot(gs.pam.RU, main = "Gap statistic for the 'ruspini' data")
mtext("k = 4 is best .. and  k = 5  pretty close")

\donttest{## This takes a minute..
## No clustering ==> k = 1 ("one cluster") should be optimal:
Z <- matrix(rnorm(256*3), 256,3)
gsP.Z <- clusGap(Z, FUN = pam1, K.max = 8, B = 200)
plot(gsP.Z, main = "clusGap(<iid_rnorm_p=3>)  ==> k = 1  cluster is optimal")
gsP.Z
}%end{dont..}
}
\keyword{cluster}