File: clusGap.R

package info (click to toggle)
cluster 2.0.7-1-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 1,496 kB
  • sloc: ansic: 2,981; fortran: 123; sh: 18; makefile: 2
file content (174 lines) | stat: -rw-r--r-- 6,825 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#### Originally from orphaned package SLmisc
#### (Version: 1.4.1, 2007-04-12, Maintainer: Matthias Kohl <kohl@sirs-lab.com>)
#### License: GPL (version 2 or later)
####
#### which said
####  "function corresponds to function gap in package SAGx"

## MM: SAGx is now in Bioconductor --- 1.10.1{devel} or 1.11.1{release}
##     had gap() *corrected* to re-cluster using FUNcluster --> see ./gap-SAGx.R.~orig~
##
## MM: Package 'lga' -- has gap() and lga and robust lga [-> UBC]
##    - it uses  boot() nicely  [2012-01: ORPHANED because  Justin Harrington is amiss]
## MM: renamed arguments, and changed almost everything

clusGap <- function (x, FUNcluster, K.max, B = 100, d.power = 1,
		     spaceH0 = c("scaledPCA", "original"),
                     verbose = interactive(), ...)
{
    stopifnot(is.function(FUNcluster), length(dim(x)) == 2, K.max >= 2,
	      (n <- nrow(x)) >= 1, ncol(x) >= 1)
    if(B != (B. <- as.integer(B)) || (B <- B.) <= 0)
        stop("'B' has to be a positive integer")
    cl. <- match.call()

    if(is.data.frame(x))
        x <- as.matrix(x)
    ii <- seq_len(n)
    W.k <- function(X, kk) {
        clus <- if(kk > 1) FUNcluster(X, kk, ...)$cluster else rep.int(1L, nrow(X))
        ##                 ---------- =  =       -------- kmeans() has 'cluster'; pam() 'clustering'
	0.5* sum(vapply(split(ii, clus),
			function(I) { xs <- X[I,, drop=FALSE]
				      sum(dist(xs)^d.power/nrow(xs)) }, 0.))
    }
    logW <- E.logW <- SE.sim <- numeric(K.max)
    if(verbose) cat("Clustering k = 1,2,..., K.max (= ",K.max,"): .. ", sep='')
    for(k in 1:K.max)
        logW[k] <- log(W.k(x, k))
    if(verbose) cat("done\n")

    spaceH0 <- match.arg(spaceH0)
    ## Scale 'x' into hypercube -- later fill with H0-generated data
    xs <- scale(x, center=TRUE, scale=FALSE)
    m.x <- rep(attr(xs,"scaled:center"), each = n) # for back-trafo later
    switch(spaceH0,
	   "scaledPCA" =
	       {
		   ## (These & (xs,m.x) above basically do stats:::prcomp.default()
		   V.sx <- svd(xs, nu=0)$v
		   xs <- xs %*% V.sx # = transformed(x)
	       },
	   "original" = {}, # (do nothing, use 'xs')
	   ## otherwise
	   stop("invalid 'spaceH0':", spaceH0))

    rng.x1 <- apply(xs, 2L, range)
    logWks <- matrix(0, B, K.max)
    if(verbose) cat("Bootstrapping, b = 1,2,..., B (= ", B,
                    ")  [one \".\" per sample]:\n", sep="")
    for (b in 1:B) {
        ## Generate "H0"-data as "parametric bootstrap sample" :
        z1 <- apply(rng.x1, 2,
                    function(M, nn) runif(nn, min=M[1], max=M[2]),
                    nn=n)
	z <- switch(spaceH0,
		    "scaledPCA" = tcrossprod(z1, V.sx), # back transformed
		    "original" = z1
		    ) + m.x
        for(k in 1:K.max) {
            logWks[b,k] <- log(W.k(z, k))
        }
        if(verbose) cat(".", if(b %% 50 == 0) paste(b,"\n"))
    }
    if(verbose && (B %% 50 != 0)) cat("",B,"\n")
    E.logW <- colMeans(logWks)
    SE.sim <- sqrt((1 + 1/B) * apply(logWks, 2, var))
    structure(class = "clusGap",
              list(Tab = cbind(logW, E.logW, gap = E.logW - logW, SE.sim),
                   ## K.max == nrow(T)
                   call = cl., spaceH0=spaceH0,
                   n = n, B = B, FUNcluster=FUNcluster))
}

## lga/R/gap.R   --- has for Tibshirani et al (2001):
        ## ElogWks[k,] <- c(mean(BootOutput), sqrt(var(BootOutput)*(1+1/B)))
        ## GAP[k] <- ElogWks[k,1] - logWks[k]
        ## if (k > 1)
        ##     if(GAP[k-1] >= GAP[k]-ElogWks[k,2] & !doall)
        ##         finished <- TRUE
##  so they effectively only look for the *first* (local) maximum which ..
## MM: <==> diff(GAP) = GAP[k] - GAP[k-1] <= +SE.sim[k]


## criteria.DandF() -- Dudoit and Fridlyand (2002)
## ---------------- looks at the *global* maximum and then to the left..
    ## y <- x$data
    ## crit <- diff(y[which.max(y[,"Gap"]), c("Sks", "Gap")])
    ## nclust <- min(which(y[,"Gap"] > crit))
    ## return(ifelse(nclust == nrow(y), NA, nclust))

maxSE <- function(f, SE.f,
		  method = c("firstSEmax", "Tibs2001SEmax",
		  "globalSEmax", "firstmax", "globalmax"),
		  SE.factor = 1)
{
    method <- match.arg(method)
    stopifnot((K <- length(f)) >= 1, K == length(SE.f), SE.f >= 0, SE.factor >= 0)
    fSE <- SE.factor * SE.f
    switch(method,
	   "firstmax" = { ## the first local maximum  (== firstSEmax with SE.factor == 0)
	       decr <- diff(f) <= 0 # length K-1
	       if(any(decr)) which.max(decr) else K # the first TRUE, or K
	   },
	   "globalmax" = {
	       which.max(f)
	   },
	   "Tibs2001SEmax" = { ## The one Tibshirani et al (2001) proposed:
	       ## "the smallest k such that f(k) >= f(k+1) - s_{k+1}"
	       g.s <- f - fSE
	       if(any(mp <- f[-K] >= g.s[-1])) which.max(mp) else K
	   },
	   "firstSEmax" = { ## M.Maechler(2012): rather ..
	       ## look at the first *local* maximum and then to the left ..:
	       decr <- diff(f) <= 0 # length K-1
	       nc <- if(any(decr)) which.max(decr) else K # the first TRUE, or K
	       if(any(mp <- f[seq_len(nc - 1)] >= f[nc] - fSE[nc]))
		   which(mp)[1]
	       else nc
	   },
	   "globalSEmax" = { ## Dudoit and Fridlyand (2002) *thought* Tibshirani proposed..
	       ## in 'lga', see criteria.DandF():
	       ## looks at the *global* maximum and then to the left..
	       nc <- which.max(f)
	       if(any(mp <- f[seq_len(nc - 1)] >= f[nc] - fSE[nc]))
		   which(mp)[1]
	       else nc
	   })
}

print.clusGap <- function(x, method="firstSEmax", SE.factor = 1, ...)
{
    method <- match.arg(method, choices = eval(formals(maxSE)$method))
    stopifnot((K <- nrow(T <- x$Tab)) >= 1, SE.factor >= 0)
    cat("Clustering Gap statistic [\"clusGap\"] from call:\n", deparse(x$call),
        sprintf("\nB=%d simulated reference sets, k = 1..%d; spaceH0=\"%s\"\n",
                x$B, K, x$spaceH0), sep="")
    nc <- maxSE(f = T[,"gap"], SE.f = T[,"SE.sim"],
                method=method, SE.factor=SE.factor)
    cat(sprintf(" --> Number of clusters (method '%s'%s): %d\n",
		method, if(grepl("SE", method))
		sprintf(", SE.factor=%g",SE.factor) else "", nc))
    print(T, ...)
    invisible(x)
}

plot.clusGap <- function(x, type="b", xlab = "k", ylab = expression(Gap[k]),
                         main = NULL,
                         do.arrows = TRUE,
                         arrowArgs = list(col="red3", length=1/16, angle=90, code=3),
                         ...)
{
    stopifnot(is.matrix(Tab <- x$Tab), is.numeric(Tab))
    K <- nrow(Tab)
    k <- seq_len(K) # == 1,2,... k
    if(is.null(main))
	main <- paste(strwrap(deparse(x$call, 150)[1], width = 60, exdent = 7),
		      collapse="\n")
    gap <- Tab[, "gap"]
    plot(k, gap, type=type, xlab=xlab, ylab=ylab, main=main, ...)
    if(do.arrows)
	do.call(arrows,
		c(list(k, gap+ Tab[, "SE.sim"], k, gap- Tab[, "SE.sim"]), arrowArgs))
    invisible()
}