File: find_k.Rd

package info (click to toggle)
r-cran-dendextend 1.14.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 2,888 kB
  • sloc: sh: 13; makefile: 2
file content (69 lines) | stat: -rw-r--r-- 2,302 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/find_k.R
\name{find_k}
\alias{find_k}
\alias{plot.find_k}
\title{Find the (estimated) number of clusters for a dendrogram using average silhouette width}
\usage{
find_k(dend, krange = 2:min(10, (nleaves(dend) - 1)), ...)

\method{plot}{find_k}(
  x,
  xlab = "Number of clusters (k)",
  ylab = "Average silhouette width",
  main = "Estimating the number of clusters using\\n average silhouette width",
  ...
)
}
\arguments{
\item{dend}{A dendrogram (or hclust) tree object}

\item{krange}{integer vector. Numbers of clusters which are to be compared
by the average silhouette width criterion.
Note: average silhouette width and Calinski-Harabasz can't estimate number
of clusters nc=1. If 1 is included, a Duda-Hart test is applied and 1 is
estimated if this is not significant.}

\item{...}{passed to \link[fpc]{pamk} (the current defaults criterion="asw" and usepam=TRUE cannot be changed).}

\item{x}{An object of class "find_k" (has its own S3 plot method).}

\item{xlab, ylab, main}{parameters passed to plot.}
}
\value{
A \link[fpc]{pamk} output. This is a list with the following components:
\enumerate{
\item \code{pamobject} - the output of the optimal run of the pam-function.
\item \code{nc} - the optimal number of clusters.
\item \code{crit} - vector of criterion values for numbers of clusters. \code{crit[1]} is the p-value of the Duda-Hart test if 1 is in \code{krange} and \code{diss=FALSE}.
\item \code{k} - a copy of \code{nc} (just to make it easier to extract - since k is often used in other functions).
}
}
\description{
This function estimates the number of clusters based on the maximal average \link[cluster]{silhouette} width
derived from running \link[cluster]{pam} on the \link[stats]{cophenetic} distance matrix of
the \link[stats]{dendrogram}. The output is based on the \link[fpc]{pamk} output.
}
\examples{

dend <- iris[, -5] \%>\%
  dist() \%>\%
  hclust() \%>\%
  as.dendrogram()
dend_k <- find_k(dend)
plot(dend_k)
plot(color_branches(dend, k = dend_k$nc))

library(cluster)
sil <- silhouette(dend_k$pamobject)
plot(sil)

dend <- USArrests \%>\%
  dist() \%>\%
  hclust(method = "ave") \%>\%
  as.dendrogram()
dend_k <- find_k(dend)
plot(dend_k)
plot(color_branches(dend, k = dend_k$nc))
}
\seealso{
\link[fpc]{pamk}, \link[cluster]{pam}, \link[cluster]{silhouette}.
}