File: FM_index_permutation.Rd

package info (click to toggle)
r-cran-dendextend 1.14.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 2,888 kB
  • sloc: sh: 13; makefile: 2
file content (97 lines) | stat: -rw-r--r-- 2,897 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/bk_method.R
\name{FM_index_permutation}
\alias{FM_index_permutation}
\title{Calculating Fowlkes-Mallows Index under H0}
\usage{
FM_index_permutation(
  A1_clusters,
  A2_clusters,
  warn = dendextend_options("warn"),
  ...
)
}
\arguments{
\item{A1_clusters}{a numeric vector of cluster grouping (numeric) of items,
with a name attribute of item name for each element from group A1.
These are often obtained by using some k cut on a dendrogram.}

\item{A2_clusters}{a numeric vector of cluster grouping (numeric) of items,
with a name attribute of item name for each element from group A2.
These are often obtained by using some k cut on a dendrogram.}

\item{warn}{logical (default from dendextend_options("warn") is FALSE).
Set whether warnings are to be issued. It is safer to keep this at TRUE,
but to keep the noise down, the default is FALSE.}

\item{...}{Ignored}
}
\value{
The Fowlkes-Mallows index between two vectors of clustering groups,
under H0 (a double without attributes).
}
\description{
Calculating Fowlkes-Mallows index under the null hypothesis of no relation
between the clusterings (random order of the item labels).
}
\examples{

\dontrun{

# Example: compare the permutation distribution of the Fowlkes-Mallows index
# under H0 with the normal curve built from the E_FM and V_FM attributes
# returned by FM_index.

# Seed the RNG (only matters if the sampling alternative for ss is enabled).
set.seed(23235)
# TRUE selects every row of iris; the commented alternative uses 10 sampled rows.
ss <- TRUE # sample(1:150, 10 )
# Cluster the numeric columns of iris (column 5 is dropped).
# "com" is an abbreviation of the "complete" method accepted by hclust.
hc1 <- hclust(dist(iris[ss, -5]), "com")
# NOTE: hc2 is not used again in this example.
hc2 <- hclust(dist(iris[ss, -5]), "single")
# dend1 <- as.dendrogram(hc1)
# dend2 <- as.dendrogram(hc2)
#    cutree(dend1)

# Three alternative choices of k follow. Each assignment overwrites the
# previous one, so only the last (k = 25) is in effect for the rest of the
# example; comment lines in or out to try the others.
# small k
A1_clusters <- cutree(hc1, k = 3) # will give a right tailed distribution
# large k
A1_clusters <- cutree(hc1, k = 50) # will give a discrete distribution
# "medium" k
A1_clusters <- cutree(hc1, k = 25) # gives almost the normal distribution!
# Both clusterings are identical; the permutation inside
# FM_index_permutation is what supplies the H0 behaviour.
A2_clusters <- A1_clusters

# Number of permutation replicates (also reused below as the number of
# evaluation points for curve).
R <- 10000
# Re-seed so the permutation results are reproducible.
set.seed(414130)
FM_index_H0 <- replicate(R, FM_index_permutation(A1_clusters, A2_clusters)) # can take 10 sec
# Kernel density of the permutation values, with their mean as a dashed
# vertical line in colour 1.
plot(density(FM_index_H0), main = "FM Index distribution under H0\n (10000 permutation)")
abline(v = mean(FM_index_H0), col = 1, lty = 2)
# The permutation distribution has a heavy right tail:
# (skew and kurtosi are presumably provided by the psych package attached here)
library(psych)
skew(FM_index_H0) # 1.254
kurtosi(FM_index_H0) # 2.5427

# Empirical mean and variance of the permutation values, for comparison with
# the E_FM and V_FM attributes of the FM_index result printed below.
mean(FM_index_H0)
var(FM_index_H0)
the_FM_index <- FM_index(A1_clusters, A2_clusters)
the_FM_index
# Normal density with mean E_FM and standard deviation sqrt(V_FM), both read
# from the attributes of the FM_index result.
our_dnorm <- function(x) {
  dnorm(x,
    mean = attr(the_FM_index, "E_FM"),
    sd = sqrt(attr(the_FM_index, "V_FM"))
  )
}
# our_dnorm(0.35)
# Overlay the normal curve (colour 4) on the existing density plot.
curve(our_dnorm,
  col = 4,
  from = -1, to = 1, n = R, add = TRUE
)
# Dashed vertical line at E_FM, in the same colour as the normal curve.
abline(v = attr(the_FM_index, "E_FM"), col = 4, lty = 2)

# Colour 4 marks the asymptotic (normal) curve, colour 1 the permutation density.
legend("topright", legend = c("asymptotic", "permutation"), fill = c(4, 1))
}
}
\references{
Fowlkes, E. B.; Mallows, C. L. (1 September 1983).
"A Method for Comparing Two Hierarchical Clusterings".
Journal of the American Statistical Association 78 (383): 553.

\url{https://en.wikipedia.org/wiki/Fowlkes-Mallows_index}
}
\seealso{
\code{\link{cor_bakers_gamma}},
\code{\link{FM_index_R}}, \code{\link{FM_index}}
}