File: cluster_analysis.Rd

package info (click to toggle)
r-cran-parameters 0.24.2-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,852 kB
  • sloc: sh: 16; makefile: 2
file content (165 lines) | stat: -rw-r--r-- 6,737 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cluster_analysis.R
\name{cluster_analysis}
\alias{cluster_analysis}
\title{Cluster Analysis}
\usage{
cluster_analysis(
  x,
  n = NULL,
  method = "kmeans",
  include_factors = FALSE,
  standardize = TRUE,
  verbose = TRUE,
  distance_method = "euclidean",
  hclust_method = "complete",
  kmeans_method = "Hartigan-Wong",
  dbscan_eps = 15,
  iterations = 100,
  ...
)
}
\arguments{
\item{x}{A data frame (with at least two variables), or a matrix (with at
least two columns).}

\item{n}{Number of clusters used for supervised cluster methods. If \code{NULL},
the number of clusters to extract is determined by calling \code{\link[=n_clusters]{n_clusters()}}.
Note that this argument does not apply for unsupervised clustering methods
like \code{"dbscan"}, \code{"hdbscan"}, \code{"mixture"}, \code{"pvclust"}, or \code{"pamk"}.}

\item{method}{Method for computing the cluster analysis. Can be \code{"kmeans"}
(default; k-means using \code{kmeans()}), \code{"hkmeans"} (hierarchical k-means
using \code{factoextra::hkmeans()}), \code{"pam"} (K-Medoids using \code{cluster::pam()}),
\code{"pamk"} (K-Medoids that finds out the number of clusters), \code{"hclust"}
(hierarchical clustering using \code{hclust()} or \code{pvclust::pvclust()}),
\code{"dbscan"} (DBSCAN using \code{dbscan::dbscan()}), \code{"hdbscan"} (Hierarchical DBSCAN
using \code{dbscan::hdbscan()}), or \code{"mixture"} (Mixture modeling using
\code{mclust::Mclust()}, which requires the user to run \code{library(mclust)}
before).}

\item{include_factors}{Logical, if \code{TRUE}, factors are converted to numerical
values in order to be included in the data for determining the number of
clusters. By default, factors are removed, because most methods that
determine the number of clusters need numeric input only.}

\item{standardize}{Standardize the data frame before clustering (default).}

\item{verbose}{Toggle warnings and messages.}

\item{distance_method}{Distance measure to be used for methods based on
distances (e.g., when \code{method = "hclust"} for hierarchical clustering).
For other methods, such as \code{"kmeans"}, this argument will be ignored.
Must be one of \code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"}, \code{"binary"} or \code{"minkowski"}. See
\code{\link[=dist]{dist()}} and \code{pvclust::pvclust()} for more
information.}

\item{hclust_method}{Agglomeration method to be used when \code{method = "hclust"}
or \code{method = "hkmeans"} (for hierarchical clustering). This should be one
of \code{"ward"}, \code{"ward.D2"}, \code{"single"}, \code{"complete"}, \code{"average"},
\code{"mcquitty"}, \code{"median"} or \code{"centroid"}. Default is \code{"complete"} (see
\code{\link[=hclust]{hclust()}}).}

\item{kmeans_method}{Algorithm used for calculating kmeans cluster. Only applies,
if \code{method = "kmeans"}. May be one of \code{"Hartigan-Wong"} (default),
\code{"Lloyd"} (used by SPSS), or \code{"MacQueen"}. See \code{\link[=kmeans]{kmeans()}} for details on
this argument.}

\item{dbscan_eps}{The \code{eps} argument for DBSCAN method. See \code{\link[=n_clusters_dbscan]{n_clusters_dbscan()}}.}

\item{iterations}{The number of replications.}

\item{...}{Arguments passed to or from other methods.}
}
\value{
The group classification for each observation, as a vector. The
returned vector includes missing values, so it has the same length
as \code{nrow(x)}.
}
\description{
Compute hierarchical or k-means cluster analysis and return the group
assignment for each observation as a vector.
}
\details{
The \code{print()} and \code{plot()} methods show the (standardized) mean value for
each variable within each cluster. Thus, a higher absolute value indicates
that a certain variable characteristic is more pronounced within that
specific cluster (as compared to other cluster groups with lower absolute
mean values).

Cluster classification can be obtained via \code{predict(x, newdata = NULL, ...)}.
}
\note{
There is also a \href{https://easystats.github.io/see/articles/parameters.html}{\code{plot()}-method}
implemented in the \href{https://easystats.github.io/see/}{\strong{see}-package}.
}
\examples{
set.seed(33)
# K-Means ====================================================
rez <- cluster_analysis(iris[1:4], n = 3, method = "kmeans")
rez # Show results
predict(rez) # Get clusters
summary(rez) # Extract the centers values (can use 'plot()' on that)
if (requireNamespace("MASS", quietly = TRUE)) {
  cluster_discrimination(rez) # Perform LDA
}

# Hierarchical k-means (more robust k-means)
if (require("factoextra", quietly = TRUE)) {
  rez <- cluster_analysis(iris[1:4], n = 3, method = "hkmeans")
  rez # Show results
  predict(rez) # Get clusters
}

# Hierarchical Clustering (hclust) ===========================
rez <- cluster_analysis(iris[1:4], n = 3, method = "hclust")
rez # Show results
predict(rez) # Get clusters

# K-Medoids (pam) ============================================
if (require("cluster", quietly = TRUE)) {
  rez <- cluster_analysis(iris[1:4], n = 3, method = "pam")
  rez # Show results
  predict(rez) # Get clusters
}

# PAM with automated number of clusters
if (require("fpc", quietly = TRUE)) {
  rez <- cluster_analysis(iris[1:4], method = "pamk")
  rez # Show results
  predict(rez) # Get clusters
}

# DBSCAN ====================================================
if (require("dbscan", quietly = TRUE)) {
  # Note that you can assimilate more outliers (cluster 0) to neighbouring
  # clusters by setting borderPoints = TRUE.
  rez <- cluster_analysis(iris[1:4], method = "dbscan", dbscan_eps = 1.45)
  rez # Show results
  predict(rez) # Get clusters
}

# Mixture ====================================================
if (require("mclust", quietly = TRUE)) {
  library(mclust) # Needs the package to be loaded
  rez <- cluster_analysis(iris[1:4], method = "mixture")
  rez # Show results
  predict(rez) # Get clusters
}
}
\references{
\itemize{
\item Maechler M, Rousseeuw P, Struyf A, Hubert M, Hornik K (2014) cluster: Cluster
Analysis Basics and Extensions. R package.
}
}
\seealso{
\itemize{
\item \code{\link[=n_clusters]{n_clusters()}} to determine the number of clusters to extract.
\item \code{\link[=cluster_discrimination]{cluster_discrimination()}} to determine the accuracy of cluster group
classification via linear discriminant analysis (LDA).
\item \code{\link[performance:check_clusterstructure]{performance::check_clusterstructure()}} to check suitability of data
for clustering.
\item \url{https://www.datanovia.com/en/lessons/}
}
}