1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
|
#' Parameters from Cluster Models (k-means, ...)
#'
#' Format cluster models obtained for example by [kmeans()].
#'
#' @param model Cluster model.
#' @param data A data frame.
#' @param clusters A vector with clusters assignments (must be same length as
#' rows in data).
#' @param ... Arguments passed to or from other methods.
#'
#' @examplesIf require("factoextra", quietly = TRUE) && require("dbscan", quietly = TRUE) && require("cluster", quietly = TRUE) && require("fpc", quietly = TRUE)
#' \donttest{
#' #
#' # K-means -------------------------------
#' model <- kmeans(iris[1:4], centers = 3)
#' rez <- model_parameters(model)
#' rez
#'
#' # Get clusters
#' predict(rez)
#'
#' # Clusters centers in long form
#' attributes(rez)$means
#'
#' # Between and Total Sum of Squares
#' attributes(rez)$Sum_Squares_Total
#' attributes(rez)$Sum_Squares_Between
#'
#' #
#' # Hierarchical clustering (hclust) ---------------------------
#' data <- iris[1:4]
#' model <- hclust(dist(data))
#' clusters <- cutree(model, 3)
#'
#' rez <- model_parameters(model, data, clusters)
#' rez
#'
#' # Get clusters
#' predict(rez)
#'
#' # Clusters centers in long form
#' attributes(rez)$means
#'
#' # Between and Total Sum of Squares
#' attributes(rez)$Total_Sum_Squares
#' attributes(rez)$Between_Sum_Squares
#'
#' #
#' # Hierarchical K-means (factoextra::hkclust) ----------------------
#' data <- iris[1:4]
#' model <- factoextra::hkmeans(data, k = 3)
#'
#' rez <- model_parameters(model)
#' rez
#'
#' # Get clusters
#' predict(rez)
#'
#' # Clusters centers in long form
#' attributes(rez)$means
#'
#' # Between and Total Sum of Squares
#' attributes(rez)$Sum_Squares_Total
#' attributes(rez)$Sum_Squares_Between
#'
#' # K-Medoids (PAM and HPAM) ==============
#' model <- cluster::pam(iris[1:4], k = 3)
#' model_parameters(model)
#'
#' model <- fpc::pamk(iris[1:4], criterion = "ch")
#' model_parameters(model)
#'
#' # DBSCAN ---------------------------
#' model <- dbscan::dbscan(iris[1:4], eps = 1.45, minPts = 10)
#'
#' rez <- model_parameters(model, iris[1:4])
#' rez
#'
#' # Get clusters
#' predict(rez)
#'
#' # Clusters centers in long form
#' attributes(rez)$means
#'
#' # Between and Total Sum of Squares
#' attributes(rez)$Sum_Squares_Total
#' attributes(rez)$Sum_Squares_Between
#'
#' # HDBSCAN
#' model <- dbscan::hdbscan(iris[1:4], minPts = 10)
#' model_parameters(model, iris[1:4])
#' }
#' @export
model_parameters.hclust <- function(model, data = NULL, clusters = NULL, ...) {
if (is.null(data)) {
insight::format_error(
"This function requires the data used to compute the clustering to be provided via `data` as it is not accessible from the clustering object itself."
)
}
if (is.null(clusters)) {
insight::format_error(
"This function requires a vector of clusters assignments of same length as data to be passed, as it is not contained in the clustering object itself."
)
}
params <- cluster_centers(data, clusters, ...)
# Long means
means <- datawizard::reshape_longer(params,
select = 4:ncol(params),
values_to = "Mean",
names_to = "Variable"
)
attr(params, "variance") <- attributes(params)$variance
attr(params, "Sum_Squares_Between") <- attributes(params)$Sum_Squares_Between
attr(params, "Sum_Squares_Total") <- attributes(params)$Sum_Squares_Total
attr(params, "means") <- means
attr(params, "model") <- model
attr(params, "scores") <- clusters
attr(params, "type") <- "hclust"
class(params) <- c("parameters_clusters", class(params))
params
}
#' @export
model_parameters.pvclust <- function(model, data = NULL, clusters = NULL, ci = 0.95, ...) {
if (is.null(data)) {
insight::format_error(
"This function requires the data used to compute the clustering to be provided via `data` as it is not accessible from the clustering object itself."
)
}
if (is.null(clusters)) {
clusters <- .model_parameters_pvclust_clusters(model, data, ci)$Cluster
}
params <- .cluster_centers_params(data, clusters, ...)
attr(params, "model") <- model
attr(params, "type") <- "pvclust"
attr(params, "title") <- "Bootstrapped Hierarchical Clustering (PVCLUST)"
params
}
# Utils -------------------------------------------------------------------
#' @keywords internal
.model_parameters_pvclust_clusters <- function(model, data, ci = 0.95) {
insight::check_if_installed("pvclust")
rez <- pvclust::pvpick(model, alpha = ci, pv = "si")
# Assign clusters
out <- data.frame()
for (cluster in seq_along(rez$clusters)) {
out <- rbind(out, data.frame(Cluster = cluster, Row = rez$clusters[[cluster]], stringsAsFactors = FALSE), make.row.names = FALSE, stringsAsFactors = FALSE)
}
# Add points not in significant clusters
remaining_rows <- row.names(data)[!row.names(data) %in% out$Row]
if (length(remaining_rows) > 0) {
out <- rbind(out, data.frame(Cluster = 0, Row = remaining_rows, stringsAsFactors = FALSE), make.row.names = FALSE, stringsAsFactors = FALSE)
}
# Reorder according to original order of rows
out <- out[order(match(out$Row, row.names(data))), ]
row.names(out) <- NULL
out
}
|