1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
|
#######################################################################
# dbscan - Density Based Clustering of Applications with Noise
# and Related Algorithms
# Copyright (C) 2015 Michael Hahsler, Matt Piekenbrock
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#' Framework for the Optimal Extraction of Clusters from Hierarchies
#'
#' Generic reimplementation of the _Framework for Optimal Selection of Clusters_
#' (FOSC; Campello et al, 2013) to extract clusterings from hierarchical clustering (i.e.,
#' [hclust] objects).
#' Can be parameterized to perform unsupervised
#' cluster extraction through a stability-based measure, or semisupervised
#' cluster extraction through either a constraint-based extraction (with a
#' stability-based tiebreaker) or a mixed (weighted) constraint and
#' stability-based objective extraction.
#'
#' Campello et al (2013) suggested a _Framework for Optimal Selection of
#' Clusters_ (FOSC) as a framework to make local (non-horizontal) cuts to any
#' cluster tree hierarchy. This function implements the original extraction
#' algorithms as described by the framework for hclust objects. Traditional
#' cluster extraction methods from hierarchical representations (such as
#' [hclust] objects) generally rely on global parameters or cutting values
#' which are used to partition a cluster hierarchy into a set of disjoint, flat
#' clusters. This is implemented in R in function [cutree()].
#' Although such methods are widespread, using global parameter
#' settings are inherently limited in that they cannot capture patterns within
#' the cluster hierarchy at varying _local_ levels of granularity.
#'
#' Rather than partitioning a hierarchy based on the number of the cluster one
#' expects to find (\eqn{k}) or based on some linkage distance threshold
#' (\eqn{H}), the FOSC proposes that the optimal clusters may exist at varying
#' distance thresholds in the hierarchy. To enable this idea, FOSC requires one
#' parameter (minPts) that represents _the minimum number of points that
#' constitute a valid cluster._ The first step of the FOSC algorithm is to
#' traverse the given cluster hierarchy divisively, recording new clusters at
#' each split if both branches represent more than or equal to minPts. Branches
#' that contain less than minPts points at one or both branches inherit the
#' parent clusters identity. Note that using FOSC, due to the constraint that
#' minPts must be greater than or equal to 2, it is possible that the optimal
#' cluster solution chosen makes local cuts that render parent branches of
#' sizes less than minPts as noise, which are denoted as 0 in the final
#' solution.
#'
#' Traversing the original cluster tree using minPts creates a new, simplified
#' cluster tree that is then post-processed recursively to extract clusters
#' that maximize for each cluster \eqn{C_i}{Ci} the cost function
#'
#' \deqn{\max_{\delta_2, \dots, \delta_k} J = \sum\limits_{i=2}^{k} \delta_i
#' S(C_i)}{ J = \sum \delta S(Ci) for all i clusters, } where
#' \eqn{S(C_i)}{S(Ci)} is the stability-based measure as \deqn{ S(C_i) =
#' \sum_{x_j \in C_i}(\frac{1}{h_{min} (x_j, C_i)} - \frac{1}{h_{max} (C_i)})
#' }{ S(Ci) = \sum (1/Hmin(Xj, Ci) - 1/Hmax(Ci)) for all Xj in Ci.}
#'
#' \eqn{\delta_i}{\delta} represents an indicator function, which constrains
#' the solution space such that clusters must be disjoint (cannot assign more
#' than 1 label to each cluster). The measure \eqn{S(C_i)}{S(Ci)} used by FOSC
#' is an unsupervised validation measure based on the assumption that, if you
#' vary the linkage/distance threshold across all possible values, more
#' prominent clusters that survive over many threshold variations should be
#' considered as stronger candidates of the optimal solution. For this reason,
#' using this measure to detect clusters is referred to as an unsupervised,
#' _stability-based_ extraction approach. In some cases it may be useful
#' to enact _instance-level_ constraints that ensure the solution space
#' conforms to linkage expectations known _a priori_. This general idea of
#' using preliminary expectations to augment the clustering solution will be
#' referred to as _semisupervised clustering_. If constraints are given in
#' the call to `extractFOSC()`, the following alternative objective function
#' is maximized:
#'
#' \deqn{J = \frac{1}{2n_c}\sum\limits_{j=1}^n \gamma (x_j)}{J = 1/(2 * nc)
#' \sum \gamma(Xj)}
#'
#' \eqn{n_c}{nc} is the total number of constraints given and
#' \eqn{\gamma(x_j)}{\gamma(Xj)} represents the number of constraints involving
#' object \eqn{x_j}{Xj} that are satisfied. In the case of ties (such as
#' solutions where no constraints were given), the unsupervised solution is
#' used as a tiebreaker. See Campello et al (2013) for more details.
#'
#' As a third option, if one wishes to prioritize the degree at which the
#' unsupervised and semisupervised solutions contribute to the overall optimal
#' solution, the parameter \eqn{\alpha} can be set to enable the extraction of
#' clusters that maximize the `mixed` objective function
#'
#' \deqn{J = \alpha S(C_i) + (1 - \alpha) \gamma(C_i))}{J = \alpha S(Ci) + (1 -
#' \alpha) \gamma(Ci).}
#'
#' FOSC expects the pairwise constraints to be passed as either 1) an
#' \eqn{n(n-1)/2} vector of integers representing the constraints, where 1
#' represents should-link, -1 represents should-not-link, and 0 represents no
#' preference using the unsupervised solution (see below for examples).
#' Alternatively, if only a few constraints are needed, a named list
#' representing the (symmetric) adjacency list can be used, where the names
#' correspond to indices of the points in the original data, and the values
#' correspond to integer vectors of constraints (positive indices for
#' should-link, negative indices for should-not-link). Again, see the examples
#' section for a demonstration of this.
#'
#' The parameters to the input function correspond to the concepts discussed
#' above. The `minPts` parameter to represent the minimum cluster size to
#' extract. The optional `constraints` parameter contains the pairwise,
#' instance-level constraints of the data. The optional `alpha` parameters
#' controls whether the mixed objective function is used (if `alpha` is
#' greater than 0). If the `validate_constraints` parameter is set to
#' true, the constraints are checked (and fixed) for symmetry (if point A has a
#' should-link constraint with point B, point B should also have the same
#' constraint). Asymmetric constraints are not supported.
#'
#' Unstable branch pruning was not discussed by Campello et al (2013), however
#' in some data sets it may be the case that specific subbranches scores are
#' significantly greater than sibling and parent branches, and thus sibling
#' branches should be considered as noise if their scores are cumulatively
#' lower than the parents. This can happen in extremely nonhomogeneous data
#' sets, where there exists locally very stable branches surrounded by unstable
#' branches that contain more than `minPts` points.
#' `prune_unstable = TRUE` will remove the unstable branches.
#'
#' @family clustering functions
#'
#' @param x a valid [hclust] object created via [hclust()] or [hdbscan()].
#' @param constraints Either a list or matrix of pairwise constraints. If
#' missing, an unsupervised measure of stability is used to make local cuts and
#' extract the optimal clusters. See details.
#' @param alpha numeric; weight between \eqn{[0, 1]} for mixed-objective
#' semi-supervised extraction. Defaults to 0.
#' @param minPts numeric; Defaults to 2. Only needed if class-less noise is a
#' valid label in the model.
#' @param prune_unstable logical; should significantly unstable subtrees be
#' pruned? The default is `FALSE` for the original optimal extraction
#' framework (see Campello et al, 2013). See details for what `TRUE`
#' implies.
#' @param validate_constraints logical; should constraints be checked for
#' validity? See details for what are considered valid constraints.
#'
#' @returns A list with the elements:
#'
#' \item{cluster }{A integer vector with cluster assignments. Zero
#' indicates noise points (if any).}
#' \item{hc }{The original [hclust] object with additional list elements
#' `"stability"`, `"constraint"`, and `"total"`
#' for the \eqn{n - 1} cluster-wide objective scores from the extraction.}
#'
#' @author Matt Piekenbrock
#' @seealso [hclust()], [hdbscan()], [stats::cutree()]
#' @references Campello, Ricardo JGB, Davoud Moulavi, Arthur Zimek, and Joerg
#' Sander (2013). A framework for semi-supervised and unsupervised optimal
#' extraction of clusters from hierarchies. _Data Mining and Knowledge
#' Discovery_ 27(3): 344-371.
#' \doi{10.1007/s10618-013-0311-4}
#' @keywords model clustering
#' @examples
#' data("moons")
#'
#' ## Regular HDBSCAN using stability-based extraction (unsupervised)
#' cl <- hdbscan(moons, minPts = 5)
#' cl$cluster
#'
#' ## Constraint-based extraction from the HDBSCAN hierarchy
#' ## (w/ stability-based tiebreaker (semisupervised))
#' cl_con <- extractFOSC(cl$hc, minPts = 5,
#' constraints = list("12" = c(49, -47)))
#' cl_con$cluster
#'
#' ## Alternative formulation: Constraint-based extraction from the HDBSCAN hierarchy
#' ## (w/ stability-based tiebreaker (semisupervised)) using distance thresholds
#' dist_moons <- dist(moons)
#' cl_con2 <- extractFOSC(cl$hc, minPts = 5,
#' constraints = ifelse(dist_moons < 0.1, 1L,
#' ifelse(dist_moons > 1, -1L, 0L)))
#'
#' cl_con2$cluster # same as the second example
#' @export
extractFOSC <-
function(x,
constraints,
alpha = 0,
minPts = 2L,
prune_unstable = FALSE,
validate_constraints = FALSE) {
if (!inherits(x, "hclust"))
stop("extractFOSC expects 'x' to be a valid hclust object.")
# if constraints are given then they need to be a list, a matrix or a vector
if (!(
missing(constraints) ||
is.list(constraints) ||
is.matrix(constraints) ||
is.numeric(constraints)
))
stop("extractFOSC expects constraints to be either an adjacency list or adjacency matrix.")
if (!minPts >= 2)
stop("minPts must be at least 2.")
if (alpha < 0 ||
alpha > 1)
stop("alpha can only takes values between [0, 1].")
n <- nrow(x$merge) + 1L
## First step for both unsupervised and semisupervised - compute stability scores
cl_tree <- computeStability(x, minPts)
## Unsupervised Extraction
if (missing(constraints)) {
cl_tree <- extractUnsupervised(cl_tree, prune_unstable)
}
## Semi-supervised Extraction
else {
## If given as adjacency-list form
if (is.list(constraints)) {
## Checks for proper indexing, symmetry of constraints, etc.
if (validate_constraints) {
is_valid <- max(as.integer(names(constraints))) < n
is_valid <- is_valid &&
all(vapply(constraints, function(ilc) all(ilc <= n), logical(1L)))
if (!is_valid) {
stop("Detected constraint indices not in the interval [1, n]")
}
constraints <- validateConstraintList(constraints, n)
}
cl_tree <-
extractSemiSupervised(cl_tree, constraints, alpha, prune_unstable)
}
## Adjacency matrix given (probably from dist object), retrieve adjacency list form
else if (is.vector(constraints)) {
if (!all(constraints %in% c(-1, 0, 1))) {
stop(
"'extractFOSC' only accepts instance-level constraints. See ?extractFOSC for more details."
)
}
## Checks for proper integer labels, symmetry of constraints, length of vector, etc.
if (validate_constraints) {
is_valid <- length(constraints) == choose(n, 2)
constraints_list <-
validateConstraintList(distToAdjacency(constraints, n), n)
} else {
constraints_list <- distToAdjacency(constraints, n)
}
cl_tree <-
extractSemiSupervised(cl_tree, constraints_list, alpha, prune_unstable)
}
## Full nxn adjacency-matrix given, give warning and retrieve adjacency list form
else if (is.matrix(constraints)) {
if (!all(constraints %in% c(-1, 0, 1))) {
stop(
"'extractFOSC' only accepts instance-level constraints. See ?extractFOSC for more details."
)
}
if (!all(dim(constraints) == c(n, n))) {
stop("Given matrix is not square.")
}
warning(
"Full nxn matrix given; extractFOCS does not support asymmetric relational constraints. Using lower triangular."
)
constraints <- constraints[lower.tri(constraints)]
## Checks for proper integer labels, symmetry of constraints, length of vector, etc.
if (validate_constraints) {
is_valid <- length(constraints) == choose(n, 2)
constraints_list <-
validateConstraintList(distToAdjacency(constraints, n), n)
} else {
constraints_list <- distToAdjacency(constraints, n)
}
cl_tree <-
extractSemiSupervised(cl_tree, constraints_list, alpha, prune_unstable)
} else {
stop(
"'extractFOSC' doesn't know how to handle constraints of type ",
class(constraints)
)
}
}
cl_track <- attr(cl_tree, "cl_tracker")
stability_score <-
vapply(cl_track, function(cid)
cl_tree[[as.character(cid)]]$stability, numeric(1L))
constraint_score <-
vapply(cl_track, function(cid)
cl_tree[[as.character(cid)]]$vscore %||% 0, numeric(1L))
total_score <-
vapply(cl_track, function(cid)
cl_tree[[as.character(cid)]]$score %||% 0, numeric(1L))
out <- append(
x,
list(
cluster = cl_track,
stability = stability_score,
constraint = constraint_score,
total = total_score
)
)
extraction_type <-
if (missing(constraints)) {
"(w/ stability-based extraction)"
} else if (alpha == 0) {
"(w/ constraint-based extraction)"
} else {
"(w/ mixed-objective extraction)"
}
substrs <- strsplit(x$method, split = " \\(w\\/")[[1L]]
out[["method"]] <-
if (length(substrs) > 1)
paste(substrs[[1]], extraction_type)
else
paste(out[["method"]], extraction_type)
class(out) <- "hclust"
return(list(cluster = attr(cl_tree, "cluster"), hc = out))
}
|