File: extractFOSC.R

package info (click to toggle)
r-cran-dbscan 1.2.2%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,196 kB
  • sloc: cpp: 4,359; sh: 13; makefile: 5
file content (322 lines) | stat: -rw-r--r-- 15,155 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
#######################################################################
# dbscan - Density Based Clustering of Applications with Noise
#          and Related Algorithms
# Copyright (C) 2015 Michael Hahsler, Matt Piekenbrock

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

#' Framework for the Optimal Extraction of Clusters from Hierarchies
#'
#' Generic reimplementation of the _Framework for Optimal Selection of Clusters_
#' (FOSC; Campello et al, 2013) to extract clusterings from hierarchical clustering (i.e.,
#' [hclust] objects).
#' Can be parameterized to perform unsupervised
#' cluster extraction through a stability-based measure, or semisupervised
#' cluster extraction through either a constraint-based extraction (with a
#' stability-based tiebreaker) or a mixed (weighted) constraint and
#' stability-based objective extraction.
#'
#' Campello et al (2013) suggested a _Framework for Optimal Selection of
#' Clusters_ (FOSC) as a framework to make local (non-horizontal) cuts to any
#' cluster tree hierarchy. This function implements the original extraction
#' algorithms as described by the framework for hclust objects. Traditional
#' cluster extraction methods from hierarchical representations (such as
#' [hclust] objects) generally rely on global parameters or cutting values
#' which are used to partition a cluster hierarchy into a set of disjoint, flat
#' clusters. This is implemented in R in function [cutree()].
#' Although such methods are widespread, using global parameter
#' settings are inherently limited in that they cannot capture patterns within
#' the cluster hierarchy at varying _local_ levels of granularity.
#'
#' Rather than partitioning a hierarchy based on the number of the cluster one
#' expects to find (\eqn{k}) or based on some linkage distance threshold
#' (\eqn{H}), the FOSC proposes that the optimal clusters may exist at varying
#' distance thresholds in the hierarchy. To enable this idea, FOSC requires one
#' parameter (minPts) that represents _the minimum number of points that
#' constitute a valid cluster._ The first step of the FOSC algorithm is to
#' traverse the given cluster hierarchy divisively, recording new clusters at
#' each split if both branches represent more than or equal to minPts. Branches
#' that contain less than minPts points at one or both branches inherit the
#' parent clusters identity. Note that using FOSC, due to the constraint that
#' minPts must be greater than or equal to 2, it is possible that the optimal
#' cluster solution chosen makes local cuts that render parent branches of
#' sizes less than minPts as noise, which are denoted as 0 in the final
#' solution.
#'
#' Traversing the original cluster tree using minPts creates a new, simplified
#' cluster tree that is then post-processed recursively to extract clusters
#' that maximize for each cluster \eqn{C_i}{Ci} the cost function
#'
#' \deqn{\max_{\delta_2, \dots, \delta_k} J = \sum\limits_{i=2}^{k} \delta_i
#' S(C_i)}{ J = \sum \delta S(Ci) for all i clusters, } where
#' \eqn{S(C_i)}{S(Ci)} is the stability-based measure as \deqn{ S(C_i) =
#' \sum_{x_j \in C_i}(\frac{1}{h_{min} (x_j, C_i)} - \frac{1}{h_{max} (C_i)})
#' }{ S(Ci) = \sum (1/Hmin(Xj, Ci) - 1/Hmax(Ci)) for all Xj in Ci.}
#'
#' \eqn{\delta_i}{\delta} represents an indicator function, which constrains
#' the solution space such that clusters must be disjoint (cannot assign more
#' than 1 label to each cluster). The measure \eqn{S(C_i)}{S(Ci)} used by FOSC
#' is an unsupervised validation measure based on the assumption that, if you
#' vary the linkage/distance threshold across all possible values, more
#' prominent clusters that survive over many threshold variations should be
#' considered as stronger candidates of the optimal solution. For this reason,
#' using this measure to detect clusters is referred to as an unsupervised,
#' _stability-based_ extraction approach. In some cases it may be useful
#' to enact _instance-level_ constraints that ensure the solution space
#' conforms to linkage expectations known _a priori_. This general idea of
#' using preliminary expectations to augment the clustering solution will be
#' referred to as _semisupervised clustering_. If constraints are given in
#' the call to `extractFOSC()`, the following alternative objective function
#' is maximized:
#'
#' \deqn{J = \frac{1}{2n_c}\sum\limits_{j=1}^n \gamma (x_j)}{J = 1/(2 * nc)
#' \sum \gamma(Xj)}
#'
#' \eqn{n_c}{nc} is the total number of constraints given and
#' \eqn{\gamma(x_j)}{\gamma(Xj)} represents the number of constraints involving
#' object \eqn{x_j}{Xj} that are satisfied. In the case of ties (such as
#' solutions where no constraints were given), the unsupervised solution is
#' used as a tiebreaker. See Campello et al (2013) for more details.
#'
#' As a third option, if one wishes to prioritize the degree at which the
#' unsupervised and semisupervised solutions contribute to the overall optimal
#' solution, the parameter \eqn{\alpha} can be set to enable the extraction of
#' clusters that maximize the `mixed` objective function
#'
#' \deqn{J = \alpha S(C_i) + (1 - \alpha) \gamma(C_i))}{J = \alpha S(Ci) + (1 -
#' \alpha) \gamma(Ci).}
#'
#' FOSC expects the pairwise constraints to be passed as either 1) an
#' \eqn{n(n-1)/2} vector of integers representing the constraints, where 1
#' represents should-link, -1 represents should-not-link, and 0 represents no
#' preference using the unsupervised solution (see below for examples).
#' Alternatively, if only a few constraints are needed, a named list
#' representing the (symmetric) adjacency list can be used, where the names
#' correspond to indices of the points in the original data, and the values
#' correspond to integer vectors of constraints (positive indices for
#' should-link, negative indices for should-not-link). Again, see the examples
#' section for a demonstration of this.
#'
#' The parameters to the input function correspond to the concepts discussed
#' above. The `minPts` parameter to represent the minimum cluster size to
#' extract. The optional `constraints` parameter contains the pairwise,
#' instance-level constraints of the data. The optional `alpha` parameters
#' controls whether the mixed objective function is used (if `alpha` is
#' greater than 0). If the `validate_constraints` parameter is set to
#' true, the constraints are checked (and fixed) for symmetry (if point A has a
#' should-link constraint with point B, point B should also have the same
#' constraint). Asymmetric constraints are not supported.
#'
#' Unstable branch pruning was not discussed by Campello et al (2013), however
#' in some data sets it may be the case that specific subbranches scores are
#' significantly greater than sibling and parent branches, and thus sibling
#' branches should be considered as noise if their scores are cumulatively
#' lower than the parents. This can happen in extremely nonhomogeneous data
#' sets, where there exists locally very stable branches surrounded by unstable
#' branches that contain more than `minPts` points.
#' `prune_unstable = TRUE` will remove the unstable branches.
#'
#' @family clustering functions
#'
#' @param x a valid [hclust] object created via [hclust()] or [hdbscan()].
#' @param constraints Either a list or matrix of pairwise constraints. If
#' missing, an unsupervised measure of stability is used to make local cuts and
#' extract the optimal clusters. See details.
#' @param alpha numeric; weight between \eqn{[0, 1]} for mixed-objective
#' semi-supervised extraction. Defaults to 0.
#' @param minPts numeric; Defaults to 2. Only needed if class-less noise is a
#' valid label in the model.
#' @param prune_unstable logical; should significantly unstable subtrees be
#' pruned? The default is `FALSE` for the original optimal extraction
#' framework (see Campello et al, 2013). See details for what `TRUE`
#' implies.
#' @param validate_constraints logical; should constraints be checked for
#' validity? See details for what are considered valid constraints.
#'
#' @returns A list with the elements:
#'
#' \item{cluster }{A integer vector with cluster assignments. Zero
#' indicates noise points (if any).}
#' \item{hc }{The original [hclust] object with additional list elements
#' `"stability"`, `"constraint"`, and `"total"`
#' for the \eqn{n - 1} cluster-wide objective scores from the extraction.}
#'
#' @author Matt Piekenbrock
#' @seealso [hclust()], [hdbscan()], [stats::cutree()]
#' @references Campello, Ricardo JGB, Davoud Moulavi, Arthur Zimek, and Joerg
#' Sander (2013). A framework for semi-supervised and unsupervised optimal
#' extraction of clusters from hierarchies. _Data Mining and Knowledge
#' Discovery_ 27(3): 344-371.
#' \doi{10.1007/s10618-013-0311-4}
#' @keywords model clustering
#' @examples
#' data("moons")
#'
#' ## Regular HDBSCAN using stability-based extraction (unsupervised)
#' cl <- hdbscan(moons, minPts = 5)
#' cl$cluster
#'
#' ## Constraint-based extraction from the HDBSCAN hierarchy
#' ## (w/ stability-based tiebreaker (semisupervised))
#' cl_con <- extractFOSC(cl$hc, minPts = 5,
#'   constraints = list("12" = c(49, -47)))
#' cl_con$cluster
#'
#' ## Alternative formulation: Constraint-based extraction from the HDBSCAN hierarchy
#' ## (w/ stability-based tiebreaker (semisupervised)) using distance thresholds
#' dist_moons <- dist(moons)
#' cl_con2 <- extractFOSC(cl$hc, minPts = 5,
#'   constraints = ifelse(dist_moons < 0.1, 1L,
#'                 ifelse(dist_moons > 1, -1L, 0L)))
#'
#' cl_con2$cluster # same as the second example
#' @export
extractFOSC <-
  function(x,
    constraints,
    alpha = 0,
    minPts = 2L,
    prune_unstable = FALSE,
    validate_constraints = FALSE) {
    if (!inherits(x, "hclust"))
      stop("extractFOSC expects 'x' to be a valid hclust object.")

    # if constraints are given then they need to be a list, a matrix or a vector
    if (!(
      missing(constraints) ||
        is.list(constraints) ||
        is.matrix(constraints) ||
        is.numeric(constraints)
    ))
      stop("extractFOSC expects constraints to be either an adjacency list or adjacency matrix.")

    if (!minPts >= 2)
      stop("minPts must be at least 2.")
    if (alpha < 0 ||
        alpha > 1)
      stop("alpha can only takes values between [0, 1].")
    n <- nrow(x$merge) + 1L

    ## First step for both unsupervised and semisupervised - compute stability scores
    cl_tree <- computeStability(x, minPts)

    ## Unsupervised Extraction
    if (missing(constraints)) {
      cl_tree <- extractUnsupervised(cl_tree, prune_unstable)
    }
    ## Semi-supervised Extraction
    else {
      ## If given as adjacency-list form
      if (is.list(constraints)) {
        ## Checks for proper indexing, symmetry of constraints, etc.
        if (validate_constraints) {
          is_valid <- max(as.integer(names(constraints))) < n
          is_valid <- is_valid &&
            all(vapply(constraints, function(ilc) all(ilc <= n), logical(1L)))
          if (!is_valid) {
            stop("Detected constraint indices not in the interval [1, n]")
          }
          constraints <- validateConstraintList(constraints, n)
        }
        cl_tree <-
          extractSemiSupervised(cl_tree, constraints, alpha, prune_unstable)
      }
      ## Adjacency matrix given (probably from dist object), retrieve adjacency list form
      else if (is.vector(constraints)) {
        if (!all(constraints %in% c(-1, 0, 1))) {
          stop(
            "'extractFOSC' only accepts instance-level constraints. See ?extractFOSC for more details."
          )
        }
        ## Checks for proper integer labels, symmetry of constraints, length of vector, etc.
        if (validate_constraints) {
          is_valid <- length(constraints) == choose(n, 2)
          constraints_list <-
            validateConstraintList(distToAdjacency(constraints, n), n)
        } else {
          constraints_list <-  distToAdjacency(constraints, n)
        }
        cl_tree <-
          extractSemiSupervised(cl_tree, constraints_list, alpha, prune_unstable)
      }
      ## Full nxn adjacency-matrix given, give warning and retrieve adjacency list form
      else if (is.matrix(constraints)) {
        if (!all(constraints %in% c(-1, 0, 1))) {
          stop(
            "'extractFOSC' only accepts instance-level constraints. See ?extractFOSC for more details."
          )
        }
        if (!all(dim(constraints) == c(n, n))) {
          stop("Given matrix is not square.")
        }
        warning(
          "Full nxn matrix given; extractFOCS does not support asymmetric relational constraints. Using lower triangular."
        )

        constraints <- constraints[lower.tri(constraints)]

        ## Checks for proper integer labels, symmetry of constraints, length of vector, etc.
        if (validate_constraints) {
          is_valid <- length(constraints) == choose(n, 2)
          constraints_list <-
            validateConstraintList(distToAdjacency(constraints, n), n)
        } else {
          constraints_list <- distToAdjacency(constraints, n)
        }
        cl_tree <-
          extractSemiSupervised(cl_tree, constraints_list, alpha, prune_unstable)
      } else {
        stop(
          "'extractFOSC' doesn't know how to handle constraints of type ",
          class(constraints)
        )
      }
    }
    cl_track <- attr(cl_tree, "cl_tracker")
    stability_score <-
      vapply(cl_track, function(cid)
        cl_tree[[as.character(cid)]]$stability, numeric(1L))
    constraint_score <-
      vapply(cl_track, function(cid)
        cl_tree[[as.character(cid)]]$vscore %||% 0, numeric(1L))
    total_score <-
      vapply(cl_track, function(cid)
        cl_tree[[as.character(cid)]]$score %||% 0, numeric(1L))
    out <- append(
      x,
      list(
        cluster = cl_track,
        stability = stability_score,
        constraint = constraint_score,
        total = total_score
      )
    )
    extraction_type <-
      if (missing(constraints)) {
        "(w/ stability-based extraction)"
      } else if (alpha == 0) {
        "(w/ constraint-based extraction)"
      } else {
        "(w/ mixed-objective extraction)"
      }
    substrs <- strsplit(x$method, split = " \\(w\\/")[[1L]]
    out[["method"]] <-
      if (length(substrs) > 1)
        paste(substrs[[1]], extraction_type)
    else
      paste(out[["method"]], extraction_type)
    class(out) <- "hclust"
    return(list(cluster = attr(cl_tree, "cluster"), hc = out))
  }