File: jpclust.R

package info (click to toggle)
r-cran-dbscan 1.2.2%2Bds-2
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 2,196 kB
sloc: cpp: 4,359; sh: 13; makefile: 5
file content (142 lines) | stat: -rw-r--r-- 4,713 bytes
#######################################################################
# dbscan - Density Based Clustering of Applications with Noise
#          and Related Algorithms
# Copyright (C) 2017 Michael Hahsler

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

#' Jarvis-Patrick Clustering
#'
#' Fast C++ implementation of the Jarvis-Patrick clustering which first builds
#' a shared nearest neighbor graph (k nearest neighbor sparsification) and then
#' places two points in the same cluster if they are in each others nearest
#' neighbor list and they share at least kt nearest neighbors.
#'
#' Following the original paper, the shared nearest neighbor list is
#' constructed as the k neighbors plus the point itself (as neighbor zero).
#' Therefore, the threshold `kt` needs to be in the range \eqn{[1, k]}.
#'
#' Fast nearest neighbors search with [kNN()] is only used if `x` is
#' a matrix. In this case Euclidean distance is used.
#'
#' @aliases jpclust print.general_clustering
#' @family clustering functions
#'
#' @param x a data matrix/data.frame (Euclidean distance is used), a
#' precomputed [dist] object or a kNN object created with [kNN()].
#' @param k Neighborhood size for nearest neighbor sparsification. If `x`
#' is a kNN object then `k` may be missing.
#' @param kt threshold on the number of shared nearest neighbors (including the
#' points themselves) to form clusters. Range: \eqn{[1, k]}
#' @param ...  additional arguments are passed on to the k nearest neighbor
#' search algorithm. See [kNN()] for details on how to control the
#' search strategy.
#'
#' @return A object of class `general_clustering` with the following
#' components:
#' \item{cluster }{A integer vector with cluster assignments. Zero
#' indicates noise points.}
#' \item{type }{ name of used clustering algorithm.}
#' \item{metric }{ the distance metric used for clustering.}
#' \item{param }{ list of used clustering parameters. }
#'
#' @author Michael Hahsler
#' @references R. A. Jarvis and E. A. Patrick. 1973. Clustering Using a
#' Similarity Measure Based on Shared Near Neighbors. _IEEE Trans. Comput.
#' 22,_ 11 (November 1973), 1025-1034.
#' \doi{10.1109/T-C.1973.223640}
#' @keywords model clustering
#' @examples
#' data("DS3")
#'
#' # use a shared neighborhood of 20 points and require 12 shared neighbors
#' cl <- jpclust(DS3, k = 20, kt = 12)
#' cl
#'
#' clplot(DS3, cl)
#' # Note: JP clustering does not consider noise and thus,
#' # the sine wave points chain clusters together.
#'
#' # use a precomputed kNN object instead of the original data.
#' nn <- kNN(DS3, k = 30)
#' nn
#'
#' cl <- jpclust(nn, k = 20, kt = 12)
#' cl
#'
#' # cluster with noise removed (use low pointdensity to identify noise)
#' d <- pointdensity(DS3, eps = 25)
#' hist(d, breaks = 20)
#' DS3_noiseless <- DS3[d > 110,]
#'
#' cl <- jpclust(DS3_noiseless, k = 20, kt = 10)
#' cl
#'
#' clplot(DS3_noiseless, cl)
#' @export
jpclust <- function(x, k, kt, ...) {
  # Create NN graph
  if (missing(k) && inherits(x, "kNN"))
      k <- x$k
  if (length(kt) != 1 || kt < 1 || kt > k)
    stop("kt needs to be a threshold in range [1, k].")

  nn <- kNN(x, k, sort = FALSE, ...)

  # Perform clustering
  cl <- JP_int(nn$id, kt = as.integer(kt))

  structure(
    list(
      cluster = as.integer(factor(cl)),
      type = "Jarvis-Patrick clustering",
      metric = nn$metric,
      param = list(k = k, kt = kt)
    ),
    class = c("general_clustering")
  )
}

#' @export
print.general_clustering <- function(x, ...) {
  cl <- unique(x$cluster)
  cl <- length(cl[cl != 0L])

  writeLines(c(
    paste0(x$type, " for ", length(x$cluster), " objects."),
    paste0("Parameters: ",
      paste(
        names(x$param),
        unlist(x$param, use.names = FALSE),
        sep = " = ",
        collapse = ", "
      )),
    paste0(
      "The clustering contains ",
      cl,
      " cluster(s) and ",
      sum(x$cluster == 0L),
      " noise points."
    )
  ))

  print(table(x$cluster))
  cat("\n")

  writeLines(strwrap(paste0(
    "Available fields: ",
    toString(names(x))
  ), exdent = 18))
}