1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662
|
#' Computes the local density of points in a distance matrix
#'
#' This function takes a distance matrix and a distance cutoff and calculate the
#' local density for each point in the matrix. The computation can either be
#' done using a simple summation of the points with the distance cutoff for each
#' observation, or by applying a gaussian kernel scaled by the distance cutoff
#' (more robust for low-density data)
#'
#' @param distance A distance matrix
#'
#' @param dc A numeric value specifying the distance cutoff
#'
#' @param gaussian Logical. Should a gaussian kernel be used to estimate the
#' density (defaults to `FALSE`)
#'
#' @return A vector of local density values, the index matching row and column
#' indexes in the distance matrix
#'
#' @noRd
#'
localDensity <- function(distance, dc, gaussian = FALSE) {
# These implementations are faster by virtue of being written in C++
# They also avoid the need to convert `distance` to a matrix.
if (gaussian) {
res <- gaussianLocalDensity(distance, attr(distance, "Size"), dc)
} else {
res <- nonGaussianLocalDensity(distance, attr(distance, "Size"), dc)
}
if (is.null(attr(distance, 'Labels'))) {
names(res) <- NULL
} else {
names(res) <- attr(distance, 'Labels')
}
res
}
#' Calculate distance to closest observation of higher density
#'
#' This function finds, for each observation, the minimum distance to an
#' observation of higher local density.
#'
#' @param distance A distance matrix
#'
#' @param rho A vector of local density values as outputted by [localDensity()]
#'
#' @return A vector of distances with index matching the index in rho
#'
#' @noRd
#'
distanceToPeak <- function(distance, rho) {
# This implementation is faster by virtue of being written in C++.
# It also avoids the need to convert `distance` to a matrix.
res <- distanceToPeakCpp(as.numeric(distance), as.numeric(rho));
names(res) <- names(rho)
res
}
#' Estimate the distance cutoff for a specified neighbor rate
#'
#' This function calculates a distance cutoff value for a specific distance
#' matrix that makes the average neighbor rate (number of points within the
#' distance cutoff value) fall between the provided range. The authors of the
#' algorithm suggests aiming for a neighbor rate between 1 and 2 percent, but
#' also states that the algorithm is quite robust with regards to more extreme
#' cases.
#'
#' @note If the number of points is larger than 448 (resulting in 100,128
#' pairwise distances), 100,128 distance pairs will be randomly selected to
#' speed up computation time. Use [set.seed()] prior to calling
#' `estimateDc` in order to ensure reproducable results.
#'
#' @param distance A distance matrix
#'
#' @param neighborRateLow The lower bound of the neighbor rate
#'
#' @param neighborRateHigh The upper bound of the neighbor rate
#'
#' @return A numeric value giving the estimated distance cutoff value
#'
#' @examples
#' irisDist <- dist(iris[,1:4])
#' estimateDc(irisDist)
#'
#' @references Rodriguez, A., & Laio, A. (2014). *Clustering by fast search and find of density peaks.* Science, **344**(6191), 1492-1496. doi:10.1126/science.1242072
#'
#' @export
#'
estimateDc <- function(distance, neighborRateLow = 0.01, neighborRateHigh = 0.02) {
# This implementation uses binary search instead of linear search.
size <- attr(distance, 'Size')
# If size is greater than 448, there will be >100000 elements in the distance
# object. Subsampling to 100000 elements will speed performance for very
# large dist objects while retaining good accuracy in estimating the cutoff
if (size > 448) {
distance <- distance[sample.int(length(distance), 100128)]
size <- 448
}
low <- min(distance)
high <- max(distance)
dc <- 0
while (TRUE) {
dc <- (low + high) / 2
# neighborRate = average of number of elements of comb per row that are
# less than dc minus 1 divided by size.
# This implementation avoids converting `distance` to a matrix. The matrix is
# symmetrical, so doubling the result from `distance` (half of the matrix) is
# equivalent. The diagonal of the matrix will always be 0, so as long as dc
# is greater than 0, we add 1 for every element of the diagonal, which is
# the same as size
neighborRate <- (((sum(distance < dc) * 2 + (if (0 <= dc) size)) / size - 1)) / size
if (neighborRate >= neighborRateLow && neighborRate <= neighborRateHigh) break
if (neighborRate < neighborRateLow) {
low <- dc
} else {
high <- dc
}
}
cat('Distance cutoff calculated to', dc, '\n')
dc
}
#' Calculate clustering attributes based on the densityClust algorithm
#'
#' This function takes a distance matrix and optionally a distance cutoff and
#' calculates the values necessary for clustering based on the algorithm
#' proposed by Alex Rodrigues and Alessandro Laio (see references). The actual
#' assignment to clusters are done in a later step, based on user defined
#' threshold values. If a distance matrix is passed into `distance` the
#' original algorithm described in the paper is used. If a matrix or data.frame
#' is passed instead it is interpretted as point coordinates and rho will be
#' estimated based on k-nearest neighbors of each point (rho is estimated as
#' `exp(-mean(x))` where `x` is the distance to the nearest
#' neighbors). This can be useful when data is so large that calculating the
#' full distance matrix can be prohibitive.
#'
#' @details
#' The function calculates rho and delta for the observations in the provided
#' distance matrix. If a distance cutoff is not provided this is first estimated
#' using [estimateDc()] with default values.
#'
#' The information kept in the densityCluster object is:
#' \describe{
#' \item{`rho`}{A vector of local density values}
#' \item{`delta`}{A vector of minimum distances to observations of higher density}
#' \item{`distance`}{The initial distance matrix}
#' \item{`dc`}{The distance cutoff used to calculate rho}
#' \item{`threshold`}{A named vector specifying the threshold values for rho and delta used for cluster detection}
#' \item{`peaks`}{A vector of indexes specifying the cluster center for each cluster}
#' \item{`clusters`}{A vector of cluster affiliations for each observation. The clusters are referenced as indexes in the peaks vector}
#' \item{`halo`}{A logical vector specifying for each observation if it is considered part of the halo}
#' \item{`knn_graph`}{kNN graph constructed. It is only applicable to the case where coordinates are used as input. Currently it is set as NA.}
#' \item{`nearest_higher_density_neighbor`}{index for the nearest sample with higher density. It is only applicable to the case where coordinates are used as input.}
#' \item{`nn.index`}{indices for each cell's k-nearest neighbors. It is only applicable for the case where coordinates are used as input.}
#' \item{`nn.dist`}{distance to each cell's k-nearest neighbors. It is only applicable for the case where coordinates are used as input.}
#' }
#' Before running findClusters the threshold, peaks, clusters and halo data is
#' `NA`.
#'
#' @param distance A distance matrix or a matrix (or data.frame) for the
#' coordinates of the data. If a matrix or data.frame is used the distances and
#' local density will be estimated using a fast k-nearest neighbor approach.
#'
#' @param dc A distance cutoff for calculating the local density. If missing it
#' will be estimated with `estimateDc(distance)`
#'
#' @param gaussian Logical. Should a gaussian kernel be used to estimate the
#' density (defaults to FALSE)
#'
#' @param verbose Logical. Should the running details be reported
#'
#' @param ... Additional parameters passed on to [get.knn][FNN::get.knn]
#'
#' @return A densityCluster object. See details for a description.
#'
#' @examples
#' irisDist <- dist(iris[,1:4])
#' irisClust <- densityClust(irisDist, gaussian=TRUE)
#' plot(irisClust) # Inspect clustering attributes to define thresholds
#'
#' irisClust <- findClusters(irisClust, rho=2, delta=2)
#' plotMDS(irisClust)
#' split(iris[,5], irisClust$clusters)
#'
#' @seealso [estimateDc()], [findClusters()]
#'
#' @references Rodriguez, A., & Laio, A. (2014). *Clustering by fast search and find of density peaks.* Science, **344**(6191), 1492-1496. doi:10.1126/science.1242072
#'
#' @export
#'
densityClust <- function(distance, dc, gaussian=FALSE, verbose = FALSE, ...) {
if (is.data.frame(distance) || is.matrix(distance)) {
dp_knn_args <- list(mat = distance, verbose = verbose, ...)
res <- do.call(densityClust.knn, dp_knn_args)
} else {
if (missing(dc)) {
if (verbose) message('Calculating the distance cutoff')
dc <- estimateDc(distance)
}
if (verbose) message('Calculating the local density for each sample based on distance cutoff')
rho <- localDensity(distance, dc, gaussian = gaussian)
if (verbose) message('Calculating the minimal distance of a sample to another sample with higher density')
delta <- distanceToPeak(distance, rho)
if (verbose) message('Returning result...')
res <- list(
rho = rho,
delta = delta,
distance = distance,
dc = dc,
threshold = c(rho = NA, delta = NA),
peaks = NA,
clusters = NA,
halo = NA,
knn_graph = NA,
nearest_higher_density_neighbor = NA,
nn.index = NA,
nn.dist = NA
)
class(res) <- 'densityCluster'
}
res
}
#' @export
#' @importFrom graphics plot points
#'
plot.densityCluster <- function(x, ...) {
plot(x$rho, x$delta, main = 'Decision graph', xlab = expression(rho),
ylab = expression(delta))
if (!is.na(x$peaks[1])) {
points(x$rho[x$peaks], x$delta[x$peaks], col = 2:(1 + length(x$peaks)),
pch = 19)
}
}
#' Plot observations using multidimensional scaling and colour by cluster
#'
#' This function produces an MDS scatterplot based on the distance matrix of the
#' densityCluster object (if there is only the coordinates information, a distance
#' matrix will be calculate first), and, if clusters are defined, colours each
#' observation according to cluster affiliation. Observations belonging to a cluster
#' core is plotted with filled circles and observations belonging to the halo with
#' hollow circles. This plotting is not suitable for running large datasets (for example
#' datasets with > 1000 samples). Users are suggested to use other methods, for example
#' tSNE, etc. to visualize their clustering results too.
#'
#' @param x A densityCluster object as produced by [densityClust()]
#'
#' @param ... Additional parameters. Currently ignored
#'
#' @examples
#' irisDist <- dist(iris[,1:4])
#' irisClust <- densityClust(irisDist, gaussian=TRUE)
#' plot(irisClust) # Inspect clustering attributes to define thresholds
#'
#' irisClust <- findClusters(irisClust, rho=2, delta=2)
#' plotMDS(irisClust)
#' split(iris[,5], irisClust$clusters)
#'
#' @seealso [densityClust()] for creating `densityCluster`
#' objects, and [plotTSNE()] for an alternative plotting approach.
#'
#' @export
#'
plotMDS <- function(x, ...) {
UseMethod('plotMDS')
}
#' @export
#' @importFrom stats cmdscale
#' @importFrom graphics plot points legend
#' @importFrom stats dist
plotMDS.densityCluster <- function(x, ...) {
if (is.data.frame(x$distance) || is.matrix(x$distance)) {
mds <- cmdscale(dist(x$distance))
} else {
mds <- cmdscale(x$distance)
}
plot(mds[,1], mds[,2], xlab = '', ylab = '', main = 'MDS plot of observations')
if (!is.na(x$peaks[1])) {
for (i in 1:length(x$peaks)) {
ind <- which(x$clusters == i)
points(mds[ind, 1], mds[ind, 2], col = i + 1, pch = ifelse(x$halo[ind], 1, 19))
}
legend('topright', legend = c('core', 'halo'), pch = c(19, 1), horiz = TRUE)
}
}
#' Plot observations using t-distributed neighbor embedding and colour by cluster
#'
#' This function produces an t-SNE scatterplot based on the distance matrix of the
#' densityCluster object (if there is only the coordinates information, a distance
#' matrix will be calculate first), and, if clusters are defined, colours each
#' observation according to cluster affiliation. Observations belonging to a cluster
#' core is plotted with filled circles and observations belonging to the halo with
#' hollow circles.
#'
#' @param x A densityCluster object as produced by [densityClust()]
#'
#' @param ... Additional parameters. Currently ignored
#'
#' @examples
#' irisDist <- dist(iris[,1:4])
#' irisClust <- densityClust(irisDist, gaussian=TRUE)
#' plot(irisClust) # Inspect clustering attributes to define thresholds
#'
#' irisClust <- findClusters(irisClust, rho=2, delta=2)
#' plotTSNE(irisClust)
#' split(iris[,5], irisClust$clusters)
#'
#' @seealso [densityClust()] for creating `densityCluster`
#' objects, and [plotMDS()] for an alternative plotting approach.
#'
#' @export
#'
plotTSNE <- function(x, ...) {
UseMethod('plotTSNE')
}
#' @export
#' @importFrom graphics plot points legend
#' @importFrom stats dist
#' @importFrom stats rnorm
#' @importFrom Rtsne Rtsne
plotTSNE.densityCluster <- function(x, max_components = 2, ...) {
if (is.data.frame(x$distance) || is.matrix(x$distance)) {
data <- as.matrix(dist(x$distance))
} else {
data <- as.matrix(x$distance)
}
# avoid issues related to repetitions
dup_id <- which(duplicated(data))
if (length(dup_id) > 0) {
data[dup_id, ] <- data[dup_id, ] + rnorm(length(dup_id) * ncol(data), sd = 1e-10)
}
tsne_res <- Rtsne::Rtsne(as.matrix(data), dims = max_components,
pca = T)
tsne_data <- tsne_res$Y[, 1:max_components]
plot(tsne_data[,1], tsne_data[,2], xlab = '', ylab = '', main = 'tSNE plot of observations')
if (!is.na(x$peaks[1])) {
for (i in 1:length(x$peaks)) {
ind <- which(x$clusters == i)
points(tsne_data[ind, 1], tsne_data[ind, 2], col = i + 1, pch = ifelse(x$halo[ind], 1, 19))
}
legend('topright', legend = c('core', 'halo'), pch = c(19, 1), horiz = TRUE)
}
}
#' @export
#'
print.densityCluster <- function(x, ...) {
if (is.na(x$peaks[1])) {
cat('A densityCluster object with no clusters defined\n\n')
cat('Number of observations:', length(x$rho), '\n')
} else {
cat('A densityCluster object with', length(x$peaks), 'clusters defined\n\n')
cat('Number of observations:', length(x$rho), '\n')
cat('Observations in core: ', sum(!x$halo), '\n\n')
cat('Parameters:\n')
cat('dc (distance cutoff) rho threshold delta threshold\n')
cat(formatC(x$dc, width = -22), formatC(x$threshold[1], width = -22), x$threshold[2])
}
}
#' Detect clusters in a densityCluster obejct
#'
#' This function uses the supplied rho and delta thresholds to detect cluster
#' peaks and assign the rest of the observations to one of these clusters.
#' Furthermore core/halo status is calculated. If either rho or delta threshold
#' is missing the user is presented with a decision plot where they are able to
#' click on the plot area to set the treshold. If either rho or delta is set,
#' this takes presedence over the value found by clicking.
#'
#' @param x A densityCluster object as produced by [densityClust()]
#'
#' @param ... Additional parameters passed on
#'
#' @return A densityCluster object with clusters assigned to all observations
#'
#' @examples
#' irisDist <- dist(iris[,1:4])
#' irisClust <- densityClust(irisDist, gaussian=TRUE)
#' plot(irisClust) # Inspect clustering attributes to define thresholds
#'
#' irisClust <- findClusters(irisClust, rho=2, delta=2)
#' plotMDS(irisClust)
#' split(iris[,5], irisClust$clusters)
#'
#' @references Rodriguez, A., & Laio, A. (2014). *Clustering by fast search and find of density peaks.* Science, **344**(6191), 1492-1496. doi:10.1126/science.1242072
#'
#' @export
#'
findClusters <- function(x, ...) {
UseMethod("findClusters")
}
#' @rdname findClusters
#'
#' @param rho The threshold for local density when detecting cluster peaks
#'
#' @param delta The threshold for minimum distance to higher density when detecting cluster peaks
#'
#' @param plot Logical. Should a decision plot be shown after cluster detection
#'
#' @param peaks A numeric vector indicates the index of density peaks used for clustering. This vector should be retrieved from the decision plot with caution. No checking involved.
#'
#' @param verbose Logical. Should the running details be reported
#'
#' @export
#' @importFrom graphics plot locator
findClusters.densityCluster <- function(x, rho, delta, plot = FALSE, peaks = NULL, verbose = FALSE, ...) {
if (is.data.frame(x$distance) || is.matrix(x$distance)) {
peak_ind <- which(x$rho > rho & x$delta > delta)
x$peaks <- peak_ind
# Assign observations to clusters
runOrder <- order(x$rho, decreasing = TRUE)
cluster <- rep(NA, length(x$rho))
for (i in x$peaks) {
cluster[i] <- match(i, x$peaks)
}
for (ind in setdiff(runOrder, x$peaks)) {
target_lower_density_samples <- which(x$nearest_higher_density_neighbor == ind) #all the target cells should have the same cluster id as current higher density cell
cluster[ind] <- cluster[x$nearest_higher_density_neighbor[ind]]
}
potential_duplicates <- which(is.na(cluster))
for (ind in potential_duplicates) {
res <- as.integer(names(which.max(table(cluster[x$nn.index[ind, ]]))))
if (length(res) > 0) {
cluster[ind] <- res #assign NA samples to the majority of its clusters
} else {
message('try to increase the number of kNN (through argument k) at step of densityClust.')
cluster[ind] <- NA
}
}
x$clusters <- factor(cluster)
# Calculate core/halo status of observation
border <- rep(0, length(x$peaks))
if (verbose) message('Identifying core and halo for each cluster')
for (i in 1:length(x$peaks)) {
if (verbose) message('the current index of the peak is ', i)
connect_samples_ind <- intersect(unique(x$nn.index[cluster == i, ]), which(cluster != i))
averageRho <- outer(x$rho[cluster == i], x$rho[connect_samples_ind], '+') / 2
if (any(connect_samples_ind)) border[i] <- max(averageRho[connect_samples_ind])
}
x$halo <- x$rho < border[cluster]
x$threshold['rho'] <- rho
x$threshold['delta'] <- delta
}
else {
# Detect cluster peaks
if (!is.null(peaks)) {
if (verbose) message('peaks are provided, clustering will be performed based on them')
x$peaks <- peaks
} else {
if (missing(rho) || missing(delta)) {
x$peaks <- NA
plot(x)
cat('Click on plot to select thresholds\n')
threshold <- locator(1)
if (missing(rho)) rho <- threshold$x
if (missing(delta)) delta <- threshold$y
plot = TRUE
}
x$peaks <- which(x$rho > rho & x$delta > delta)
x$threshold['rho'] <- rho
x$threshold['delta'] <- delta
}
if (plot) {
plot(x)
}
# Assign observations to clusters
runOrder <- order(x$rho, decreasing = TRUE)
cluster <- rep(NA, length(x$rho))
if (verbose) message('Assigning each sample to a cluster based on its nearest density peak')
for (i in runOrder) {
if ((i %% round(length(runOrder) / 25)) == 0) {
if (verbose) message(paste('the runOrder index is', i))
}
if (i %in% x$peaks) {
cluster[i] <- match(i, x$peaks)
} else {
higherDensity <- which(x$rho > x$rho[i])
cluster[i] <- cluster[higherDensity[which.min(findDistValueByRowColInd(as.numeric(x$distance), as.integer(attr(x$distance, 'Size')), i, higherDensity))]]
}
}
x$clusters <- cluster
# Calculate core/halo status of observation
border <- rep(0, length(x$peaks))
if (verbose) message('Identifying core and halo for each cluster')
for (i in 1:length(x$peaks)) {
if (verbose) message('the current index of the peak is ', i)
averageRho <- outer(x$rho[cluster == i], x$rho[cluster != i], '+')/2
index <- findDistValueByRowColInd(as.numeric(x$distance), as.integer(attr(x$distance, 'Size')), which(cluster == i), which(cluster != i)) <= x$dc
if (any(index)) border[i] <- max(averageRho[index])
}
x$halo <- x$rho < border[cluster]
}
x$halo <- x$rho < border[cluster]
# Sort cluster designations by gamma (= rho * delta)
gamma <- x$rho * x$delta
pk.ordr <- order(gamma[x$peaks], decreasing = TRUE)
x$peaks <- x$peaks[pk.ordr]
x$clusters <- match(x$clusters, pk.ordr)
x
}
#' Extract cluster membership from a densityCluster object
#'
#' This function allows the user to extract the cluster membership of all the
#' observations in the given densityCluster object. The output can be formatted
#' in two ways as described below. Halo observations can be chosen to be removed
#' from the output.
#'
#' @details
#' Two formats for the output are available. Either a vector of integers
#' denoting for each observation, which cluster the observation belongs to. If
#' halo observations are removed, these are set to NA. The second format is a
#' list with a vector for each group containing the index for the member
#' observations in the group. If halo observations are removed their indexes are
#' omitted. The list format correspond to the following transform of the vector
#' format `split(1:length(clusters), clusters)`, where `clusters` are
#' the cluster information in vector format.
#'
#' @param x The densityCluster object. [findClusters()] must have
#' been performed prior to this call to avoid throwing an error.
#'
#' @param ... Currently ignored
#'
#' @return A vector or list with cluster memberships for the observations in the
#' initial distance matrix
#'
#' @export
#'
clusters <- function(x, ...) {
UseMethod("clusters")
}
#' @rdname clusters
#'
#' @param as.list Should the output be in the list format. Defaults to FALSE
#'
#' @param halo.rm Logical. should halo observations be removed. Defaults to TRUE
#'
#' @export
#'
clusters.densityCluster <- function(x, as.list = FALSE, halo.rm = TRUE, ...) {
if (!clustered(x)) stop('x must be clustered prior to cluster extraction')
res <- x$clusters
if (halo.rm) {
res[x$halo] <- NA
}
if (as.list) {
res <- split(1:length(res), res)
}
res
}
#' Check whether a densityCluster object have been clustered
#'
#' This function checks whether [findClusters()] has been performed on
#' the given object and returns a boolean depending on the outcome
#'
#' @param x A densityCluster object
#'
#' @return `TRUE` if [findClusters()] have been performed, otherwise
#' `FALSE`
#'
#' @export
#'
clustered <- function(x) {
UseMethod("clustered")
}
#' @rdname clustered
#'
#' @export
#'
clustered.densityCluster <- function(x) {
!any(is.na(x$peaks[1]), is.na(x$clusters[1]), is.na(x$halo[1]))
}
#' Extract labels
#'
#' @noRd
#'
#' @export
#'
labels.densityCluster <- function(object, ...) {
labels(object$distance)
}
#' Fast knn version of densityClust
#'
#' This function will be called by densityClust if a matrix or data.frame is
#' passed in rather than a distance object
#'
#' @noRd
#'
#' @importFrom FNN get.knn
densityClust.knn <- function(mat, k = NULL, verbose = F, ...) {
if (is.null(k)) {
k <- round(sqrt(nrow(mat)) / 2) # empirical way to select the number of neighbor points
k <- max(10, k) # ensure k is at least 10
}
if (verbose) message('Finding kNN using FNN with ', k, ' neighbors')
dx <- get.knn(mat, k = k, ...)
nn.index <- dx$nn.index
nn.dist <- dx$nn.dist
N <- nrow(nn.index)
knn_graph <- NULL
if (verbose) message('Calculating the local density for each sample based on kNNs ...')
rho <- apply(nn.dist, 1, function(x) {
exp(-mean(x))
})
if (verbose) message('Calculating the minimal distance of a sample to another sample with higher density ...')
rho_order <- order(rho)
delta <- vector(mode = 'integer', length = N)
nearest_higher_density_neighbor <- vector(mode = 'integer', length = N)
delta_neighbor_tmp <- smallest_dist_rho_order_coords(rho[rho_order], as.matrix(mat[rho_order, ]))
delta[rho_order] <- delta_neighbor_tmp$smallest_dist
nearest_higher_density_neighbor[rho_order] <- rho_order[delta_neighbor_tmp$nearest_higher_density_sample + 1]
if (verbose) message('Returning result...')
res <- list(
rho = rho,
delta = delta,
distance = mat,
dc = NULL,
threshold = c(rho = NA, delta = NA),
peaks = NA,
clusters = NA,
halo = NA,
knn_graph = knn_graph,
nearest_higher_density_neighbor = nearest_higher_density_neighbor,
nn.index = nn.index,
nn.dist = nn.dist
)
class(res) <- 'densityCluster'
res
}
|