File: sNN.R

package info (click to toggle)
r-cran-dbscan 1.2.2%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,196 kB
  • sloc: cpp: 4,359; sh: 13; makefile: 5
file content (216 lines) | stat: -rw-r--r-- 6,622 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#######################################################################
# dbscan - Density Based Clustering of Applications with Noise
#          and Related Algorithms
# Copyright (C) 2017 Michael Hahsler

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

# number of shared nearest neighbors including the point itself.


#' Find Shared Nearest Neighbors
#'
#' Calculates the number of shared nearest neighbors
#' and creates a shared nearest neighbors graph.
#'
#' The number of shared nearest neighbors of two points p and q is the
#' intersection of the kNN neighborhood of two points.
#' Note: that each point is considered to be part
#' of its own kNN neighborhood.
#' The range for the shared nearest neighbors is
#' \eqn{[0, k]}. The result is a n-by-k matrix called `shared`.
#' Each row is a point and the columns are the point's k nearest neighbors.
#' The value is the count of the shared neighbors.
#'
#' The shared nearest neighbor graph connects a point with all its nearest neighbors
#' if they have at least one shared neighbor. The number of shared neighbors can be used
#' as an edge weight.
#' Javis and Patrick (1973) use a slightly
#' modified (see parameter `jp`) shared nearest neighbor graph for
#' clustering.
#'
#' @aliases sNN snn
#' @family NN functions
#'
#' @param x a data matrix, a [dist] object or a [kNN] object.
#' @param k number of neighbors to consider to calculate the shared nearest
#' neighbors.
#' @param kt minimum threshold on the number of shared nearest neighbors to
#' build the shared nearest neighbor graph. Edges are only preserved if
#' `kt` or more neighbors are shared.
#' @param jp In regular sNN graphs, two points that are not neighbors
#' can have shared neighbors.
#' Javis and Patrick (1973) requires the two points to be neighbors, otherwise
#' the count is zeroed out. `TRUE` uses this behavior.
#' @param search nearest neighbor search strategy (one of `"kdtree"`, `"linear"` or
#' `"dist"`).
#' @param sort sort by the number of shared nearest neighbors? Note that this
#' is expensive and `sort = FALSE` is much faster. sNN objects can be
#' sorted using `sort()`.
#' @param bucketSize max size of the kd-tree leafs.
#' @param splitRule rule to split the kd-tree. One of `"STD"`, `"MIDPT"`, `"FAIR"`,
#' `"SL_MIDPT"`, `"SL_FAIR"` or `"SUGGEST"` (SL stands for sliding). `"SUGGEST"` uses
#' ANNs best guess.
#' @param approx use approximate nearest neighbors. All NN up to a distance of
#' a factor of `(1 + approx) eps` may be used. Some actual NN may be omitted
#' leading to spurious clusters and noise points.  However, the algorithm will
#' enjoy a significant speedup.
#' @param decreasing logical; sort in decreasing order?
#' @param ... additional parameters are passed on.
#' @return An object of class `sNN` (subclass of [kNN] and [NN]) containing a list
#' with the following components:
#' \item{id }{a matrix with ids. }
#' \item{dist}{a matrix with the distances. }
#' \item{shared }{a matrix with the number of shared nearest neighbors. }
#' \item{k }{number of `k` used. }
#' \item{metric }{the used distance metric. }
#'
#' @author Michael Hahsler
#' @references R. A. Jarvis and E. A. Patrick. 1973. Clustering Using a
#' Similarity Measure Based on Shared Near Neighbors. _IEEE Trans. Comput._
#' 22, 11 (November 1973), 1025-1034.
#' \doi{10.1109/T-C.1973.223640}
#' @keywords model
#' @examples
#' data(iris)
#' x <- iris[, -5]
#'
#' # finding kNN and add the number of shared nearest neighbors.
#' k <- 5
#' nn <- sNN(x, k = k)
#' nn
#'
#' # shared nearest neighbor distribution
#' table(as.vector(nn$shared))
#'
#' # explore number of shared points for the k-neighborhood of point 10
#' i <- 10
#' nn$shared[i,]
#'
#' plot(nn, x)
#'
#' # apply a threshold to create a sNN graph with edges
#' # if more than 3 neighbors are shared.
#' nn_3 <- sNN(nn, kt = 3)
#' plot(nn_3, x)
#'
#' # get an adjacency list for the shared nearest neighbor graph
#' adjacencylist(nn_3)
#' @export
sNN <- function(x,
  k,
  kt = NULL,
  jp = FALSE,
  sort = TRUE,
  search = "kdtree",
  bucketSize = 10,
  splitRule = "suggest",
  approx = 0) {
  if (missing(k))
    k <- x$k

  if (inherits(x, "kNN")) {
    if (k != x$k) {
      if (ncol(x$id) < k)
        stop("kNN object does not contain enough neighbors!")
      if (!x$sort)
        x <- sort.kNN(x)
      x$id <- x$id[, 1:k]
      x$dist <- x$dist[, 1:k]
      x$k <- k
    }

  } else
    x <-
      kNN(
        x,
        k,
        sort = FALSE,
        search = search,
        bucketSize = bucketSize,
        splitRule = splitRule,
        approx = approx
      )

  x$shared <- SNN_sim_int(x$id, as.logical(jp[1]))
  x$sort_shared <- FALSE

  class(x) <- c("sNN", "kNN", "NN")

  if (sort)
    x <- sort.sNN(x)

  x$kt <- kt

  if (!is.null(kt)) {
    if (kt > k)
      stop("kt needs to be less than k.")
    rem <- x$shared < kt
    x$id[rem] <- NA
    x$dist[rem] <- NA
    x$shared[rem] <- NA
  }

  x
}

#' @rdname sNN
#' @export
sort.sNN <- function(x, decreasing = TRUE, ...) {
  if (isTRUE(x$sort_shared))
    return(x)
  if (is.null(x$shared))
    stop("Unable to sort. Number of shared neighbors is missing.")
  if (ncol(x$id) < 2) {
    x$sort <- TRUE
    x$sort_shared <- TRUE
    return(x)
  }

  ## sort first by number of shared points (decreasing) and break ties by id (increasing)
  k <- ncol(x$shared)
  o <- vapply(
    seq_len(nrow(x$shared)),
    function(i) order(k - x$shared[i, ], x$id[i, ], decreasing = !decreasing),
    integer(k)
  )
  for (i in seq_len(ncol(o))) {
    x$shared[i, ] <- x$shared[i, ][o[, i]]
    x$dist[i, ] <- x$dist[i, ][o[, i]]
    x$id[i, ] <- x$id[i, ][o[, i]]
  }

  x$sort <- FALSE
  x$sort_shared <- TRUE

  x
}

#' @rdname sNN
#' @export
print.sNN <- function(x, ...) {
  cat(
    "shared-nearest neighbors for ",
    nrow(x$id),
    " objects (k=",
    x$k,
    ", kt=",
    x$kt %||% "NULL",
    ").",
    "\n",
    sep = ""
  )
  cat("Available fields: ", toString(names(x)), "\n", sep = "")
}