File: dist.Rd

package info (click to toggle)
r-cran-proxy 0.4-27-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 372 kB
  • sloc: ansic: 1,247; sh: 12; makefile: 5
file content (181 lines) | stat: -rwxr-xr-x 6,754 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
\name{dist}
\alias{dist}
\alias{simil}
\alias{print.simil}
\alias{print.dist}
\alias{print.crosssimil}
\alias{print.crossdist}
\alias{as.matrix.dist}
\alias{as.matrix.simil}
\alias{pr_simil2dist}
\alias{pr_dist2simil}
\alias{as.matrix}
\alias{as.dist}
\alias{as.simil}
\alias{row.dist}
\alias{col.dist}
\title{Matrix Distance/Similarity Computation}
\description{
  These functions compute and return the auto-distance/similarity matrix
  between either rows or columns of a matrix/data frame, or a list,
  as well as the cross-distance matrix between two matrices/data frames/lists.
}
\usage{
dist(x, y = NULL, method = NULL, ..., diag = FALSE, upper = FALSE,
     pairwise = FALSE, by_rows = TRUE, convert_similarities = TRUE,
     auto_convert_data_frames = TRUE)
simil(x, y = NULL, method = NULL, ..., diag = FALSE, upper = FALSE,
      pairwise = FALSE, by_rows = TRUE, convert_distances = TRUE,
      auto_convert_data_frames = TRUE)

pr_dist2simil(x)
pr_simil2dist(x)

as.dist(x, FUN = NULL)
as.simil(x, FUN = NULL)

\method{as.matrix}{dist}(x, diag = 0, \dots)
\method{as.matrix}{simil}(x, diag = NA, \dots)
}
\arguments{
  \item{x}{For \code{dist} and \code{simil}, a numeric matrix object, a data frame, or a list. A vector
    will be converted into a column matrix. For \code{as.simil} and
    \code{as.dist}, an object of class \code{dist} and
    \code{simil}, respectively, or a numeric matrix. For
    \code{pr_dist2simil} and \code{pr_simil2dist}, any numeric vector.}
  \item{y}{\code{NULL}, or an object similar to \code{x}.}
  \item{method}{a function, a registry entry, or a mnemonic string referencing the
    proximity measure. A list of all available measures can be obtained
    using \code{\link{pr_DB}} (see examples). The default for \code{dist} is
    \code{"Euclidean"}, and for \code{simil} \code{"correlation"}.}
  \item{diag}{logical value indicating whether the diagonal of the
    distance/similarity matrix should be printed by
    \code{\link{print.dist}}/\code{\link{print.simil}}. Note that the
    diagonal values are never stored in \code{dist} objects. 

    In the context of \code{as.matrix}, \code{diag} is the value placed
    on the diagonal to represent self-proximities. For similarities, this
    defaults to \code{NA}: a priori there is no upper bound, so
    the maximum similarity needs to be specified by the user.}
  \item{upper}{logical value indicating whether the upper triangle of the
    distance/similarity matrix should be printed by
    \code{\link{print.dist}}/\code{\link{print.simil}}}
  \item{pairwise}{logical value indicating whether proximities should be
    computed only for the corresponding pairs of \code{x} and \code{y}
    (i.e., the diagonal of the cross-proximity matrix).}
  \item{by_rows}{logical indicating whether proximities should be
    computed between rows (\code{TRUE}) or columns (\code{FALSE}).}
  \item{convert_similarities, convert_distances}{logical indicating
    whether distances should be automatically converted into
    similarities (and the other way round) if needed.}
  \item{auto_convert_data_frames}{logical indicating whether data frames
    should be converted to matrices if all variables are numeric,
    or all are logical, or all are complex.}
  \item{FUN}{optional function to be used by \code{as.dist} and
    \code{as.simil}. If \code{NULL}, it is looked up in the method
    registry. If there is none specified there, \code{FUN} defaults to
    \code{pr_simil2dist} and \code{pr_dist2simil}, respectively.}
  \item{\dots}{further arguments passed to the proximity function.}
}
\details{
  The interface is fashioned after \code{\link[stats]{dist}}, but can
  also compute cross-distances, and allows user extensions by means of
  a registry of all proximity measures (see \code{\link{pr_DB}}).

  Missing values are allowed but are excluded from all computations 
  involving the rows within which they occur. If some columns are
  excluded in calculating a Euclidean, Manhattan, Canberra or
  Minkowski distance, the sum is scaled up proportionally to the
  number of columns used (compare \code{\link[stats]{dist}} in
  package \pkg{stats}).

  Data frames are silently coerced to matrices (unless
  \code{auto_convert_data_frames} is \code{FALSE}) if all columns are
  \code{numeric}, all are \code{logical}, or all are \code{complex}.

  Distance measures can be used with \code{simil}, and similarity
  measures with \code{dist}. In these cases, the result is transformed
  accordingly using the specified coercion functions (default:
  \eqn{\mathrm{pr\_simil2dist}(x) = 1 - \mathrm{abs}(x)}{pr_simil2dist(x) = 1 - abs(x)} and \eqn{\mathrm{pr\_dist2simil}(x) = 1 / (1 + x)}{pr_dist2simil(x) = 1 / (1 + x)}).
  Objects of class \code{simil} and \code{dist} can be converted into
  one another using \code{as.dist} and \code{as.simil}, respectively.

  Distance and similarity objects can conveniently be subset
  (see examples). Note that duplicate indexes are silently ignored.
}
\value{
  Auto distances/similarities are returned as an object of class \code{dist}/\code{simil} and 
  cross-distances/similarities as an object of class \code{crossdist}/\code{crosssimil}. 
}
\references{
  Anderberg, M.R. (1973), \emph{Cluster analysis for applications},
  359 pp., Academic Press, New York, NY, USA.
  
  Cox, T.F. and Cox, M.A.A. (2001), \emph{Multidimensional Scaling},
  Chapman and Hall.
  
  Sokal, R.R. and Sneath, P.H.A. (1963), \emph{Principles of Numerical
  Taxonomy}, W. H. Freeman and Co., San Francisco.
}
\author{David Meyer \email{David.Meyer@R-project.org}
  and Christian Buchta \email{Christian.Buchta@wu-wien.ac.at}}

\seealso{\code{\link[stats]{dist}} for compatibility information, and
  \code{\link{pr_DB}} for the proximity data base.}
\examples{
### show available proximities
summary(pr_DB)

### get more information about a particular one
pr_DB$get_entry("Jaccard")

### binary data
x <- matrix(sample(c(FALSE, TRUE), 8, rep = TRUE), ncol = 2)
dist(x, method = "Jaccard")

### for real-valued data
dist(x, method = "eJaccard")

### for positive real-valued data
dist(x, method = "fJaccard")

### cross distances
dist(x, x, method = "Jaccard")

### pairwise (diagonal)
dist(x, x, method = "Jaccard", 
	 pairwise = TRUE)

### this is the same but less efficient
as.matrix(stats::dist(x, method = "binary"))

### numeric data
x <- matrix(rnorm(16), ncol = 4)

## test inheritance of names
rownames(x) <- LETTERS[1:4]
colnames(x) <- letters[1:4]
dist(x)
dist(x, x)

## custom distance function
f <- function(x, y) sum(x * y)
dist(x, f)

## working with lists
z <- unlist(apply(x, 1, list), recursive = FALSE)
(d <- dist(z))
dist(z, z)

## subsetting
d[[1:2]]
subset(d, c(1,3,4))
d[[c(1,2,2)]]	    # duplicate index gets ignored

## transformations and self-proximities
as.matrix(as.simil(d, function(x) exp(-x)), diag = 1)

## row and column indexes
row.dist(d)
col.dist(d)
}
\keyword{cluster}