File: seq_dist.Rd

package info (click to toggle)
r-cran-stringdist 0.9.15-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,424 kB
  • sloc: ansic: 1,690; sh: 13; makefile: 2
file content (123 lines) | stat: -rw-r--r-- 4,632 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/seqdist.R
\name{seq_dist}
\alias{seq_dist}
\alias{seq_distmatrix}
\title{Compute distance metrics between integer sequences}
\usage{
seq_dist(
  a,
  b,
  method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw"),
  weight = c(d = 1, i = 1, s = 1, t = 1),
  q = 1,
  p = 0,
  bt = 0,
  nthread = getOption("sd_num_thread")
)

seq_distmatrix(
  a,
  b,
  method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw"),
  weight = c(d = 1, i = 1, s = 1, t = 1),
  q = 1,
  p = 0,
  bt = 0,
  useNames = c("names", "none"),
  nthread = getOption("sd_num_thread")
)
}
\arguments{
\item{a}{(\code{list} of) \code{integer} or \code{numeric} vector(s). Will be converted with \code{as.integer} (target).}

\item{b}{(\code{list} of) \code{integer} or \code{numeric} vector(s). Will be converted with \code{as.integer} (source). 
Optional for \code{seq_distmatrix}.}

\item{method}{Distance metric. See \code{\link{stringdist-metrics}}}

\item{weight}{For \code{method='osa'} or \code{'dl'}, the penalty for
deletion, insertion, substitution and transposition, in that order. When
\code{method='lv'}, the penalty for transposition is ignored. When
\code{method='jw'}, the weights associated with characters of \code{a},
characters from \code{b} and the transposition weight, in that order. 
Weights must be positive and not exceed 1. \code{weight} is ignored
completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'},
\code{'jaccard'}, or \code{'lcs'}.}

\item{q}{Size of the \eqn{q}-gram; must be nonnegative. Only applies to
\code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}.}

\item{p}{Prefix factor for Jaro-Winkler distance. The valid range for 
\code{p} is \code{0 <= p <= 0.25}. If \code{p=0} (default), the
Jaro-distance is returned. Applies only to \code{method='jw'}.}

\item{bt}{Winkler's boost threshold. Winkler's prefix factor is
only applied when the Jaro distance is larger than \code{bt}.
Applies only to \code{method='jw'} and \code{p>0}.}

\item{nthread}{Maximum number of threads to use. By default, a sensible
number of threads is chosen, see \code{\link{stringdist-parallelization}}.}

\item{useNames}{label the output matrix with \code{names(a)} and \code{names(b)}?}
}
\value{
\code{seq_dist} returns a numeric vector with pairwise distances between \code{a}
and \code{b} of length \code{max(length(a),length(b))}.

For \code{seq_distmatrix} there are two options. If \code{b} is missing, the 
\code{\link[stats]{dist}} object corresponding to the \code{length(a) X
length(a)} distance matrix is returned. If \code{b} is specified, the
\code{length(a) X length(b)} distance matrix is returned.
   
If any element of \code{a} or \code{b} is \code{NA_integer_}, the distance with
any matched integer vector will result in \code{NA}. Missing values in the sequences
themselves are treated as a number and not treated specially (also see the examples).
}
\description{
\code{seq_dist} computes pairwise string distances between elements of 
\code{a} and \code{b}, where the argument with fewer elements is recycled. 
\code{seq_distmatrix} computes the distance matrix with rows according to
\code{a} and columns according to \code{b}.
}
\section{Notes}{

Input vectors are converted with \code{as.integer}. This causes truncation for numeric
vectors (e.g. \code{pi} will be treated as \code{3L}).
}

\examples{
# Distances between lists of integer vectors. Note the postfix 'L' to force 
# integer storage. The shorter argument (a) is recycled.
a <- list(c(102L, 107L))                        # fu
b <- list(c(102L,111L,111L),c(102L,111L,111L))  # foo, foo
seq_dist(a,b)

# translate strings to a list of integer sequences 
a <- lapply(c("foo","bar","baz"),utf8ToInt)
seq_distmatrix(a)

# Note how missing values are treated. NA's as part of the sequence are treated 
# as an integer (the representation of NA_integer_).
a <- list(NA_integer_,c(102L, 107L))
b <- list(c(102L,111L,111L),c(102L,111L,NA_integer_))  
seq_dist(a,b)

\dontrun{
# Distance between sentences based on word order. Note: words must match exactly or they
# are treated as completely different.
#
# For this example you need to have the 'hashr' package installed.
a <- "Mary had a little lamb"
a.words <- strsplit(a,"[[:blank:]]+")
a.int <- hashr::hash(a.words)
b <- c("a little lamb had Mary",
           "had Mary a little lamb")
b.int <- hashr::hash(strsplit(b,"[[:blank:]]+"))
seq_dist(a.int,b.int)
}

}
\seealso{
\code{\link{seq_sim}}, \code{\link{seq_amatch}}, \code{\link{seq_qgrams}}
}