File: seq_amatch.Rd

package info (click to toggle)
r-cran-stringdist 0.9.15-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,424 kB
  • sloc: ansic: 1,690; sh: 13; makefile: 2
file content (123 lines) | stat: -rw-r--r-- 4,410 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/amatch.R
\name{seq_amatch}
\alias{seq_amatch}
\alias{seq_ain}
\title{Approximate matching for integer sequences.}
\usage{
seq_amatch(
  x,
  table,
  nomatch = NA_integer_,
  matchNA = TRUE,
  method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw"),
  weight = c(d = 1, i = 1, s = 1, t = 1),
  maxDist = 0.1,
  q = 1,
  p = 0,
  bt = 0,
  nthread = getOption("sd_num_thread")
)

seq_ain(x, table, ...)
}
\arguments{
\item{x}{(\code{list} of) \code{integer} or \code{numeric} vector(s) to be
approximately matched. Will be converted with \code{as.integer}.}

\item{table}{(\code{list} of) \code{integer} or \code{numeric} vector(s)
serving as lookup table for matching. Will be converted with
\code{as.integer}.}

\item{nomatch}{The value to be returned when no match is found. This is
coerced to integer.}

\item{matchNA}{Should \code{NA}'s be matched? Default behaviour mimics the 
behaviour of base \code{\link[base]{match}}, meaning that \code{NA} matches
\code{NA}. With \code{NA}, we mean a missing entry in the \code{list}, represented as \code{NA_integer_}. 
If one of the integer sequences stored in the list has an \code{NA} entry,
this is just treated as another integer (the representation of
\code{NA_integer_}).}

\item{method}{Matching algorithm to use. See \code{\link{stringdist-metrics}}.}

\item{weight}{For \code{method='osa'} or \code{'dl'}, the penalty for
deletion, insertion, substitution and transposition, in that order. When
\code{method='lv'}, the penalty for transposition is ignored. When
\code{method='jw'}, the weights associated with integers in elements of \code{x},
integers in elements of \code{table} and the transposition weight, in that order.
Weights must be positive and not exceed 1. \code{weight} is ignored
completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'},
\code{'jaccard'}, or \code{'lcs'}.}

\item{maxDist}{Elements in \code{x} will not be matched with elements of 
\code{table} if their distance is larger than \code{maxDist}. Note that the
maximum distance between sequences depends on the method: it should always be
specified.}

\item{q}{q-gram size, only when method is \code{'qgram'}, \code{'jaccard'},
or \code{'cosine'}.}

\item{p}{Winkler's prefix parameter for Jaro-Winkler distance, with
\eqn{0\leq p\leq0.25}. Only when method is \code{'jw'}.}

\item{bt}{Winkler's boost threshold. Winkler's prefix factor is
only applied when the Jaro distance is larger than \code{bt}.
Applies only to \code{method='jw'} and \code{p>0}.}

\item{nthread}{Number of threads used by the underlying C-code. A sensible
default is chosen, see \code{\link{stringdist-parallelization}}.}

\item{...}{parameters to pass to \code{seq_amatch} (except \code{nomatch})}
}
\value{
\code{seq_amatch} returns the position of the closest match of \code{x}
  in \code{table}. When multiple matches with the same minimal distance
  metric exist, the first one is returned. \code{seq_ain} returns a
  \code{logical} vector of length \code{length(x)} indicating whether an
  element of \code{x} approximately matches an element in \code{table}.
}
\description{
For a \code{list} of integer vectors \code{x}, find the closest matches in a
\code{list} of integer or numeric vectors in \code{table}.
}
\section{Notes}{

\code{seq_ain} is currently defined as 

\code{seq_ain <- function(x,table,...) seq_amatch(x, table, nomatch=0, ...) > 0}

All input vectors are converted with \code{as.integer}. This causes truncation for numeric
vectors (e.g. \code{pi} will be treated as \code{3L}).
}

\examples{

x <- list(1:3,c(3:1),c(1L,3L,4L))
table <- list(
  c(5L,3L,1L,2L)
  ,1:4
)
seq_amatch(x,table,maxDist=2)

# behaviour with missings
seq_amatch(list(c(1L,NA_integer_,3L),NA_integer_), list(1:3),maxDist=1)


\dontrun{
# Match sentences based on word order. Note: words must match exactly or they
# are treated as completely different.
#
# For this example you need to have the 'hashr' package installed.
x <- "Mary had a little lamb"
x.words <- strsplit(x,"[[:blank:]]+")
x.int <- hashr::hash(x.words)
table <- c("a little lamb had Mary",
           "had Mary a little lamb")
table.int <- hashr::hash(strsplit(table,"[[:blank:]]+"))
seq_amatch(x.int,table.int,maxDist=3)
}
}
\seealso{
\code{\link{seq_dist}}, \code{\link{seq_sim}}, \code{\link{seq_qgrams}}
}