1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
|
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/amatch.R
\name{amatch}
\alias{amatch}
\alias{ain}
\title{Approximate string matching}
\usage{
amatch(
x,
table,
nomatch = NA_integer_,
matchNA = TRUE,
method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw",
"soundex"),
useBytes = FALSE,
weight = c(d = 1, i = 1, s = 1, t = 1),
maxDist = 0.1,
q = 1,
p = 0,
bt = 0,
nthread = getOption("sd_num_thread")
)
ain(x, table, ...)
}
\arguments{
\item{x}{elements to be approximately matched: will be coerced to
\code{character} unless it is a list consisting of \code{integer} vectors.}
\item{table}{lookup table for matching. Will be coerced to \code{character}
unless it is a list consisting of \code{integer} vectors.}
\item{nomatch}{The value to be returned when no match is found. This is
coerced to integer.}
\item{matchNA}{Should \code{NA}'s be matched? Default behaviour mimics the
behaviour of base \code{\link[base]{match}}, meaning that \code{NA} matches
\code{NA} (see also the note on \code{NA} handling below).}
\item{method}{Matching algorithm to use. See \code{\link{stringdist-metrics}}.}
\item{useBytes}{Perform byte-wise comparison. See \code{\link{stringdist-encoding}}.}
\item{weight}{For \code{method='osa'} or \code{'dl'}, the penalty for
deletion, insertion, substitution and transposition, in that order. When
\code{method='lv'}, the penalty for transposition is ignored. When
\code{method='jw'}, the weights associated with characters of \code{a},
characters from \code{b} and the transposition weight, in that order.
Weights must be positive and not exceed 1. \code{weight} is ignored
completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'},
\code{'jaccard'}, \code{'lcs'}, or \code{'soundex'}.}
\item{maxDist}{Elements in \code{x} will not be matched with elements of
\code{table} if their distance is larger than \code{maxDist}. Note that the
maximum distance between strings depends on the method: it should always be
specified.}
\item{q}{q-gram size, only when method is \code{'qgram'}, \code{'jaccard'},
or \code{'cosine'}.}
\item{p}{Winkler's 'prefix' parameter for Jaro-Winkler distance, with
\eqn{0\leq p\leq0.25}. Only when method is \code{'jw'}.}
\item{bt}{Winkler's boost threshold. Winkler's prefix factor is
only applied when the Jaro distance is larger than \code{bt}.
Applies only to \code{method='jw'} and \code{p>0}.}
\item{nthread}{Number of threads used by the underlying C-code. A sensible
default is chosen, see \code{\link{stringdist-parallelization}}.}
\item{...}{parameters to pass to \code{amatch} (except \code{nomatch})}
}
\value{
\code{amatch} returns the position of the closest match of \code{x}
in \code{table}. When multiple matches with the same smallest distance
metric exist, the first one is returned. \code{ain} returns a
\code{logical} vector of length \code{length(x)} indicating whether an
element of \code{x} approximately matches an element in \code{table}.
}
\description{
Approximate string matching equivalents of \code{R}'s native
\code{\link[base]{match}} and \code{\%in\%}.
}
\details{
\code{ain} is currently defined as
\code{ain <- function(x,table,...) amatch(x, table, nomatch=0,...) > 0}
}
\section{Note on \code{NA} handling}{
\code{R}'s native \code{\link[base]{match}} function matches \code{NA} with
\code{NA}. This may feel inconsistent with \code{R}'s usual \code{NA}
handling, since for example \code{NA==NA} yields
\code{NA} rather than \code{TRUE}. In most cases, one may reason about the
behaviour under \code{NA} along the lines of ``if one of the arguments is
\code{NA}, the result shall be \code{NA}'', simply because not all
information necessary to execute the function is available. One uses special
functions such as \code{is.na}, \code{is.null} \emph{etc.} to handle special
values.
The \code{amatch} function mimics the behaviour of \code{\link[base]{match}}
by default: \code{NA} is matched with \code{NA} and with nothing else. Note
that this is inconsistent with the behaviour of \code{\link{stringdist}}
since \code{stringdist} yields \code{NA} when at least one of the arguments
is \code{NA}. The same inconsistency exists between \code{\link[base]{match}}
and \code{\link[utils]{adist}}. In \code{amatch} this behaviour can be
controlled by setting \code{matchNA=FALSE}. In that case, if any of the
arguments in \code{x} is \code{NA}, the \code{nomatch} value is returned,
regardless of whether \code{NA} is present in \code{table}. In
\code{\link[base]{match}} the behaviour can be controlled by setting the
\code{incomparables} option.
}
\examples{
# let's see which sci-fi heroes are stringdistantly nearest
amatch("leia",c("uhura","leela"),maxDist=5)
# we can restrict the search
amatch("leia",c("uhura","leela"),maxDist=1)
# we can match each value in the find vector against values in the lookup table:
amatch(c("leia","uhura"),c("ripley","leela","scully","trinity"),maxDist=2)
# setting nomatch returns a different value when no match is found
amatch("leia",c("uhura","leela"),maxDist=1,nomatch=0)
# this is always true if maxDist is Inf
ain("leia",c("uhura","leela"),maxDist=Inf)
# Let's look in a neighbourhood of maximum 2 typos (by default, the OSA algorithm is used)
ain("leia",c("uhura","leela"), maxDist=2)
}
\seealso{
Other matching:
\code{\link{afind}()}
}
\concept{matching}
|