File: stringdist.Rd

package info (click to toggle)
r-cran-stringdist 0.9.15-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 1,424 kB
sloc: ansic: 1,690; sh: 13; makefile: 2
file content (176 lines) | stat: -rw-r--r-- 6,333 bytes
parent folder | download | duplicates (3)
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/stringdist.R
\name{stringdist}
\alias{stringdist}
\alias{stringdistmatrix}
\title{Compute distance metrics between strings}
\usage{
stringdist(
  a,
  b,
  method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw",
    "soundex"),
  useBytes = FALSE,
  weight = c(d = 1, i = 1, s = 1, t = 1),
  q = 1,
  p = 0,
  bt = 0,
  nthread = getOption("sd_num_thread")
)

stringdistmatrix(
  a,
  b,
  method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw",
    "soundex"),
  useBytes = FALSE,
  weight = c(d = 1, i = 1, s = 1, t = 1),
  q = 1,
  p = 0,
  bt = 0,
  useNames = c("none", "strings", "names"),
  nthread = getOption("sd_num_thread")
)
}
\arguments{
\item{a}{R object (target); will be converted by \code{as.character}}

\item{b}{R object (source); will be converted by \code{as.character}.
This argument is optional for \code{stringdistmatrix} (see section \code{Value}).}

\item{method}{Method for distance calculation. The default is \code{"osa"},
see \code{\link{stringdist-metrics}}.}

\item{useBytes}{Perform byte-wise comparison, see
\code{\link{stringdist-encoding}}.}

\item{weight}{For \code{method='osa'} or \code{'dl'}, the penalty for
deletion, insertion, substitution and transposition, in that order. When
\code{method='lv'}, the penalty for transposition is ignored. When
\code{method='jw'}, the weights associated with characters of \code{a},
characters from \code{b} and the transposition weight, in that order. 
Weights must be positive and not exceed 1. \code{weight} is ignored
completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'},
\code{'jaccard'}, \code{'lcs'}, or \code{'soundex'}.}

\item{q}{Size of the \eqn{q}-gram; must be nonnegative. Only applies to
\code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}.}

\item{p}{Prefix factor for Jaro-Winkler distance. The valid range for 
\code{p} is \code{0 <= p <= 0.25}. If \code{p=0} (default), the
Jaro-distance is returned. Applies only to \code{method='jw'}.}

\item{bt}{Winkler's boost threshold. Winkler's prefix factor is
only applied when the Jaro distance is larger than \code{bt}.
Applies only to \code{method='jw'} and \code{p>0}.}

\item{nthread}{Maximum number of threads to use. By default, a sensible
number of threads is chosen, see \code{\link{stringdist-parallelization}}.}

\item{useNames}{Use input vectors as row and column names?}
}
\value{
For \code{stringdist},  a vector with string distances of size
  \code{max(length(a),length(b))}.
 
For \code{stringdistmatrix}: If both \code{a} and \code{b} are passed, a
\code{length(a)xlength(b)} \code{matrix}. If a single argument \code{a} is
given, an object of class \code{\link[stats]{dist}} is returned.
 
Distances are nonnegative if they can be computed, \code{NA} if any of the
two argument strings is \code{NA} and \code{Inf} when \code{maxDist} is
exceeded or, in case of the hamming distance, when the two compared strings
have different length.
}
\description{
\code{stringdist} computes pairwise string distances between elements of
\code{a} and \code{b}, where the argument with fewer elements is recycled.
\code{stringdistmatrix} computes the string distance matrix with rows
according to
\code{a} and columns according to \code{b}.
}
\examples{

# Simple example using optimal string alignment
stringdist("ca","abc")

# computing a 'dist' object
d <- stringdistmatrix(c('foo','bar','boo','baz'))
# try plot(hclust(d))

# The following gives a matrix
stringdistmatrix(c("foo","bar","boo"),c("baz","buz"))

# An example using Damerau-Levenshtein distance (multiple editing of substrings allowed)
stringdist("ca","abc",method="dl")

# string distance matching is case sensitive:
stringdist("ABC","abc")

# so you may want to normalize a bit:
stringdist(tolower("ABC"),"abc")

# stringdist recycles the shortest argument:
stringdist(c('a','b','c'),c('a','c'))

# stringdistmatrix gives the distance matrix (by default for optimal string alignment):
stringdistmatrix(c('a','b','c'),c('a','c'))

# different edit operations may be weighted; e.g. weighted transposition:
stringdist('ab','ba',weight=c(1,1,1,0.5))

# Non-unit weights for insertion and deletion make the distance metric asymmetric
stringdist('ca','abc')
stringdist('abc','ca')
stringdist('ca','abc',weight=c(0.5,1,1,1))
stringdist('abc','ca',weight=c(0.5,1,1,1))

# Hamming distance is undefined for 
# strings of unequal lengths so stringdist returns Inf
stringdist("ab","abc",method="h")
# For strings of equal length it counts the number of unequal characters as they occur
# in the strings from beginning to end
stringdist("hello","HeLl0",method="h")

# The lcs (longest common substring) distance returns the number of 
# characters that are not part of the lcs.
#
# Here, the lcs is either 'a' or 'b' and one character cannot be paired:
stringdist('ab','ba',method="lcs")
# Here the lcs is 'surey' and 'v', 'g' and one 'r' of 'surgery' are not paired
stringdist('survey','surgery',method="lcs")


# q-grams are based on the difference between occurrences of q consecutive characters
# in string a and string b.
# Since each of the characters a, b and c occurs in both 'abc' and 'cba', the q=1 distance equals 0:
stringdist('abc','cba',method='qgram',q=1)

# since the first string consists of 'ab','bc' and the second 
# of 'cb' and 'ba', the q=2 distance equals 4 (they have no q=2 grams in common):
stringdist('abc','cba',method='qgram',q=2)

# Wikipedia has the following example of the Jaro-distance. 
stringdist('MARTHA','MATHRA',method='jw')
# Note that stringdist gives a _distance_ where Wikipedia gives the corresponding
# _similarity measure_. To get the Wikipedia result:
1 - stringdist('MARTHA','MATHRA',method='jw')

# The corresponding Jaro-Winkler distance can be computed by setting p=0.1
stringdist('MARTHA','MATHRA',method='jw',p=0.1)
# or, as a similarity measure
1 - stringdist('MARTHA','MATHRA',method='jw',p=0.1)

# This gives distance 1 since Euler and Gauss translate to different soundex codes.
stringdist('Euler','Gauss',method='soundex')
# Euler and Ellery translate to the same code and have distance 0
stringdist('Euler','Ellery',method='soundex')





}
\seealso{
\code{\link{stringsim}}, \code{\link{qgrams}}, \code{\link{amatch}}, \code{\link{afind}}
}