File: qgrams.Rd

package info (click to toggle)
r-cran-stringdist 0.9.15-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,424 kB
  • sloc: ansic: 1,690; sh: 13; makefile: 2
file content (69 lines) | stat: -rw-r--r-- 2,475 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/qgrams.R
\name{qgrams}
\alias{qgrams}
\title{Get a table of qgram counts from one or more character vectors.}
\usage{
qgrams(..., .list = NULL, q = 1L, useBytes = FALSE, useNames = !useBytes)
}
\arguments{
\item{...}{any number of (named) arguments that will be coerced to character with \code{as.character}.}

\item{.list}{Will be concatenated with the \code{...} argument(s). Useful for adding character vectors named \code{'q'} or \code{'useNames'}.}

\item{q}{size of q-gram, must be non-negative.}

\item{useBytes}{Determine byte-wise qgrams. \code{useBytes=TRUE} is faster but may yield different
results depending on character encoding. For \code{ASCII} it is identical. See also \code{\link{stringdist}} under Encoding issues.}

\item{useNames}{Add q-grams as column names. If \code{useBytes=useNames=TRUE}, the q-byte sequences are represented as 2 hexadecimal numbers
per byte, separated by a vertical bar (\code{|}).}
}
\value{
A table with \eqn{q}-gram counts. Detected \eqn{q}-grams are used as column names and the argument names as row names.
If no argument names were provided, they will be generated.
}
\description{
Get a table of qgram counts from one or more character vectors.
}
\section{Details}{

The input is converted to \code{character}. If \code{useBytes=TRUE}, each element is 
converted to \code{utf8} and then to \code{integer} as in \code{\link{stringdist}}. 
Next, the data is passed to the underlying routine.

Strings with fewer than \code{q} characters and elements containing \code{NA} are skipped. Using \code{q=0} 
therefore counts the number of empty strings \code{""} occurring in each argument.
}

\examples{

qgrams('hello world',q=3)

# q-grams are counted uniquely over a character vector
qgrams(rep('hello world',2),q=3)

# to count them separately, do something like
x <- c('hello', 'world')
lapply(x,qgrams, q=3)

# output rows may be named, and you can pass any number of character vectors
x <- "I will not buy this record, it is scratched"
y <- "My hovercraft is full of eels"
z <- c("this", "is", "a", "dead","parrot")
qgrams(A = x, B = y, C = z,q=2)

# a tongue twister, showing the effects of useBytes and useNames
x <- "peter piper picked a peck of pickled peppers"
qgrams(x, q=2) 
qgrams(x, q=2, useNames=FALSE) 
qgrams(x, q=2, useBytes=TRUE)
qgrams(x, q=2, useBytes=TRUE, useNames=TRUE)




}
\seealso{
\code{\link{stringdist}}, \code{\link{amatch}}
}