1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
|
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/qgrams.R
\name{qgrams}
\alias{qgrams}
\title{Get a table of qgram counts from one or more character vectors.}
\usage{
qgrams(..., .list = NULL, q = 1L, useBytes = FALSE, useNames = !useBytes)
}
\arguments{
\item{...}{any number of (named) arguments that will be coerced to character with \code{as.character}.}
\item{.list}{Will be concatenated with the \code{...} argument(s). Useful for adding character vectors named \code{'q'} or \code{'useNames'}.}
\item{q}{size of q-gram, must be non-negative.}
\item{useBytes}{Determine byte-wise qgrams. \code{useBytes=TRUE} is faster but may yield different
results depending on character encoding. For \code{ASCII} it is identical. See also \code{\link{stringdist}} under Encoding issues.}
\item{useNames}{Add q-grams as column names. If \code{useBytes=useNames=TRUE}, the q-byte sequences are represented as 2 hexadecimal numbers
per byte, separated by a vertical bar (\code{|}).}
}
\value{
A table with \eqn{q}-gram counts. The detected \eqn{q}-grams are used as column names and the argument names as row names.
If no argument names were provided, they will be generated.
}
\description{
Get a table of qgram counts from one or more character vectors.
}
\section{Details}{
The input is converted to \code{character}. If \code{useBytes=TRUE}, each element is
converted to \code{utf8} and then to \code{integer} as in \code{\link{stringdist}}.
Next, the data is passed to the underlying routine.
Strings with fewer than \code{q} characters and elements containing \code{NA} are skipped. Using \code{q=0}
therefore counts the number of empty strings \code{""} occurring in each argument.
}
\examples{
# tabulate the 3-grams of a single string
qgrams('hello world',q=3)
# q-grams are counted uniquely over a character vector:
# all elements of one argument are tallied together into a single row
qgrams(rep('hello world',2),q=3)
# to count them separately for each element, do something like
x <- c('hello', 'world')
lapply(x,qgrams, q=3)
# output rows may be named, and you can pass any number of character vectors;
# the argument names (A, B, C) are used as the row names of the table
x <- "I will not buy this record, it is scratched"
y <- "My hovercraft is full of eels"
z <- c("this", "is", "a", "dead","parrot")
qgrams(A = x, B = y, C = z,q=2)
# a tongue twister, showing the effects of useBytes and useNames
x <- "peter piper picked a peck of pickled peppers"
# defaults: useBytes=FALSE and therefore useNames=TRUE (q-grams as column names)
qgrams(x, q=2)
# same counts, but without q-grams as column names
qgrams(x, q=2, useNames=FALSE)
# byte-wise q-grams; useNames defaults to !useBytes, so it is FALSE here
qgrams(x, q=2, useBytes=TRUE)
# byte-wise q-grams with column names: 2 hexadecimal numbers per byte,
# separated by a vertical bar
qgrams(x, q=2, useBytes=TRUE, useNames=TRUE)
}
\seealso{
\code{\link{stringdist}}, \code{\link{amatch}}
}
|