File: utf8_normalize.Rd

package info (click to toggle)
r-cran-utf8 1.2.6-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 2,060 kB
  • sloc: ansic: 25,890; python: 1,616; sh: 13; makefile: 6
file content (57 lines) | stat: -rw-r--r-- 1,717 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utf8.R
\name{utf8_normalize}
\alias{utf8_normalize}
\title{Text Normalization}
\usage{
utf8_normalize(
  x,
  ...,
  map_case = FALSE,
  map_compat = FALSE,
  map_quote = FALSE,
  remove_ignorable = FALSE
)
}
\arguments{
\item{x}{character object.}

\item{...}{These dots are for future extensions and must be empty.}

\item{map_case}{a logical value indicating whether to apply Unicode case
mapping to the text. For most languages, this transformation changes
uppercase characters to their lowercase equivalents.}

\item{map_compat}{a logical value indicating whether to apply Unicode
compatibility mappings to the characters, i.e., the mappings required for the
NFKC and NFKD normal forms.}

\item{map_quote}{a logical value indicating whether to replace curly single
quotes and Unicode apostrophe characters with the ASCII apostrophe (U+0027).}

\item{remove_ignorable}{a logical value indicating whether to remove Unicode
"default ignorable" characters like zero-width spaces and soft hyphens.}
}
\value{
The result is a character object with the same attributes as
\code{x} but with \code{Encoding} set to \code{"UTF-8"}.
}
\description{
Transform text to normalized form, optionally mapping to lowercase and
applying compatibility maps.
}
\details{
\code{utf8_normalize()} converts the elements of a character object to Unicode
Normalization Form C (NFC, canonical composition), additionally applying the
character maps specified by the \code{map_case}, \code{map_compat},
\code{map_quote}, and \code{remove_ignorable} arguments.
}
\examples{

angstrom <- c("\u00c5", "\u0041\u030a", "\u212b")
utf8_normalize(angstrom) == "\u00c5"

}
\seealso{
\code{\link[=as_utf8]{as_utf8()}}.
}