1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
|
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_conversion.R
\name{stri_enc_toutf8}
\alias{stri_enc_toutf8}
\title{Convert Strings To UTF-8}
\usage{
stri_enc_toutf8(str, is_unknown_8bit = FALSE, validate = FALSE)
}
\arguments{
\item{str}{a character vector to be converted}
\item{is_unknown_8bit}{a single logical value, see Details}
\item{validate}{a single logical value (can be \code{NA}), see Details}
}
\value{
Returns a character vector.
}
\description{
Converts character strings with declared (marked) encodings
to UTF-8 strings.
}
\details{
If \code{is_unknown_8bit} is set to \code{FALSE} (the default),
then R encoding marks are used, see \code{\link{stri_enc_mark}}.
Bytes-marked strings will cause the function to fail.
If a string is in UTF-8 and has a byte order mark (BOM),
then the BOM will be silently removed from the output string.
If the default encoding is UTF-8, see \code{\link{stri_enc_get}},
then strings marked with \code{native} are -- for efficiency reasons --
returned as-is, i.e., with unchanged markings.
A similar behavior is observed when calling \code{\link{enc2utf8}}.
For \code{is_unknown_8bit=TRUE}, if a string is declared to be neither
in ASCII nor in UTF-8, then all byte codes > 127 are replaced with
the Unicode REPLACEMENT CHARACTER (\\Ufffd).
Note that the REPLACEMENT CHARACTER may be interpreted as the Unicode
missing value for single characters.
Here a \code{bytes}-marked string is assumed to use an 8-bit encoding
that extends the ASCII map.
What is more, in both cases, setting \code{validate} to \code{TRUE}
or \code{NA} validates the resulting UTF-8 byte stream.
If \code{validate=TRUE}, then
any incorrect byte sequences will be
replaced with the REPLACEMENT CHARACTER.
This option may be used
when you want to fix an invalid UTF-8 byte sequence.
For \code{validate=NA}, a bogus string will be replaced with a missing value.
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_conversion:
\code{\link{about_encoding}},
\code{\link{stri_enc_fromutf32}()},
\code{\link{stri_enc_toascii}()},
\code{\link{stri_enc_tonative}()},
\code{\link{stri_enc_toutf32}()},
\code{\link{stri_encode}()}
}
\concept{encoding_conversion}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
|