File: utf8.R

package info (click to toggle)
r-base 4.5.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 112,924 kB
  • sloc: ansic: 291,338; fortran: 111,889; javascript: 14,798; yacc: 6,154; sh: 5,689; makefile: 5,239; tcl: 4,562; perl: 963; objc: 791; f90: 758; asm: 258; java: 31; sed: 1
file content (56 lines) | stat: -rw-r--r-- 1,942 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
## This takes the definition of UTF-8 as RFC3629 (2003),
## but not all software does.
## See also https://en.wikipedia.org/wiki/UTF-8

x <- 1L:0x10FFFF
y <- intToUtf8(x, multiple = TRUE)
names(y) <- sprintf("U+%4x", x)
## values in the surrogate range: not handled in R < 3.4.3
sr <- 0xD800:0xDFFF
stopifnot(is.na(y[sr]))
stopifnot(!is.na(y[-sr]))
## too large values: originally handled by UTF-8, but not in RFC3629
## R >= 3.4.3 conforms to RFC3629
stopifnot(is.na(intToUtf8(c(0x200000, 0x10FFFF + 1:10))))

## next command is quite slow.
xx <- sapply(y, function(x) tryCatch(utf8ToInt(x),
                                     error = function(e) NA_character_))
invalid <- sr # previously included 0xFFFE and 0xFFFF
              # but not other 'noncharacters'.
stopifnot(is.na(xx[invalid]), !is.na(xx[!invalid]))
stopifnot(xx[!invalid] == x[!invalid])

## The pre-2003 UTF-8 standard converted larger code-points to 4-6 bytes,
## and was followed by intToUtf8 in earlier versions of R.
## Earlier conversion of 0x111111, 0x200001, 0x10000001)
x <- c("\xf4\x91\x84\x91", "\xf8\x80\x80\x80\x81", "\xfc\x90\x80\x80\x80\x81")
xx <- sapply(x, function(x) tryCatch(utf8ToInt(x),
                                     error = function(e) NA_character_))
stopifnot(is.na(xx)) # first was not in R < 3.4.3

### test surrogate pairs
surrogate_pair <- function(x)
{
    if(any(x < 0x10000 | x > 0x10FFFF))
        stop("Surrogate pairs apply only to supplementary planes")
    x <- x - 0x10000
    as.vector(rbind(0xD800 + x %/% 1024, 0xDC00 + x %% 1024))
}
## check the example:
xx <- surrogate_pair(0x10437)
sprintf("%X", xx)
stopifnot(xx == c(0xD801, 0xDC37))

## there are 2^20 surrogate pairs, but fast enough to check them all
x <- 0x10000:0x10FFFF
x1 <- intToUtf8(x)
x2 <- utf8ToInt(x1)
stopifnot(x2 == x)

z <- surrogate_pair(x)
x1 <- intToUtf8(z, allow_surrogate_pairs = TRUE)
x2 <- utf8ToInt(x1)
stopifnot(x2 == x)

proc.time() # ~ 9 sec