File: test-utf8_normalize.R

package info (click to toggle)
r-cran-utf8 1.2.6-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 2,060 kB
  • sloc: ansic: 25,890; python: 1,616; sh: 13; makefile: 6
file content (76 lines) | stat: -rw-r--r-- 2,414 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Expected values are taken from Fig. 3 of http://unicode.org/reports/tr15/
test_that("'utf8_normalize' can reproduce Fig. 3", {
  # Composed spelling that both inputs below are expected to normalize to.
  composed <- c("\u00c5", "\u03a9")

  # The code points exactly as they appear in the figure's source column.
  original <- c("\u212b", "\u2126")
  expect_equal(utf8_normalize(original), composed)

  # The decomposed spelling of the same text must give the same result.
  decomposed <- c("\u0041\u030a", "\u03a9")
  expect_equal(utf8_normalize(decomposed), composed)
})

# Expected values are taken from Fig. 4 of http://unicode.org/reports/tr15/
test_that("'utf8_normalize' can reproduce Fig. 4", {
  # Composed spelling that both inputs below are expected to normalize to.
  composed <- c("\u00c5", "\u00f4")

  # Here the source column is already identical to the composed spelling,
  # so normalizing it is expected to leave it unchanged.
  original <- c("\u00c5", "\u00f4")
  expect_equal(utf8_normalize(original), composed)

  # Base letter followed by a combining mark; expected to be recombined.
  decomposed <- c("\u0041\u030a", "\u006f\u0302")
  expect_equal(utf8_normalize(decomposed), composed)
})

# Expected values are taken from Fig. 5 of http://unicode.org/reports/tr15/
test_that("'utf8_normalize' can reproduce Fig. 5", {
  # Composed spelling that both inputs below are expected to normalize to.
  # Note that the second and third entries still carry combining marks.
  composed <- c("\u1e69", "\u1e0d\u0307", "\u0071\u0323\u0307")

  # Source column of the figure; the third entry lists its two combining
  # marks in the opposite order from the expected result.
  original <- c("\u1e69", "\u1e0b\u0323", "\u0071\u0307\u0323")
  expect_equal(utf8_normalize(original), composed)

  # Base letter followed by both combining marks, for every entry.
  decomposed <- c(
    "\u0073\u0323\u0307",
    "\u0064\u0323\u0307",
    "\u0071\u0323\u0307"
  )
  expect_equal(utf8_normalize(decomposed), composed)
})

# Expected values are taken from Fig. 6 of http://unicode.org/reports/tr15/
test_that("'utf8_normalize' can reproduce Fig. 6", {
  # The five columns of the figure: the source text, then its canonical
  # (NFD / NFC) and compatibility (NFKD / NFKC) spellings.
  original <- c("\ufb01", "\u0032\u2075", "\u1e9b\u0323")
  canon_decomp <- c("\ufb01", "\u0032\u2075", "\u017f\u0323\u0307")
  canon_comp <- c("\ufb01", "\u0032\u2075", "\u1e9b\u0323")
  compat_decomp <- c("\u0066\u0069", "\u0032\u0035", "\u0073\u0323\u0307")
  compat_comp <- c("\u0066\u0069", "\u0032\u0035", "\u1e69")

  # Default arguments: expected to produce the canonical composed spelling.
  expect_equal(utf8_normalize(original), canon_comp)
  expect_equal(utf8_normalize(canon_decomp), canon_comp)

  # With `map_compat = TRUE` the same inputs are expected to produce the
  # compatibility composed spelling instead.
  expect_equal(
    utf8_normalize(original, map_compat = TRUE),
    compat_comp
  )
  expect_equal(
    utf8_normalize(canon_decomp, map_compat = TRUE),
    compat_comp
  )

  # This call deliberately uses the default arguments: the input is already
  # the compatibility-decomposed spelling, and the expected result is the
  # compatibility composed spelling.
  expect_equal(utf8_normalize(compat_decomp), compat_comp)
})


test_that("'utf8_normalize' can normalize, case fold, and remove ignorables", {
  input <- c("A", "\u00df", "\u1e9e", "\u1fc3", "\u200b")

  # Expected output, element by element. Some entries expand to two
  # characters, and the last one is expected to vanish entirely, leaving
  # an empty string.
  expected <- c("a", "ss", "ss", "\u03b7\u03b9", "")

  # Only case mapping and ignorable removal are switched on here.
  actual <- utf8_normalize(input, map_case = TRUE, remove_ignorable = TRUE)
  expect_equal(actual, expected)
})


test_that("'utf8_normalize' can map quotes", {
  quotes <- c("\"", "'", "\u2018", "\u2019", "\u201c", "\u201d")

  # Expected: the \u2018 and \u2019 entries become "'", while every other
  # entry (including \u201c and \u201d) is returned unchanged.
  expected <- c("\"", "'", "'", "'", "\u201c", "\u201d")

  mapped <- utf8_normalize(quotes, map_quote = TRUE)
  expect_equal(mapped, expected)
})


test_that("'utf8_normalize' accepts NULL", {
  # A NULL input is expected to come back as NULL.
  result <- utf8_normalize(NULL)
  expect_equal(result, NULL)
})


test_that("'utf8_normalize' accepts NA", {
  # A missing character value is expected to stay a missing character value.
  missing_string <- NA_character_
  result <- utf8_normalize(missing_string)
  expect_equal(result, missing_string)
})


test_that("'utf8_normalize' can handle backslash", {
  # The literal "\\m" is a two-character string: a backslash followed by "m".
  # It is expected to be returned unchanged.
  with_backslash <- "\\m"
  result <- utf8_normalize(with_backslash)
  expect_equal(result, with_backslash)
})