File: unicode_data_to_printability_module.lua

package info (click to toggle)
luacheck 1.2.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,704 kB
  • sloc: sh: 106; makefile: 96; python: 35
file content (86 lines) | stat: -rw-r--r-- 2,125 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
-- Reads Unicode character data in UnicodeData.txt format from stdin.
-- Prints a Lua module retuning an array of first codepoints of
-- each continuous block of codepoints that are all printable or all not printable.
-- See https://unicode.org/reports/tr44/

local category_printabilities = {
   Lu = true,
   Ll = true,
   Lt = true,
   Lm = true,
   Lo = true,
   Mn = true,
   Mc = true,
   Me = true,
   Nd = true,
   Nl = true,
   No = true,
   Pc = true,
   Pd = true,
   Ps = true,
   Pe = true,
   Pi = true,
   Pf = true,
   Po = true,
   Sm = true,
   Sc = true,
   Sk = true,
   So = true,
   Zs = true,
   Zl = false,
   Zp = false,
   Cc = false,
   Cf = false,
   Cs = false,
   Co = false,
   Cn = false
}

local codepoint_printabilities = {}
local max_codepoint = 0

local range_start_codepoint

for line in io.lines() do
   local codepoint_hex, name, category = assert(line:match("^([^;]+);([^;]+);([^;]+)"))
   local codepoint = assert(tonumber("0x" .. codepoint_hex))
   local printability = category_printabilities[category]
   assert(printability ~= nil)

   if name:find(", First>$") then
      assert(not range_start_codepoint)
      range_start_codepoint = codepoint
   elseif name:find(", Last>$") then
      assert(range_start_codepoint and range_start_codepoint >= range_start_codepoint)

      for range_codepoint = range_start_codepoint, codepoint do
         codepoint_printabilities[range_codepoint] = printability
      end

      range_start_codepoint = nil
   else
      codepoint_printabilities[codepoint] = printability
   end

   max_codepoint = math.max(max_codepoint, codepoint)
end

assert(not range_start_codepoint)

local parts = {"return {"}
local prev_printability = true

-- Iterate up to a non-existent codepoint to ensure that the last required codepoint is printed.
for codepoint = 0, max_codepoint + 1 do
   local printability = codepoint_printabilities[codepoint] or false

   if printability ~= prev_printability then
      table.insert(parts, ("%d,"):format(codepoint))
   end

   prev_printability = printability
end

table.insert(parts, "}")
print(table.concat(parts))