File: decompositions.nim

package info (click to toggle)
nim-unicodedb 0.9.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, sid
  • size: 9,616 kB
  • sloc: makefile: 8
file content (117 lines) | stat: -rw-r--r-- 3,039 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import strutils

import unicode_data
import two_stage_table
import utils

type
  # Parsed decomposition mapping of one code point.
  Decomposition = tuple
    # True for an untagged mapping, false when the raw field carries a
    # `<tag>` prefix (and for entries without any mapping).
    isCanonical: bool
    # Code points of the mapping, in order; empty when there is no mapping.
    cps: seq[int]

proc parseDecomps(decompsRaw: seq[string]): seq[Decomposition] =
  ## Parses raw decomposition fields into `Decomposition` values.
  ##
  ## Entry `i` of the result describes entry `i` of `decompsRaw`. An empty
  ## field produces a default entry (`isCanonical = false`, no code points).
  ## A field containing `<` is treated as tagged: it is marked non-canonical
  ## and only the text following the tag's `>` is parsed. Every other field
  ## is marked canonical and parsed in full.
  ##
  ## Code points are whitespace-separated hexadecimal numbers; surrounding or
  ## repeated whitespace is tolerated.
  ##
  ## Raises `ValueError` if a token is not valid hexadecimal and
  ## `IndexDefect` if a tagged field has no closing `>`.
  result = newSeq[Decomposition](len(decompsRaw))
  for cp, decomp in pairs(decompsRaw):
    if decomp.len == 0:
      continue
    let tagged = '<' in decomp
    # For a tagged field, drop the leading "<tag>" and keep what follows.
    let mapping =
      if tagged: decomp.split('>')[1]
      else: decomp
    var cps = newSeqOfCap[int](18)
    # splitWhitespace never yields empty tokens, so stray spaces cannot
    # reach parseHexInt.
    for token in mapping.splitWhitespace():
      cps.add(parseHexInt(token))
    # doAssert (unlike assert) is kept in every build mode: a non-empty field
    # that yields no code points means the input is malformed.
    doAssert cps.len > 0, "no code points parsed from: " & decomp
    result[cp] = (isCanonical: not tagged, cps: cps)

type
  # Flattened decomposition storage, as filled in by `buildDecompTable`.
  DecompTable = tuple
    # Concatenated records: each record is one header value
    # (length shl 1, plus 1 when canonical) followed by its code points.
    decomps: seq[int]
    # For every input index, the position of its record header in `decomps`,
    # or -1 when that index has no decomposition.
    offsets: seq[int]

proc buildDecompTable(decomps: seq[Decomposition]): DecompTable =
  ## Flattens `decomps` into a single data sequence plus an offset per entry.
  ##
  ## Every entry that has code points becomes one record in
  ## `result.decomps`: a header value followed by the code points. The
  ## header stores the code point count shifted left by one bit, with the
  ## lowest bit set when the entry is canonical, so readers recover the count
  ## with `header shr 1` and the canonical flag with `header and 0x01`.
  ## `result.offsets[i]` is the index of entry `i`'s header, or -1 when
  ## entry `i` has no code points.

  # One slot for the header plus one per code point of each non-empty entry.
  var capacity = 0
  for entry in decomps:
    if entry.cps.len > 0:
      capacity += entry.cps.len + 1

  result.decomps = newSeqOfCap[int](capacity)
  result.offsets = newSeq[int](decomps.len)
  for slot in result.offsets.mitems:
    slot = -1  # "no decomposition" marker

  for cp, entry in decomps:
    let count = entry.cps.len
    if count == 0:
      continue
    # With at most 127 code points the header never exceeds 255.
    assert count <= 127
    let header = (count shl 1) or ord(entry.isCanonical)
    # The record starts wherever the data sequence currently ends.
    result.offsets[cp] = result.decomps.len
    result.decomps.add(header)
    for c in entry.cps:
      result.decomps.add(c)

type
  # Everything the generator writes out, as assembled by `build`.
  MultiStageTable = tuple
    # Flattened decomposition records (the `decomps` field of `DecompTable`).
    decomps: seq[int]
    # `stage1`, `stage2` and `blockSize` are copied from the result of
    # `findBestTable` applied to the per-code-point offsets.
    # NOTE(review): presumably a two-stage lookup table over those offsets;
    # confirm against the two_stage_table module.
    stage1: seq[int]
    stage2: seq[int]
    blockSize: int

proc build(decomps: seq[Decomposition]): MultiStageTable =
  ## Builds the flattened decomposition data and the staged lookup tables
  ## for `decomps`.
  ##
  ## Along the way it prints a few diagnostics and sanity-checks the record
  ## of code point 0xFD0A, which must exist and carry the header value 4.
  const probeCp = 0xFD0A
  let flat = buildDecompTable(decomps)

  # Sanity check on a known code point: it must have a record ...
  let probeOffset = flat.offsets[probeCp]
  echo probeOffset
  assert probeOffset != -1
  # ... whose header is 4 (two code points, canonical bit clear).
  let probeHeader = flat.decomps[probeOffset]
  echo probeHeader
  assert probeHeader == 4

  let staged = findBestTable(flat.offsets)
  assert staged.blockSize > 0
  echo staged.blockSize
  echo staged.stage1.len
  echo staged.stage2.len

  result.decomps = flat.decomps
  result.stage1 = staged.stage1
  result.stage2 = staged.stage2
  result.blockSize = staged.blockSize

# Source text of the generated data module. The four `$#` placeholders are
# filled, in order, by the `isMainModule` block below: the stage1 table
# (`decompsOffsets`), the stage2 table (`decompsIndices`), the flattened
# decomposition records (`decompsData`) and the block size.
const decompsTemplate = """## This is auto-generated. Do not modify it

const
  decompsOffsets* = [
    $#
  ]
  decompsIndices* = [
    $#
  ]
  decompsData* = [
    $#
  ]
  blockSize* = $#
"""

when isMainModule:
  # Generator entry point: read the Unicode data file, build the tables and
  # write them out as a Nim source module.
  const
    unicodeDataPath = "./gen/UCD/UnicodeData.txt"
    generatedPath = "./src/unicodedb/decompositions_data.nim"

  let staged = build(parseDecomps(parseUDDecomps(unicodeDataPath)))
  let outFile = open(generatedPath, fmWrite)
  try:
    # Placeholder order: stage1, stage2, decomposition data, block size.
    let rendered = decompsTemplate % [
      prettyTable(staged.stage1, 15, "'i8"),
      prettyTable(staged.stage2, 10, "'i16"),
      prettyTable(staged.decomps, 10, "'i32"),
      $staged.blockSize]
    outFile.write(rendered)
  finally:
    # Close the file even when rendering or writing fails.
    outFile.close()