1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
|
import strutils
import unicode_data
import two_stage_table
import utils
# One parsed decomposition entry.
#   isCanonical - true unless the raw field carried a "<tag>" prefix.
#   cps         - the hex values listed in the raw field, in order
#                 (presumably code points; empty when the field was empty).
type
  Decomposition = tuple[isCanonical: bool, cps: seq[int]]
proc parseDecomps(decompsRaw: seq[string]): seq[Decomposition] =
  ## Parses raw decomposition fields into `Decomposition` entries.
  ##
  ## `decompsRaw[i]` is the raw field for index `i`; the result has the
  ## same length and indexing. An empty field leaves the entry at its
  ## zero value (`isCanonical == false`, no `cps`).
  ##
  ## A field containing `<` is treated as tagged (`"<tag> XXXX YYYY"`):
  ## the entry is marked non-canonical and only the text after the first
  ## `>` is parsed. Any other non-empty field is canonical.
  ##
  ## Raises `ValueError` when a value is not valid hexadecimal, and
  ## fails a `doAssert` when a non-empty field yields no values.
  result = newSeq[Decomposition](len(decompsRaw))
  for cp, decomp in pairs(decompsRaw):
    if decomp.len == 0:
      continue
    let tagged = "<" in decomp
    # Drop the "<tag>" prefix so both kinds share one parsing path.
    let mapping =
      if tagged: decomp.split(">")[1]
      else: decomp
    result[cp].isCanonical = not tagged
    result[cp].cps = newSeqOfCap[int](18)
    # splitWhitespace tolerates leading, trailing and repeated blanks,
    # which split(" ") would turn into empty (unparsable) fields.
    for field in mapping.splitWhitespace():
      result[cp].cps.add(parseHexInt(field))
    # doAssert so the check also runs in -d:danger builds.
    doAssert result[cp].cps.len > 0,
      "empty decomposition mapping at index " & $cp
# Flattened decomposition data.
#   decomps - for every non-empty entry: one packed header followed by
#             the entry's values.
#   offsets - per input index, the position of that entry's header in
#             `decomps`, or -1 when the entry is empty.
type
  DecompTable = tuple[decomps, offsets: seq[int]]
proc buildDecompTable(decomps: seq[Decomposition]): DecompTable =
  ## Flattens `decomps` into one data array plus a per-index offset table.
  ##
  ## Every non-empty entry is stored in `result.decomps` as a packed
  ## header followed by its values; `result.offsets[i]` is the position
  ## of entry `i`'s header, or -1 when entry `i` is empty.
  ##
  ## Header layout: `header shr 1` is the number of values and
  ## `header and 0x01` is the `isCanonical` flag.

  # One header slot plus one slot per value for each non-empty entry.
  var total = 0
  for entry in decomps:
    if entry.cps.len > 0:
      total += entry.cps.len + 1
  result.decomps = newSeq[int](total)
  result.offsets = newSeq[int](decomps.len)
  # -1 marks "no decomposition" until an entry proves otherwise.
  for slot in mitems(result.offsets):
    slot = -1
  var cursor = 0
  for idx, entry in decomps:
    let count = entry.cps.len
    if count > 0:
      # With count <= 127 the packed header stays within 0..255.
      assert count <= 127
      result.offsets[idx] = cursor
      # Count in the upper bits, canonical flag in bit 0.
      result.decomps[cursor] = (count shl 1) + ord(entry.isCanonical)
      inc cursor
      for value in entry.cps:
        result.decomps[cursor] = value
        inc cursor
# Final tables handed to the code generator.
#   decomps        - flattened decomposition data.
#   stage1, stage2 - the two lookup stages produced by `findBestTable`.
#   blockSize      - block size reported by `findBestTable`.
type
  MultiStageTable = tuple[
    decomps, stage1, stage2: seq[int],
    blockSize: int]
proc build(decomps: seq[Decomposition]): MultiStageTable =
  ## Builds the flattened decomposition data and the staged lookup
  ## tables over its offsets.
  ##
  ## Prints a few diagnostic values to stdout and asserts a sanity
  ## check on index 0xFD0A, so `decomps` must have more than 0xFD0A
  ## entries.
  let flat = buildDecompTable(decomps)

  # Sanity check: index 0xFD0A must have an entry whose header is 4,
  # i.e. two values with the canonical bit clear.
  let probe = flat.offsets[0xFD0A]
  echo probe
  assert probe != -1
  let probeHeader = flat.decomps[probe]
  echo probeHeader
  assert probeHeader == 4

  # NOTE(review): findBestTable is defined outside this file; presumably
  # it picks the block size giving the smallest two-stage table - confirm.
  let staged = findBestTable(flat.offsets)
  assert staged.blockSize > 0
  echo staged.blockSize
  echo staged.stage1.len
  echo staged.stage2.len

  result.decomps = flat.decomps
  result.stage1 = staged.stage1
  result.stage2 = staged.stage2
  result.blockSize = staged.blockSize
# Skeleton of the generated data module. The four `$#` placeholders are
# filled, in order, with the contents of `decompsOffsets`,
# `decompsIndices`, `decompsData` and the value of `blockSize`.
# The members of the emitted `const` section are indented: Nim needs
# that for the generated module to parse.
const decompsTemplate = """## This is auto-generated. Do not modify it
const
  decompsOffsets* = [
    $#
  ]
  decompsIndices* = [
    $#
  ]
  decompsData* = [
    $#
  ]
  blockSize* = $#
"""
when isMainModule:
  # Generator entry point: read the raw decompositions from
  # UnicodeData.txt, build the tables, and write the generated module.
  let
    rawDecomps = parseUDDecomps("./gen/UCD/UnicodeData.txt")
    tables = build(parseDecomps(rawDecomps))
  var outFile = open("./src/unicodedb/decompositions_data.nim", fmWrite)
  try:
    # Argument order must match the placeholders in decompsTemplate.
    let rendered = decompsTemplate % [
      prettyTable(tables.stage1, 15, "'i8"),
      prettyTable(tables.stage2, 10, "'i16"),
      prettyTable(tables.decomps, 10, "'i32"),
      intToStr(tables.blockSize)]
    outFile.write(rendered)
  finally:
    # Always release the handle, even when rendering or writing fails.
    outFile.close()
|