File: compositions.nim

package info (click to toggle)
nim-unicodedb 0.9.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, sid
  • size: 9,616 kB
  • sloc: makefile: 8
file content (63 lines) | stat: -rw-r--r-- 1,520 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import sets
import strutils

import unicode_data
import derived_data
import min_perfect_hash
import utils

proc parseComps(
      decompsRaw: seq[string],
      exclude: seq[int]
    ): seq[Record[seq[int]]] =
  ## Builds composition records from raw decomposition mappings.
  ##
  ## The index of each entry in `decompsRaw` is used as the composed code
  ## point. An entry is skipped when it is empty, when its index is listed
  ## in `exclude`, or when it contains `<` (compatibility decomposition).
  ## Every remaining entry must consist of exactly two hexadecimal code
  ## points separated by a single space.
  ##
  ## Each returned record has the pair `[cpA, cpB]` as its key and
  ## `[cpA, cpB, cp]` as its value, where `cp` is the entry's index.
  ##
  ## Raises `ValueError` when a part is not valid hexadecimal, and an
  ## assertion defect when a kept entry does not have exactly two parts.

  # Nim's built-in `set` cannot hold `int` elements, so a hash set provides
  # the constant-time membership test instead of scanning the seq per entry.
  let excluded = toHashSet(exclude)

  # Upper bound for the result size: every non-empty entry, before any
  # filtering is applied.
  var maxCompSize = 0
  for dcp in decompsRaw:
    if dcp.len > 0:
      inc maxCompSize

  result = newSeqOfCap[Record[seq[int]]](maxCompSize)
  for cp, dcp in pairs(decompsRaw):
    if dcp.len == 0 or cp in excluded:
      continue
    if "<" in dcp:  # Compatibility decomp
      continue
    let dcpParts = dcp.split(" ")
    # `doAssert` stays active even when assertions are disabled; the
    # indexing below relies on this shape.
    doAssert dcpParts.len == 2,
      "expected a decomposition of two code points, got: " & dcp
    let
      # `parseHexInt` accepts hexadecimal text with or without a prefix.
      cpA = parseHexInt(dcpParts[0])
      cpB = parseHexInt(dcpParts[1])
    result.add((
      key: @[cpA, cpB],
      value: @[cpA, cpB, cp]))

# Source text of the generated Nim data module. The two `$#` placeholders
# are substituted in order: the first becomes the body of the exported
# `compsHashes` array literal, the second the body of `compsValues`.
const compsTemplate = """## This is auto-generated. Do not modify it

const
  compsHashes* = [
    $#
  ]
  compsValues* = [
    $#
  ]
"""

when isMainModule:
  # Generator entry point: parse the UCD inputs, build the perfect-hash
  # tables for the composition pairs and write them out as a Nim module.
  let
    rawDecomps = parseUDDecomps("./gen/UCD/UnicodeData.txt")
    exclusions = parseDNPExclusion(
      "./gen/UCD/DerivedNormalizationProps.txt")
  var compRecords = parseComps(rawDecomps, exclusions)
  var hashTables = mph(compRecords)

  # Print the lookup result for the pair (65, 768) as a quick manual check.
  echo mphLookup(hashTables.h, hashTables.v, [65, 768])

  # Render every value triple as one array literal; the `'i32` suffix is
  # the join separator, so it follows all elements except the last one.
  var rows = newSeq[string](hashTables.v.len)
  for idx, triple in hashTables.v:
    assert triple.len == 3
    let inner = triple.join("'i32, ")
    rows[idx] = "[$#]" % inner

  # The output file is opened before the content is formatted and is
  # closed even if formatting or writing raises.
  let outFile = open("./src/unicodedb/compositions_data.nim", fmWrite)
  try:
    let
      hashesText = prettyTable(hashTables.h, 15, "'i16")
      valuesText = rows.join(",\L    ")
    outFile.write(compsTemplate % [hashesText, valuesText])
  finally:
    outFile.close()