File: properties.nim

package info (click to toggle)
nim-unicodedb 0.9.0-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 9,616 kB
  • sloc: makefile: 8
file content (327 lines) | stat: -rw-r--r-- 6,463 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
import strutils
import algorithm

import unicode_data
import derived_data
import two_stage_table
import utils

type
  Props* {.pure.} = enum
    ## Positions of the individual values inside one per-code-point
    ## property row (a `seq[int]`).
    CAT,  ## category bit flag, as returned by `categoryMap`
    CCC,  ## integer parsed from the second raw field
          ## (presumably the canonical combining class)
    BI,   ## third row slot; `parse` appends the bidi class index here
    QC    ## fourth row slot; `parse` appends the quick-check mask here

const
  # One distinct bit per two-letter category abbreviation, so several
  # categories can be combined and tested with a single mask.
  ctgLm = 1 shl 0
  ctgLo = 1 shl 1
  ctgLu = 1 shl 2
  ctgLl = 1 shl 3
  ctgLt = 1 shl 4
  ctgMn = 1 shl 5
  ctgMc = 1 shl 6
  ctgMe = 1 shl 7
  ctgNd = 1 shl 8
  ctgNl = 1 shl 9
  ctgNo = 1 shl 10
  ctgZs = 1 shl 11
  ctgZl = 1 shl 12
  ctgZp = 1 shl 13
  ctgCc = 1 shl 14
  ctgCf = 1 shl 15
  ctgCs = 1 shl 16
  ctgCo = 1 shl 17
  ctgCn = 1 shl 18
  ctgPc = 1 shl 19
  ctgPd = 1 shl 20
  ctgPs = 1 shl 21
  ctgPe = 1 shl 22
  ctgPi = 1 shl 23
  ctgPf = 1 shl 24
  ctgPo = 1 shl 25
  ctgSm = 1 shl 26
  ctgSc = 1 shl 27
  ctgSk = 1 shl 28
  ctgSo = 1 shl 29

  # Bidi class abbreviations. The position of a name in this array is the
  # integer stored for it by `parseBi`, so the order must not change.
  bidirectionalNames* = [
    "L", "LRE", "LRO",
    "R", "AL", "RLE", "RLO",
    "PDF",
    "EN", "ES", "ET", "AN", "CS",
    "NSM", "BN",
    "B", "S", "WS", "ON",
    "LRI", "RLI", "FSI", "PDI"
  ]

proc categoryMap(s: string): int =
  ## Translates a two-letter category abbreviation (e.g. `"Lu"`) into the
  ## matching `ctg*` bit flag.
  ##
  ## An abbreviation that is not one of the thirty listed below is a
  ## programming/data error: the proc fails with an assertion error that
  ## names the offending value. `doAssert` is used instead of `assert` so
  ## the check is still performed when assertions are disabled, rather
  ## than silently returning `-1`.
  case s
  of "Lm": ctgLm
  of "Lo": ctgLo
  of "Lu": ctgLu
  of "Ll": ctgLl
  of "Lt": ctgLt
  of "Mn": ctgMn
  of "Mc": ctgMc
  of "Me": ctgMe
  of "Nd": ctgNd
  of "Nl": ctgNl
  of "No": ctgNo
  of "Zs": ctgZs
  of "Zl": ctgZl
  of "Zp": ctgZp
  of "Cc": ctgCc
  of "Cf": ctgCf
  of "Cs": ctgCs
  of "Co": ctgCo
  of "Cn": ctgCn
  of "Pc": ctgPc
  of "Pd": ctgPd
  of "Ps": ctgPs
  of "Pe": ctgPe
  of "Pi": ctgPi
  of "Pf": ctgPf
  of "Po": ctgPo
  of "Sm": ctgSm
  of "Sc": ctgSc
  of "Sk": ctgSk
  of "So": ctgSo
  else:
    doAssert false, "unknown category abbreviation: \"" & s & "\""
    # Never reached; only gives the branch the same type as the others.
    -1

proc parseProps(propsRaw: seq[seq[string]]): seq[seq[int]] =
  ## Builds one `[category flag, second-field integer]` row per entry of
  ## `propsRaw` (indexed by code point).
  ##
  ## Entries with no raw fields keep the default row: the flag for `"Cn"`
  ## and `0`. Otherwise the first field goes through `categoryMap` and the
  ## second through `parseInt`.
  let defaultCategory = categoryMap("Cn")
  result = newSeq[seq[int]](propsRaw.len)
  for cp, fields in propsRaw:
    var row = @[defaultCategory, 0]
    if fields.len > 0:
      row[ord(Props.CAT)] = categoryMap(fields[0])
      row[ord(Props.CCC)] = parseInt(fields[1])
      assert row[ord(Props.CAT)] >= 0
    result[cp] = row

proc parseBi(biRaw: seq[string]): seq[int] =
  ## Replaces every bidi class name in `biRaw` by its position in
  ## `bidirectionalNames`, keeping the input order.
  ##
  ## A name missing from `bidirectionalNames` trips the assertion.
  result = newSeqOfCap[int](biRaw.len)
  for name in biRaw:
    let idx = find(bidirectionalNames, name)
    assert idx >= 0
    result.add(idx)

const
  # Quick-check bit masks, one bit each. A value with neither the NO bit
  # nor the MAYBE bit set for a form means YES for that form.
  NfcQcNoMask = 0b000001
  NfcQcMaybeMask = 0b000010
  NfkcQcNoMask = 0b000100
  NfkcQcMaybeMask = 0b001000
  NfdQcNoMask = 0b010000
  NfkdQcNoMask = 0b100000

proc nfMap(qcTV: string): int =
  ## Translates a quick-check tag such as `"NFC_QC_N"` into its bit mask.
  ##
  ## A tag outside the six listed below is a programming/data error: the
  ## proc fails with an assertion error that names the offending tag.
  ## `doAssert` is used instead of `assert` so the check is still
  ## performed when assertions are disabled, rather than silently
  ## returning `-1` (which would set every bit when OR-ed into a mask).
  case qcTV
  of "NFC_QC_N": NfcQcNoMask
  of "NFC_QC_M": NfcQcMaybeMask
  of "NFKC_QC_N": NfkcQcNoMask
  of "NFKC_QC_M": NfkcQcMaybeMask
  of "NFD_QC_N": NfdQcNoMask
  of "NFKD_QC_N": NfkdQcNoMask
  else:
    doAssert false, "unknown quick-check tag: \"" & qcTV & "\""
    # Never reached; only gives the branch the same type as the others.
    -1

proc parseQC(qcsRaw: seq[seq[string]]): seq[int] =
  ## Folds the quick-check tags of every entry of `qcsRaw` into a single
  ## integer by OR-ing their `nfMap` masks. Entries without tags yield `0`.
  result = newSeq[int](qcsRaw.len)  # elements start out as 0
  for cp, tags in qcsRaw:
    var mask = 0
    for tag in tags:
      mask = mask or nfMap(tag)
    result[cp] = mask

proc parse(
      udPath: string,
      dbcPath: string,
      dnpPath: string
    ): seq[seq[int]] =
  ## Reads the three input files and returns one property row per code
  ## point: the two values produced by `parseProps`, followed by the bidi
  ## class index and then the quick-check mask.
  ##
  ## The file readers (`parseUDProps`, `parseDBC`, `parseDNPQC`) come from
  ## the imported project modules. A progress line is echoed before each
  ## file is processed.
  echo "unicode data"
  result = parseProps(parseUDProps(udPath))

  echo "derived bidi"
  let bidiClasses = parseBi(parseDBC(dbcPath))
  for cp, bidiClass in bidiClasses:
    result[cp].add(bidiClass)

  echo "derived qc"
  let quickChecks = parseQC(parseDNPQC(dnpPath))
  for cp, quickCheck in quickChecks:
    result[cp].add(quickCheck)

proc build(props: seq[seq[int]]): ThreeStageTable[seq[int]] =
  ## Hands the property rows to `buildThreeStageTable` and returns the
  ## table it produces.
  result = props.buildThreeStageTable()

# Source text of the generated data module. Every `$#` is filled
# positionally by the `%` call in the main section below, in this order:
# the 6 `NfMask` values, the 30 `ctg*` category flags, the
# `bidirectionalNames` entries, the three table bodies (`propsOffsets`,
# `propsIndices`, `propsData`) and finally `blockSize` -- 41 values in
# total. Adding or reordering a placeholder here requires the same change
# in the argument list there.
const propsTemplate = """## This is auto-generated. Do not modify it

type
  NfMask* = enum
    ## A type for extracting the QC
    ## (either No or Maybe value)
    ## value out of a raw QC property.
    ## This is used for normalization form algorithms
    nfcQcNo = $#
    nfcQcMaybe = $#
    nfkcQcNo = $#
    nfkcQcMaybe = $#
    nfdQcNo = $#
    nfkdQcNo = $#

type
  UnicodeCategory* = distinct int32
    ## A type for extracting the category
    ## value out of the raw properties.

const
  ctgLm* = $#.UnicodeCategory
  ctgLo* = $#.UnicodeCategory
  ctgLu* = $#.UnicodeCategory
  ctgLl* = $#.UnicodeCategory
  ctgLt* = $#.UnicodeCategory
  ctgMn* = $#.UnicodeCategory
  ctgMc* = $#.UnicodeCategory
  ctgMe* = $#.UnicodeCategory
  ctgNd* = $#.UnicodeCategory
  ctgNl* = $#.UnicodeCategory
  ctgNo* = $#.UnicodeCategory
  ctgZs* = $#.UnicodeCategory
  ctgZl* = $#.UnicodeCategory
  ctgZp* = $#.UnicodeCategory
  ctgCc* = $#.UnicodeCategory
  ctgCf* = $#.UnicodeCategory
  ctgCs* = $#.UnicodeCategory
  ctgCo* = $#.UnicodeCategory
  ctgCn* = $#.UnicodeCategory
  ctgPc* = $#.UnicodeCategory
  ctgPd* = $#.UnicodeCategory
  ctgPs* = $#.UnicodeCategory
  ctgPe* = $#.UnicodeCategory
  ctgPi* = $#.UnicodeCategory
  ctgPf* = $#.UnicodeCategory
  ctgPo* = $#.UnicodeCategory
  ctgSm* = $#.UnicodeCategory
  ctgSc* = $#.UnicodeCategory
  ctgSk* = $#.UnicodeCategory
  ctgSo* = $#.UnicodeCategory

const
  bidirectionalNames* = [
    $#
  ]

  propsOffsets* = [
    $#
  ]
  propsIndices* = [
    $#
  ]
  propsData* = [
    $#
  ]

  blockSize* = $#
"""

when isMainModule:
  # Generator entry point: parses the input files, builds the staged
  # lookup table and writes `./src/unicodedb/properties_data.nim` from
  # `propsTemplate`.
  var stages = build(parse(
    "./gen/UCD/UnicodeData.txt",
    "./gen/UCD/extracted/DerivedBidiClass.txt",
    "./gen/UCD/DerivedNormalizationProps.txt"))

  # Report the block size and the length of each stage.
  echo stages.blockSize
  echo stages.stage1.len
  echo stages.stage2.len
  echo stages.stage3.len

  # Every row must hold exactly one value per `Props` member; deriving the
  # count from the enum keeps the check in step with it.
  const propsLen = ord(high(Props)) + 1

  # Render each stage-3 row as an array literal. The `'i32` suffix comes
  # from the join separator, so the last value of a row carries no suffix.
  var propsGen = newSeq[string](stages.stage3.len)
  for i, p in stages.stage3:
    # `doAssert` so a malformed row is never written out, even when
    # assertions are disabled.
    doAssert len(p) == propsLen,
      "row " & $i & " has " & $len(p) & " values, expected " & $propsLen
    propsGen[i] = "[$#]" % join(p, "'i32, ")

  # Quote every bidi class name for the generated array literal.
  var bidirectionalNamesGen = newSeq[string](len(bidirectionalNames))
  for i, bi in bidirectionalNames:
    bidirectionalNamesGen[i] = "\"$#\"" % bi

  var f = open("./src/unicodedb/properties_data.nim", fmWrite)
  try:
    # The argument order must match the order of the `$#` placeholders in
    # `propsTemplate`.
    f.write(propsTemplate % [
      intToStr(NfcQcNoMask),
      intToStr(NfcQcMaybeMask),
      intToStr(NfkcQcNoMask),
      intToStr(NfkcQcMaybeMask),
      intToStr(NfdQcNoMask),
      intToStr(NfkdQcNoMask),
      $ctgLm,
      $ctgLo,
      $ctgLu,
      $ctgLl,
      $ctgLt,
      $ctgMn,
      $ctgMc,
      $ctgMe,
      $ctgNd,
      $ctgNl,
      $ctgNo,
      $ctgZs,
      $ctgZl,
      $ctgZp,
      $ctgCc,
      $ctgCf,
      $ctgCs,
      $ctgCo,
      $ctgCn,
      $ctgPc,
      $ctgPd,
      $ctgPs,
      $ctgPe,
      $ctgPi,
      $ctgPf,
      $ctgPo,
      $ctgSm,
      $ctgSc,
      $ctgSk,
      $ctgSo,
      join(bidirectionalNamesGen, ",\n    "),
      prettyTable(stages.stage1, 15, "'i16"),
      prettyTable(stages.stage2, 15, "'u8"),
      join(propsGen, ",\n    "),
      intToStr(stages.blockSize)])
  finally:
    close(f)