1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
|
# https://unicode.org/reports/tr29/
import strutils
import derived_data
import two_stage_table
import utils
const
  # Numeric ids for the word-break classes handled by this generator.
  # The trailing comment on each id is the property name that `wordMap`
  # (or `emojiMap` for the last one) translates into it. The main section
  # writes these numbers into the generated module, so keep them stable.
  sgwOther = 0                  # "Other"
  sgwDoubleQuote = 1            # "Double_Quote"
  sgwSingleQuote = 2            # "Single_Quote"
  sgwHebrewLetter = 3           # "Hebrew_Letter"
  sgwCr = 4                     # "CR"
  sgwLf = 5                     # "LF"
  sgwNewline = 6                # "Newline"
  sgwExtend = 7                 # "Extend"
  sgwRegionalIndicator = 8      # "Regional_Indicator"
  sgwFormat = 9                 # "Format"
  sgwKatakana = 10              # "Katakana"
  sgwAletter = 11               # "ALetter"
  sgwMidLetter = 12             # "MidLetter"
  sgwMidNum = 13                # "MidNum"
  sgwMidNumLet = 14             # "MidNumLet"
  sgwNumeric = 15               # "Numeric"
  sgwExtendNumLet = 16          # "ExtendNumLet"
  sgwZwj = 17                   # "ZWJ"
  sgwWsegSpace = 18             # "WSegSpace"
  sgwExtendedPictographic = 19  # "Extended_Pictographic"
func wordMap(s: string): int =
  ## Translates a word-break property name (e.g. `"ALetter"`) into its
  ## numeric `sgw*` id.
  ##
  ## An unrecognised name fails with an `AssertionDefect` whose message
  ## includes the offending name. `doAssert` is used instead of `assert`
  ## so the check still runs when assertions are compiled out (for example
  ## with `-d:danger`); otherwise the `-99` placeholder below would be
  ## returned silently and end up in the generated table.
  case s
  of "Other": sgwOther
  of "Double_Quote": sgwDoubleQuote
  of "Single_Quote": sgwSingleQuote
  of "Hebrew_Letter": sgwHebrewLetter
  of "CR": sgwCr
  of "LF": sgwLf
  of "Newline": sgwNewline
  of "Extend": sgwExtend
  of "Regional_Indicator": sgwRegionalIndicator
  of "Format": sgwFormat
  of "Katakana": sgwKatakana
  of "ALetter": sgwAletter
  of "MidLetter": sgwMidLetter
  of "MidNum": sgwMidNum
  of "MidNumLet": sgwMidNumLet
  of "Numeric": sgwNumeric
  of "ExtendNumLet": sgwExtendNumLet
  of "ZWJ": sgwZwj
  of "WSegSpace": sgwWsegSpace
  else:
    doAssert false, "unknown word break property: " & s
    # Never reached; only gives the `else` branch an `int` value.
    -99
func emojiMap(s: string): int =
  ## Translates an emoji property name into its `sgw*` id.
  ##
  ## Only `"Extended_Pictographic"` is recognised; any other name yields
  ## `-1`, which `parseWordBreak` treats as "ignore this entry".
  if s == "Extended_Pictographic":
    sgwExtendedPictographic
  else:
    -1
type WordProps = seq[int]  ## List of `sgw*` ids; `parseWordBreak` fills it, indexed by `cp`.
proc parseWordBreak(filePath, filePathEmoji: string): WordProps =
  ## Builds the list of `sgw*` ids, indexed by the `cp` key of the parsed
  ## data (presumably the code point -- confirm against `utils`).
  ##
  ## * `filePath` is read with `parseUDDNoDups`; the first value of every
  ##   non-empty entry is translated with `wordMap`. Entries without data
  ##   keep the initial `sgwOther`.
  ## * `filePathEmoji` is read with `parseUDDEmoji`; an entry whose first
  ##   value is recognised by `emojiMap` overwrites the id stored at the
  ##   same index.
  ##
  ## The result has as many elements as the data parsed from `filePath`.
  let wordEntries = parseUDDNoDups(filePath)
  result = newSeq[int](wordEntries.len)
  for slot in result.mitems:
    slot = sgwOther
  for cp, props in wordEntries.pairs:
    if props.len > 0:
      result[cp] = wordMap(props[0])
  let emojiEntries = parseUDDEmoji(filePathEmoji)
  for cp, props in emojiEntries.pairs:
    if props.len == 0:
      continue
    let emojiId = emojiMap(props[0])
    # -1 means the property is not one this table tracks.
    if emojiId != -1:
      result[cp] = emojiId
func buildWordBreak(wordProps: WordProps): Stages[int] =
  ## Turns the flat list of ids into a `Stages[int]` table by delegating
  ## to `buildTwoStageTable`.
  result = wordProps.buildTwoStageTable
# Source text of the generated `segmentation_data.nim` module.
# It holds 23 `$#` placeholders, filled in order by `strutils.%`: the twenty
# `sgw*` ids, the two table bodies, and the block size.
# The bodies of the `type`/`const` sections and of the array literals are
# indented because the output has to be valid Nim; with every line at
# column 0 the generated module would not compile.
const dataTemplate = """## This is auto-generated. Do not modify it
type
  SgWord* = distinct int8
const
  sgwOther* = $#.SgWord
  sgwDoubleQuote* = $#.SgWord
  sgwSingleQuote* = $#.SgWord
  sgwHebrewLetter* = $#.SgWord
  sgwCr* = $#.SgWord
  sgwLf* = $#.SgWord
  sgwNewline* = $#.SgWord
  sgwExtend* = $#.SgWord
  sgwRegionalIndicator* = $#.SgWord
  sgwFormat* = $#.SgWord
  sgwKatakana* = $#.SgWord
  sgwAletter* = $#.SgWord
  sgwMidLetter* = $#.SgWord
  sgwMidNum* = $#.SgWord
  sgwMidNumLet* = $#.SgWord
  sgwNumeric* = $#.SgWord
  sgwExtendNumLet* = $#.SgWord
  sgwZwj* = $#.SgWord
  sgwWsegSpace* = $#.SgWord
  sgwExtendedPictographic* = $#.SgWord
const
  wordBreakIndices* = [
    $#
  ]
  wordBreakData* = [
    $#
  ]
  wordBreakBlockSize* = $#
"""
when isMainModule:
  # Script entry point: parse the two UCD files, build the two-stage
  # table and write the rendered `dataTemplate` to the output module.
  const
    wordBreakPath = "./gen/UCD/auxiliary/WordBreakProperty.txt"
    emojiPath = "./gen/UCD/emoji/emoji-data.txt"
    outPath = "./src/unicodedb/segmentation_data.nim"
  let stages = buildWordBreak(parseWordBreak(wordBreakPath, emojiPath))
  let outFile = open(outPath, fmWrite)
  try:
    # The values must follow the placeholder order of `dataTemplate`:
    # the twenty ids first, then stage 1, stage 2 and the block size.
    var fields = newSeq[string]()
    for id in [
      sgwOther,
      sgwDoubleQuote,
      sgwSingleQuote,
      sgwHebrewLetter,
      sgwCr,
      sgwLf,
      sgwNewline,
      sgwExtend,
      sgwRegionalIndicator,
      sgwFormat,
      sgwKatakana,
      sgwAletter,
      sgwMidLetter,
      sgwMidNum,
      sgwMidNumLet,
      sgwNumeric,
      sgwExtendNumLet,
      sgwZwj,
      sgwWsegSpace,
      sgwExtendedPictographic
    ]:
      fields.add($id)
    fields.add(prettyTable(stages.stage1, 15, "'i16"))
    fields.add(prettyTable(stages.stage2, 15, "'i8"))
    fields.add($stages.blockSize)
    outFile.write(dataTemplate % fields)
  finally:
    # Always release the handle, even if rendering or writing fails.
    outFile.close()
|