File: generate_unicode_confusables_data.py

package info (click to toggle)
ddnet 19.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 68,960 kB
  • sloc: cpp: 195,050; ansic: 58,572; python: 5,568; asm: 946; sh: 941; java: 366; xml: 206; makefile: 31
file content (135 lines) | stat: -rw-r--r-- 3,874 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Needs UnicodeData.txt and confusables.txt in the current directory.
#
# Those can be obtained from unicode.org:
# - http://www.unicode.org/Public/security/<VERSION>/confusables.txt
# - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
#
# If executed as a script, it generates the contents of the output files:
# `python3 scripts/generate_unicode_confusables_data.py header` > src/base/unicode/confusables.h
# `python3 scripts/generate_unicode_confusables_data.py data` > src/base/unicode/confusables_data.h

import sys
import unicode

def generate_decompositions():
	"""Map each interesting codepoint to its fully folded decomposition.

	Combines the Unicode NFD decomposition table with the confusables
	table, applying both repeatedly until a fixed point is reached, then
	strips ignorable codepoints (controls, combining marks, spaces).
	"""
	unicode_data = unicode.data()
	confusable_entries = unicode.confusables()

	def in_category(prefix):
		# All codepoints whose General_Category starts with `prefix`.
		return {unicode.unhex(entry["Value"]) for entry in unicode_data if entry["General_Category"].startswith(prefix)}

	# TODO: Is this correct? They changed the decompositioning format
	decomposition_map = {}
	for entry in unicode_data:
		sequence = unicode.unhex_sequence(entry["Decomposition_Type"])
		if sequence:
			decomposition_map[unicode.unhex(entry["Value"])] = sequence
	confusable_map = {unicode.unhex(entry["Value"]): unicode.unhex_sequence(entry["Target"]) for entry in confusable_entries}

	# C: Control
	# M: Combining
	# Z: Space
	ignore = in_category("C") | in_category("M") | in_category("Z")

	confusable_map[0x006C] = [0x0069] # LATIN SMALL LETTER L -> LATIN SMALL LETTER I
	confusable_map[0x00A1] = [0x0069] # INVERTED EXCLAMATION MARK -> LATIN SMALL LETTER I
	confusable_map[0x2800] = [] # BRAILLE PATTERN BLANK
	confusable_map[0xFFFC] = [] # OBJECT REPLACEMENT CHARACTER

	interesting = ignore | set(decomposition_map) | set(confusable_map)

	def substitute(codepoints, table):
		# Replace every codepoint by its mapped sequence (identity if unmapped).
		expanded = []
		for cp in codepoints:
			expanded.extend(table.get(cp, [cp]))
		return expanded

	def fold(codepoint):
		current = [codepoint]
		while True:
			after_nfd = substitute(current, decomposition_map)
			after_con = substitute(after_nfd, confusable_map)
			# Apply substitutions until convergence.
			if current == after_nfd and current == after_con:
				break
			current = after_con
		return [cp for cp in current if cp not in ignore]

	return {cp: fold(cp) for cp in interesting}

def gen_header(decompositions, len_set):
	"""Print the C header (`confusables.h`) declaring the lookup tables.

	`decompositions` maps codepoints to decomposition sequences;
	`len_set` is the deduplicated, sorted list of sequence lengths.
	"""
	print("""\
#include <cstdint>

struct DECOMP_SLICE
{
\tuint16_t offset : 13;
\tuint16_t length : 3;
};
""")
	# Table sizes become compile-time constants in the generated header.
	enum_lines = (
		"enum",
		"{",
		f"\tNUM_DECOMP_LENGTHS = {len(len_set)},",
		f"\tNUM_DECOMPS = {len(decompositions)},",
		"};",
		"",
	)
	for line in enum_lines:
		print(line)

	declarations = (
		"extern const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS];",
		"extern const int32_t decomp_chars[NUM_DECOMPS];",
		"extern const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS];",
		"extern const int32_t decomp_data[];",
	)
	for declaration in declarations:
		print(declaration)

def gen_data(decompositions, decomposition_set, decomposition_offsets, len_set):
	"""Print the C table definitions (`confusables_data.h`).

	Each codepoint in `decompositions` gets a slice (offset into
	`decomp_data`, index into `decomp_lengths`) referencing its shared,
	deduplicated decomposition sequence from `decomposition_set`.
	"""
	print("""\
#ifndef CONFUSABLES_DATA
#error "This file should only be included in `confusables.cpp`"
#endif
""")

	print("const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS] = {")
	for length in len_set:
		print(f"\t{length},")
	print("};")
	print()

	# Emit codepoints in ascending order so the C side can binary-search.
	sorted_chars = sorted(decompositions)

	print("const int32_t decomp_chars[NUM_DECOMPS] = {")
	for char in sorted_chars:
		print(f"\t0x{char:x},")
	print("};")
	print()

	print("const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS] = {")
	for char in sorted_chars:
		decomposition = tuple(decompositions[char])
		set_index = decomposition_set.index(decomposition)
		length_index = len_set.index(len(decomposition))
		print(f"\t{{{decomposition_offsets[set_index]}, {length_index}}},")
	print("};")
	print()

	print("const int32_t decomp_data[] = {")
	for decomposition in decomposition_set:
		for codepoint in decomposition:
			print(f"\t0x{codepoint:x},")
	print("};")

def main():
	"""Entry point: generate either the header or the data file.

	Pass "header" on the command line to emit the declarations for
	`confusables.h`, or "data" to emit the table definitions for
	`confusables_data.h` (header wins if both are given). Output goes
	to stdout; redirect it into the target file.

	Raises:
		ValueError: if the generated tables cannot be represented in the
			packed DECOMP_SLICE bitfields (13-bit offset, 3-bit length
			index).
	"""
	decompositions = generate_decompositions()

	# Deduplicate: identical decomposition sequences share one slice of
	# decomp_data; identical lengths share one decomp_lengths entry.
	decomposition_set = sorted(set(tuple(x) for x in decompositions.values()))
	len_set = sorted(set(len(x) for x in decomposition_set))

	# DECOMP_SLICE.length is a 3-bit index into decomp_lengths, so at
	# most 8 distinct lengths are representable.
	if len(len_set) > 8:
		raise ValueError("Can't pack offset (13 bit) together with len (>3bit)")

	cur_offset = 0
	decomposition_offsets = []
	for d in decomposition_set:
		decomposition_offsets.append(cur_offset)
		cur_offset += len(d)

	# DECOMP_SLICE.offset is a 13-bit field; without this check an
	# oversized decomp_data would silently emit truncated offsets.
	if decomposition_offsets and decomposition_offsets[-1] >= (1 << 13):
		raise ValueError("Can't pack offset (13 bit): decomp_data too large")

	header = "header" in sys.argv
	data = "data" in sys.argv

	if header:
		gen_header(decompositions, len_set)
	elif data:
		gen_data(decompositions, decomposition_set, decomposition_offsets, len_set)

if __name__ == '__main__':
	main()