File: makeencoding.py

package info (click to toggle)
mupdf 1.21.1%2Bds2-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 25,256 kB
  • sloc: ansic: 401,295; python: 18,025; java: 5,935; javascript: 4,173; makefile: 1,056; xml: 498; sh: 398; cpp: 352; cs: 216; sed: 7; awk: 6; lisp: 3
file content (85 lines) | stat: -rw-r--r-- 2,813 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python3

# Convert unicode mapping table to C arrays mapping glyph names and unicode values.
#
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
#

# Glyph names that are never chosen as the name for a code point when the
# glyph list is read: the "control*" names and the "SF*0000" names.
# Written as one whitespace-separated block; split() turns it into the list.
BANNED = """
	controlSTX controlSOT controlETX controlEOT controlENQ controlACK
	controlBEL controlBS controlHT controlLF controlVT controlFF
	controlCR controlSO controlSI controlDLE controlDC1 controlDC2
	controlDC3 controlDC4 controlNAK controlSYN controlETB controlCAN
	controlEM controlSUB controlESC controlFS controlGS controlRS
	controlUS
	SF100000 SF110000 SF010000 SF030000 SF020000 SF040000 SF080000 SF090000
	SF060000 SF070000 SF050000 SF430000 SF240000 SF510000 SF390000 SF250000
	SF500000 SF490000 SF380000 SF280000 SF260000 SF360000 SF370000 SF420000
	SF190000 SF230000 SF410000 SF450000 SF460000 SF400000 SF540000 SF440000
""".split()

# Map code point -> glyph name, built from the "name;hex" lines of the glyph
# list. Only entries whose hex field is exactly four characters long are used,
# the first acceptable name listed for a code point wins, and names in BANNED
# are never chosen.
glyphs = {}
with open("scripts/glyphlist.txt") as glyphlist:
	for line in glyphlist:
		line = line.rstrip()
		# Blank lines carry no entry; a leading '#' marks a comment line.
		if not line or line[0] == '#':
			continue
		n, u = line.split(';')
		if len(u) == 4:
			u = int(u, base=16)
			if u not in glyphs and n not in BANNED:
				glyphs[u] = n

def load_table(fn):
	"""Read a character mapping file and return a 256-entry code point list.

	Each data line is split on whitespace; the first field is the byte value
	and the second the Unicode code point, both written with a two-character
	prefix (e.g. "0x") followed by hex digits. Blank lines, lines starting
	with '#', and lines ending in "#UNDEFINED" are skipped.

	fn: path of the mapping file to read.

	Returns a list of 256 ints indexed by byte value; bytes that the file
	does not map are left as 0.

	Raises ValueError if a field is not valid hex, and IndexError if a data
	line has fewer than two fields or a byte value is 256 or more.
	"""
	table = [0] * 256
	# The context manager closes the file even if a line fails to parse.
	with open(fn) as mapping:
		for line in mapping:
			line = line.strip()
			# An empty line has no line[0]; skip it before testing for '#'.
			if not line or line[0] == '#' or line.endswith("#UNDEFINED"):
				continue
			fields = line.split()
			c = int(fields[0][2:], base=16)
			u = int(fields[1][2:], base=16)
			table[c] = u
	return table

def dump_table(name, table):
	"""Print three C array definitions for one encoding to standard output.

	name: identifier fragment inserted into the C array names.
	table: sequence of Unicode code points indexed by byte value.

	The arrays are, in order:
	  * fz_unicode_from_<name>: every code point of `table` in decimal.
	  * fz_glyph_name_from_<name>: the quoted glyph name for each code point
	    found in the module-level `glyphs` map, or the bare token _notdef
	    (presumably a C identifier supplied by the consuming code) otherwise.
	  * <name>_from_unicode: {code point, byte} pairs for code points that
	    have a glyph name and are >= 128, sorted by their formatted text.
	"""
	def emit_array(declaration, rows):
		# Opening line, one initializer per row, closing brace, blank line.
		print(declaration)
		for row in rows:
			print(row)
		print("};")
		print()

	emit_array(
		"unsigned short fz_unicode_from_%s[256] = {" % name,
		('\t%d,' % code_point for code_point in table),
	)

	emit_array(
		"const char *fz_glyph_name_from_%s[%d] = {" % (name, len(table)),
		(
			('\t"%s",' % glyphs[code_point]) if code_point in glyphs else '\t_notdef,'
			for code_point in table
		),
	)

	# The entries are sorted as strings; the zero-padded hex field comes first.
	reverse_entries = sorted(
		'{0x%04x,%d},' % (code_point, byte)
		for byte, code_point in enumerate(table)
		if code_point in glyphs and code_point >= 128
	)
	emit_array(
		"static const struct { unsigned short u, c; } %s_from_unicode[] = {" % name,
		("\t" + entry for entry in reverse_entries),
	)

# Emit the C tables for every supported encoding, in this fixed order.
# Each pair is (C identifier fragment, mapping file path).
for encoding_name, mapping_path in (
	("iso8859_1", "scripts/8859-1.TXT"),
	("iso8859_7", "scripts/8859-7.TXT"),
	("koi8u", "scripts/KOI8-U.TXT"),
	("windows_1250", "scripts/CP1250.TXT"),
	("windows_1251", "scripts/CP1251.TXT"),
	("windows_1252", "scripts/CP1252.TXT"),
):
	dump_table(encoding_name, load_table(mapping_path))