File: gen_xml_dic.py

package info (click to toggle)
python-pylatexenc 2.10-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 9,104 kB
  • sloc: xml: 146,133; python: 10,734; makefile: 30; sh: 7
file content (82 lines) | stat: -rw-r--r-- 2,236 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#
# mini-script to generate the pylatexenc.latexencode._uni2latexmap_xml dict mapping
#
import re
import sys

# Python 3 has no ``unichr`` builtin; alias it to ``chr`` so the rest of the
# script can call ``unichr`` under either major version.
if sys.version_info[0] >= 3:
    unichr = chr

from xml.etree import ElementTree as ET

# Parse the character database; 'unicode.xml' is looked up relative to the
# current working directory.
e = ET.parse('unicode.xml')

# d:      code point (int) -> LaTeX replacement string
# dnames: code point (int) -> text of the character's <description> element
d = {}
dnames = {}

for chxml in e.find('charlist').iter('character'):
    Uid = chxml.attrib['id']
    if '-' in Uid:
        # composite/multiple characters not supported
        continue
    # ids look like 'U<hex digits>'; drop the leading 'U' and parse as hex
    charord = int(Uid.lstrip('U'), 16)
    latexxml = chxml.find('latex')
    if latexxml is None:
        continue
    latexval = latexxml.text
    if not latexval:
        # empty <latex/> element: ElementTree reports its text as None, so
        # there is no replacement to record (and the string tests below
        # would raise AttributeError)
        continue
    if latexval == unichr(charord):
        # "latex" representation is the same char directly
        continue
    if charord == 0x20:
        # skip space char
        continue
    if latexval.startswith((r'\El', r'\ensuremath{\El')):
        # skip values built on \El... macros (this includes \ElsevierGlyph);
        # presumably these are not available in standard LaTeX -- TODO confirm
        continue
    if re.search(r'\\[a-zA-Z]+\s+$', latexval):
        # ends with named macro+space, remove space because
        # latexencode.UnicodeToLatexEncoder will handle that with
        # replacement_latex_protection
        latexval = latexval.rstrip()
    d[charord] = latexval
    # Tolerate a missing <description> element instead of crashing; the
    # description is only kept as a human-readable label.
    descxml = chxml.find('description')
    dnames[charord] = descxml.text if descxml is not None else None

# dump dictionary into new module file in current working directory
outputfile = '_uni2latexmap_xml.py'

# Text written verbatim at the top of the generated module.
HEADER = """\
# -*- coding: utf-8 -*-
#
# Automatically generated from unicode.xml by gen_xml_dic.py
#

"""

# NOTE(review): the file is opened with the platform default encoding although
# HEADER declares utf-8; this only matters if a LaTeX value contains non-ASCII
# text -- confirm against unicode.xml.
with open(outputfile, 'w') as f:
    f.write(HEADER)

    f.write("uni2latex = {\n")

    # Write entries in ascending code-point order so the generated module is
    # the same from run to run, independent of the dict's iteration order
    # (which is arbitrary on Python 2 and on Python 3 before 3.7).
    for k, v in sorted(d.items()):
        f.write("0x%04X: %r,\n" % (k, v))

    f.write("}\n")

print("Successfully generated file %s" % (outputfile))


# Now see which characters we don't have in our default set of symbols
from pylatexenc.latexencode._uni2latexmap import uni2latex as uni2latex_defaults

# Code points that unicode.xml maps but the default table does not.
missing_keys = set(d).difference(uni2latex_defaults)
if missing_keys:
    print("#\n# Missing keys added from unicode.xml\n#\n")
    for k in sorted(missing_keys):
        latexval = d[k]
        # Each printed line is a Python dict entry.  A raw-string literal is
        # used when it is valid Python for this value; it is not valid when
        # the value contains the quote character, ends with a backslash
        # (r'...\' does not terminate), or contains a line break.  In those
        # cases fall back to repr(), which always yields a valid literal.
        needs_repr = (
            "'" in latexval
            or latexval.endswith('\\')
            or '\n' in latexval
            or '\r' in latexval
        )
        if needs_repr:
            therepr = repr(latexval)
        else:
            therepr = "r'" + latexval + "'"
        thedef = "0x%04X: %s," % (k, therepr)
        # Trailing comment: the character's description and the character itself.
        print("%-50s# %s [%s]" % (thedef, dnames[k], unichr(k)))