1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
|
#
# mini-script to generate the pylatexenc.latexencode._uni2latexmap_xml dict mapping
#
import re
import sys
# Python 3 has no ``unichr`` builtin; ``chr`` plays that role there.  Alias it
# so the rest of the script can call ``unichr`` on either major version.
# Indexing version_info (rather than ``.major``) also works on interpreters
# where it is a plain tuple.
if sys.version_info[0] > 2:
    # python 3
    unichr = chr
from xml.etree import ElementTree as ET
# Parse unicode.xml (looked up in the current working directory) and collect,
# for every single-code-point <character> that has a usable <latex> child:
#   d[code point]      -> LaTeX replacement text
#   dnames[code point] -> text of the character's <description> ('' if absent)
e = ET.parse('unicode.xml')

d = {}
dnames = {}

for chxml in e.find('charlist').iter('character'):
    Uid = chxml.attrib['id']
    if '-' in Uid:
        # composite/multiple characters not supported
        continue

    # ids look like "U0003B1": drop the leading 'U', parse the rest as hex
    charord = int(Uid.lstrip('U'), 16)

    latexxml = chxml.find('latex')
    if latexxml is None:
        continue

    latexval = latexxml.text
    if not latexval:
        # empty <latex/> element: ElementTree reports its text as None, and
        # there is nothing to map the character to
        continue

    if latexval == unichr(charord):
        # "latex" representation is the same char directly
        continue
    if charord == 0x20:
        # skip space char
        continue

    # Skip Elsevier-specific glyph macros.  The r'\El' prefix also matches
    # r'\ElsevierGlyph', so a separate test for the latter is not needed.
    if latexval.startswith((r'\El', r'\ensuremath{\El')):
        continue

    if re.search(r'\\[a-zA-Z]+\s+$', latexval):
        # ends with named macro+space, remove space because
        # latexencode.UnicodeToLatexEncoder will handle that with
        # replacement_latex_protection
        latexval = latexval.rstrip()

    d[charord] = latexval

    # The description is only used for human-readable output; tolerate a
    # missing or empty <description> instead of failing on ``None.text``.
    descxml = chxml.find('description')
    dnames[charord] = (descxml.text if descxml is not None else None) or ''
# dump dictionary into new module file in current working directory
outputfile = '_uni2latexmap_xml.py'

# Text written verbatim at the top of the generated module.
HEADER = """\
# -*- coding: utf-8 -*-
#
# Automatically generated from unicode.xml by gen_xml_dic.py
#
"""

with open(outputfile, 'w') as f:
    f.write(HEADER)
    f.write("uni2latex = {\n")
    # Write the entries in code-point order so the generated file is the same
    # from run to run (and diffs cleanly) whatever the dict's iteration order.
    # Keys are unique ints, so the values never take part in the comparison.
    for k, v in sorted(d.items()):
        f.write("0x%04X: %r,\n"%(k, v))
    f.write("}\n")

print("Successfully generated file %s"%(outputfile))
# Now see which characters we don't have in our default set of symbols
from pylatexenc.latexencode._uni2latexmap import uni2latex as uni2latex_defaults

# Code points that unicode.xml provided but the default mapping lacks.
missing_keys = set(d).difference(uni2latex_defaults)

if missing_keys:
    print("#\n# Missing keys added from unicode.xml\n#\n")
    # Print one ready-to-paste dict entry per missing code point, followed by
    # a comment giving the character's description and the character itself.
    for k in sorted(missing_keys):
        val = d[k]
        # Prefer a raw-string literal for readability, but only when it is
        # valid Python: a raw string cannot contain its own quote character,
        # cannot end with a backslash, and cannot span lines.
        if ("'" in val or val.endswith('\\')
                or '\n' in val or '\r' in val):
            therepr = repr(val)
        else:
            therepr = "r'"+val+"'"
        thedef = "0x%04X: %s,"%(k, therepr)
        print("%-50s# %s [%s]"%(thedef, dnames[k], unichr(k)))
|