1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
|
# unicode_denormalizer.py
#
# Demonstration of pyparsing's transform_string() method, to
# convert identifiers in Python source code to equivalent Unicode
# characters. Python's compiler automatically normalizes Unicode
# characters back to their ASCII equivalents, so that identifiers may
# be rewritten using other Unicode characters, and normalize back to
# the same identifier. For instance, Python treats "print" and "𝕡𝓻ᵢ𝓃𝘁"
# and "𝖕𝒓𝗂𝑛ᵗ" all as the same identifier.
#
# The converter must take care to *only* transform identifiers -
# Python keywords must always be represented in base ASCII form. To
# skip over keywords, they are added to the parser/transformer, but
# contain no transforming parse action.
#
# The converter also detects identifiers in placeholders within f-strings.
#
# Copyright 2022, by Paul McGuire
#
import keyword
import random
import unicodedata
import pyparsing as pp
# short alias for pyparsing's Unicode character-set definitions
# (used below to enumerate the BMP identifier-body characters)
ppu = pp.pyparsing_unicode
# the two non-alphanumeric characters handled by the converter: the ASCII
# underscore and the middle dot (U+00B7)
_· = "_·"

# every base character for which Unicode look-alikes are collected:
# upper-case letters, lower-case letters, decimal digits, then the
# two characters above
ident_chars = "".join(
    (
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
        "abcdefghijklmnopqrstuvwxyz",
        "0123456789",
        _·,
    )
)
# build a map from each base character in ident_chars to the list of
# all identifier-body characters in the Basic Multilingual Plane
# whose NFKC normalization is exactly that base character
ident_char_map = {base_char: [] for base_char in ident_chars}
for candidate in ppu.BMP.identbodychars:
    # characters normalizing to anything else have no bucket and are skipped
    bucket = ident_char_map.get(unicodedata.normalize("NFKC", candidate))
    if bucket is not None:
        bucket.append(candidate)
# ligatures will also normalize back to ASCII
# (doubled elements have higher chance of being chosen by random.choice)
#
# Each key is an ASCII letter sequence; each value lists the spellings that
# may replace it. Every spelling NFKC-normalizes back to its key, and the
# last spelling of each row is the plain ASCII form itself.
#
# The single-code-point forms are written as \u escapes on purpose: typed
# literally, they are silently folded back to plain ASCII by any tool that
# applies compatibility normalization to the source text, which turns the
# affected rows into no-ops.
ligature_map = {
    'IJ': ('\u0132', '\u0132', 'IJ'),    # LATIN CAPITAL LIGATURE IJ
    'LJ': ('\u01c7', '\u01c7', 'LJ'),    # LATIN CAPITAL LETTER LJ
    'NJ': ('\u01ca', '\u01ca', 'NJ'),    # LATIN CAPITAL LETTER NJ
    'DZ': ('\u01f1', '\u01f1', 'DZ'),    # LATIN CAPITAL LETTER DZ
    'II': ('\u2161', '\u2161', 'II'),    # ROMAN NUMERAL TWO
    'IV': ('\u2163', '\u2163', 'IV'),    # ROMAN NUMERAL FOUR
    'VI': ('\u2165', '\u2165', 'VI'),    # ROMAN NUMERAL SIX
    'IX': ('\u2168', '\u2168', 'IX'),    # ROMAN NUMERAL NINE
    'XI': ('\u216a', '\u216a', 'XI'),    # ROMAN NUMERAL ELEVEN
    # the three-letter forms may also be built from a two-letter ligature
    # plus one plain letter
    'ffl': ('\ufb04', '\ufb04', 'f\ufb02', '\ufb00l', 'ffl'),
    'ffi': ('\ufb03', '\ufb03', 'f\ufb01', '\ufb00i', 'ffi'),
    'ff': ('\ufb00', '\ufb00', 'ff'),    # LATIN SMALL LIGATURE FF
    'fi': ('\ufb01', '\ufb01', 'fi'),    # LATIN SMALL LIGATURE FI
    'fl': ('\ufb02', '\ufb02', 'fl'),    # LATIN SMALL LIGATURE FL
    'ij': ('\u0133', '\u0133', 'ij'),    # LATIN SMALL LIGATURE IJ
    'lj': ('\u01c9', '\u01c9', 'lj'),    # LATIN SMALL LETTER LJ
    'nj': ('\u01cc', '\u01cc', 'nj'),    # LATIN SMALL LETTER NJ
    'dz': ('\u01f3', '\u01f3', 'dz'),    # LATIN SMALL LETTER DZ
    'ii': ('\u2171', '\u2171', 'ii'),    # SMALL ROMAN NUMERAL TWO
    'iv': ('\u2173', '\u2173', 'iv'),    # SMALL ROMAN NUMERAL FOUR
    'vi': ('\u2175', '\u2175', 'vi'),    # SMALL ROMAN NUMERAL SIX
    'ix': ('\u2178', '\u2178', 'ix'),    # SMALL ROMAN NUMERAL NINE
    'xi': ('\u217a', '\u217a', 'xi'),    # SMALL ROMAN NUMERAL ELEVEN
}
def _pick_ligature_form(tokens):
    # tokens[0] is the matched ASCII sequence; choose one of its spellings
    return random.choice(ligature_map[tokens[0]])


# expression matching any key of ligature_map, replacing each match with a
# randomly chosen spelling from that key's tuple
ligature_transformer = pp.one_of(ligature_map)
ligature_transformer.add_parse_action(_pick_ligature_form)
def make_mixed_font(t):
    """Parse action: rewrite the identifier in t[0] with look-alike characters.

    The first character is handled on its own: a leading '_' is emitted
    as the ASCII '_', any other leading character is replaced by a random
    entry from its ident_char_map list. The remainder is first run through
    ligature_transformer, and then each resulting character is replaced by
    a random entry from its ident_char_map list. Characters without a map
    entry are kept as they are.

    Returns the rewritten identifier as a single string.
    """
    name = t[0]
    head, tail = name[0], name[1:]

    # a leading '_' must be written using the ASCII character '_'
    if head == '_':
        pieces = ['_']
    else:
        pieces = [random.choice(ident_char_map.get(head, head))]

    # substitute ligatures first, then map the characters one by one;
    # .get(c, c) falls back to the character itself, and random.choice
    # of a one-character string yields that character
    tail = ligature_transformer.transform_string(tail)
    for c in tail:
        pieces.append(random.choice(ident_char_map.get(c, c)))

    return ''.join(pieces)
# define a pyparsing expression to match any identifier; add a parse
# action to convert to mixed Unicode characters
#
# The action is attached to a *copy*: add_parse_action() modifies the
# expression it is called on, and pp.pyparsing_common.identifier is a single
# object shared by every user of pyparsing in the process. Attaching the
# action to the shared object would make any other parser built on
# pyparsing_common.identifier emit denormalized names as well.
identifier = pp.pyparsing_common.identifier.copy()
identifier.add_parse_action(make_mixed_font)
# match quoted strings, together with any string prefix
#
# A string prefix is part of the literal, not an identifier, and must stay
# in ASCII. Every prefix therefore has to be consumed here; otherwise the
# identifier expression would pick up the 'r' of r"..." (or 'b', 'u', 'rb',
# ...) and denormalize it, producing source that no longer compiles.
#
# The (?=['"]) lookahead requires the quote to follow the prefix directly,
# so a one-letter variable at the end of one line is never glued onto a
# string that starts the next line.
#
# Prefixes containing 'f' are tagged "f_string_prefix" so the parse action
# can tell f-strings apart; other prefixes are matched but left unnamed.
_fstring_prefix = pp.Regex(r"(?:[fF][rR]?|[rR][fF])(?=['\"])")
_plain_string_prefix = pp.Regex(r"(?:[bB][rR]?|[rR][bB]?|[uU])(?=['\"])")
python_quoted_string = pp.Opt(
    _fstring_prefix("f_string_prefix") | _plain_string_prefix
) + pp.python_quoted_string("quoted_string_body")
def mix_fstring_expressions(t):
if not t.f_string_prefix:
return
# define an expression and transformer to handle embedded
# f-string field expressions
fstring_arg = pp.QuotedString("{", end_quote_char="}")
fstring_arg.add_parse_action(
lambda tt: "{" + transformer.transform_string(tt[0]) + "}"
)
return (
t.f_string_prefix
+ fstring_arg.transform_string(t.quoted_string_body)
)
# add parse action to transform identifiers in f-strings; the action
# returns nothing for strings without an f-string prefix, so those are
# matched (and thereby skipped over) without being rewritten
python_quoted_string.add_parse_action(mix_fstring_expressions)
# match keywords separately from identifiers - keywords must be kept in their
# original ASCII
# (softkwlist does not exist in older Pythons, hence the getattr default)
any_keyword = pp.one_of(
    list(keyword.kwlist) + getattr(keyword, "softkwlist", []),
    as_keyword=True
)

# match numeric literals as a whole so the letters inside them (the 'x' and
# hex digits of 0xFF, the 'e' of 1e5, the 'j' of 2j, ...) are never reached
# by the scanner and mistaken for the start of an identifier; an
# identifier cannot start with a digit, so this cannot shadow one
numeric_literal = pp.Regex(r"[0-9]\w*")

# quoted strings, keywords and numeric literals will be parsed, but left
# untransformed; only identifiers carry a transforming parse action
transformer = python_quoted_string | any_keyword | numeric_literal | identifier
def demo():
    """Denormalize two pieces of Python source and print the results.

    First a small embedded program is transformed, printed, compiled and
    executed, to show that the rewritten source still runs. Then the source
    of the stdlib module unittest.util is transformed and printed.
    """
    import inspect
    import textwrap
    import unittest.util as lib_module

    # the sample must keep its block indentation to be valid Python;
    # textwrap.dedent only strips the indentation common to all lines
    hello_source = textwrap.dedent("""
    def hello():
        try:
            hello_ = "Hello"
            world_ = "World"
            print(f"{hello_}, {world_}!")
        except TypeError as exc:
            print("failed: {}".format(exc))

    if __name__ == "__main__":
        hello()
    """)

    # use transformer to generate code with denormalized identifiers
    transformed = transformer.transform_string(hello_source)
    print(transformed)

    # does it really work? compile the transformed code and run it!
    # The sample runs in a namespace of its own, with __name__ preset so
    # that its "__main__" guard fires no matter how this module was loaded,
    # and so that its definitions do not end up in demo()'s scope.
    code = compile(transformed, "inline source", mode="exec")
    exec(code, {"__name__": "__main__"})

    # pick some code from the stdlib
    source = inspect.getsource(lib_module)
    transformed = transformer.transform_string(source)
    print()
    print(transformed)
# run the demonstration only when this file is executed as a script
if __name__ == "__main__":
    demo()
|