#!/usr/bin/env python3
#
# Updates Cython's Lexicon.py with the unicode characters that are accepted as
# identifiers. Should be run with the most recent version of Python possible
# to ensure that Lexicon is as complete as possible.
#
# Python3 only (it relies on str.isidentifier which is a Python 3 addition)
#
# Run with either
# --overwrite to update the existing Lexicon.py file
# --here to create a copy of Lexicon.py in the current directory
import functools
import re
import os
import sys
# Make sure we import the right Cython.
# The first split drops this script's file name (leaving the directory that
# contains it), the second split moves up to that directory's parent. If the
# parent contains a "Cython" entry, it is put at the front of sys.path so the
# import below picks it up in preference to any other Cython on the path.
# NOTE: this must run before the Cython import, so the statement order matters.
cythonpath, _ = os.path.split(os.path.realpath(__file__))  # bin directory
cythonpath, _ = os.path.split(cythonpath)
if os.path.exists(os.path.join(cythonpath, "Cython")):
    sys.path.insert(0, cythonpath)
    print("Found (and using) local cython directory")
# else we aren't in a development directory
from Cython.Compiler import Lexicon
def main():
    """Regenerate the generated section of Lexicon.py.

    Accepts at most one command line argument:

    * ``--overwrite`` (the default) rewrites the imported ``Lexicon`` module's
      own source file.
    * ``--here`` writes the result to ``Lexicon.py`` in the current directory.

    The text between the ``# BEGIN GENERATED CODE`` and
    ``# END GENERATED CODE`` marker lines of the existing file is replaced by
    freshly generated character set declarations. If the arguments are invalid
    or the markers are not found (exactly one BEGIN followed by one END), a
    message is printed and nothing is written.
    """
    arg = '--overwrite'
    if len(sys.argv) == 2:
        arg = sys.argv[1]
    if len(sys.argv) > 2 or arg not in ['--overwrite', '--here']:
        print(
            "Call the script with either:\n"
            "--overwrite to update the existing Lexicon.py file (default)\n"
            "--here to create an version of Lexicon.py in the current directory\n"
        )
        return

    generated_code = (
        f"# Generated with 'cython-generate-lexicon.py' based on:\n"
        f"# {sys.implementation.name} {sys.version.splitlines()[0].strip()}\n"
        "\n"
        f"{generate_character_sets()}\n"
    )

    print("Reading file", Lexicon.__file__)
    # Python source files are UTF-8 by default, so do not rely on the
    # platform's locale encoding when reading or writing them.
    with open(Lexicon.__file__, 'r', encoding="utf-8") as f:
        # The capture group keeps the marker lines in the result, giving
        # [before, BEGIN marker, old generated code, END marker, after].
        parts = re.split(r"(# (?:BEGIN|END) GENERATED CODE\n?)", f.read())

    # Require exactly one BEGIN marker followed by one END marker; anything
    # else (missing, repeated or swapped markers) would corrupt the file.
    markers_ok = (
        len(parts) == 5
        and parts[1].startswith("# BEGIN")
        and parts[3].startswith("# END")
    )
    if not markers_ok:
        print("Warning: generated code section not found - code not inserted")
        return

    parts[2] = generated_code
    output = "".join(parts)

    if arg == "--here":
        outfile = "Lexicon.py"
    else:
        assert arg == "--overwrite"
        outfile = Lexicon.__file__

    print("Writing to file", outfile)
    with open(outfile, 'w', encoding="utf-8") as f:
        f.write(output)
# The easiest way to generate an appropriate character set is just to use the str.isidentifier method
# An alternative approach for getting character sets is at https://stackoverflow.com/a/49332214/4657412
@functools.lru_cache()
def get_start_characters_as_number():
    """Return the code points that are valid as the first identifier character.

    Every code point from 0 up to and including ``sys.maxunicode`` is tested
    with ``str.isidentifier``; the matching ones are returned as a list of
    ints in ascending order.

    The result is cached, so every call returns the same list object and
    callers must not modify it in place.
    """
    # range() excludes its end point, hence the +1 to cover the last code point.
    return [i for i in range(sys.maxunicode + 1) if str.isidentifier(chr(i))]
def get_continue_characters_as_number():
    """Return the code points that are valid after the first identifier character.

    Each code point from 0 up to and including ``sys.maxunicode`` is appended
    to the known-good start character ``'a'`` and the two-character string is
    tested with ``str.isidentifier``. The matching code points are returned as
    a list of ints in ascending order.
    """
    # range() excludes its end point, hence the +1 to cover the last code point.
    return [i for i in range(sys.maxunicode + 1) if str.isidentifier('a' + chr(i))]
def get_continue_not_start_as_number():
    """Return the code points usable inside an identifier but not at its start.

    The result is the sorted difference between the continuation characters
    and the start characters. It is asserted that every start character is
    also a continuation character.
    """
    start_set = set(get_start_characters_as_number())
    continue_set = set(get_continue_characters_as_number())
    assert start_set <= continue_set, \
        "We assume that all identifier start characters are also continuation characters."
    return sorted(continue_set - start_set)
def to_ranges(char_num_list):
    """Compress a collection of code points into single characters and ranges.

    The code points are de-duplicated, sorted and grouped into runs of
    consecutive values.

    Returns a tuple ``(single_chars, ranges)`` of two strings:

    * ``single_chars`` holds one character for every run of length one.
    * ``ranges`` holds two characters (first and last, both inclusive) for
      every longer run, concatenated in ascending order.

    Empty input yields ``('', '')``.
    """
    code_points = sorted(set(char_num_list))
    if not code_points:
        return '', ''

    single_chars = []
    ranges = []

    def close_run(first, last):
        # Record a finished run either as a lone character or a closed range.
        if first == last:
            single_chars.append(chr(first))
        else:
            ranges.append(chr(first) + chr(last))

    run_start = previous = code_points[0]
    for current in code_points[1:]:
        if current - 1 != previous:
            # discontinuous: the run ending at `previous` is complete
            close_run(run_start, previous)
            run_start = current
        previous = current
    # The last run has no following gap to trigger it, so emit it explicitly.
    close_run(run_start, previous)

    return ''.join(single_chars), ''.join(ranges)
def escape_chars(chars):
    """Return *chars* with every character written as a unicode escape.

    Code points up to 0xFFFF become ``\\uXXXX`` (4 hex digits), larger ones
    become ``\\UXXXXXXXX`` (8 hex digits). Hex digits are lower case.
    """
    return ''.join(
        f'\\u{ord(ch):04x}' if ord(ch) <= 0xFFFF else f'\\U{ord(ch):08x}'
        for ch in chars
    )
def make_split_strings(chars, splitby=113, indent=" "):
    """Format *chars* as a sequence of escaped ``u"..."`` literals, one per line.

    *splitby* is divided by 10 (the length of the longest ``\\U...`` escape)
    to get the number of characters placed in each literal. Every line,
    including the first, is prefixed with *indent*.
    """
    chars_per_line = splitby // 10  # max length of "\U..." unicode escapes
    literals = []
    for offset in range(0, len(chars), chars_per_line):
        chunk = chars[offset:offset + chars_per_line]
        literals.append(f'u"{escape_chars(chunk)}"')
    line_break = f"\n{indent}"
    return indent + line_break.join(literals)
def generate_character_sets():
    """Return the source text of the four character set declarations.

    For both the start characters and the continuation-only characters, an
    ``<name>_any`` declaration (single characters) and a ``<name>_range``
    declaration (range pairs) are produced, each as a parenthesised group of
    string literals.
    """
    generators = (
        ("unicode_start_ch", get_start_characters_as_number),
        ("unicode_continuation_ch", get_continue_not_start_as_number),
    )
    pieces = []
    for prefix, produce_numbers in generators:
        singles, range_pairs = to_ranges(produce_numbers())
        for suffix, chars in (("any", singles), ("range", range_pairs)):
            pieces.append(
                f"{prefix}_{suffix} = (\n"
                f"{make_split_strings(chars)}\n"
                f")\n"
            )
    return "".join(pieces)
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|