1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
#!/usr/bin/env python3
"""
Unicode case folding database conversion utility
Parses the database and generates a C++ function which implements the case
folding algorithm. The database entries are of the form:
<code>; <status>; <mapping>; # <name>
<status> can be one of four characters:
C - Common mappings
S - mappings for Simple case folding
F - mappings for Full case folding
T - special case for Turkish I characters
Right now this generates a function which implements simple case folding (C+S
entries).
"""
from __future__ import print_function
import sys
import re
try:
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
# This variable will body of the mappings function
body = ""
# Reads file line-by-line, extracts Common and Simple case fold mappings and
# returns a (from_char, to_char, from_name) tuple.
def mappings(f):
previous_from = -1
expr = re.compile(r'^(.*); [CS]; (.*); # (.*)')
for line in f:
m = expr.match(line)
if not m: continue
from_char = int(m.group(1), 16)
to_char = int(m.group(2), 16)
from_name = m.group(3)
if from_char <= previous_from:
raise Exception("Duplicate or unsorted characters in input")
yield from_char, to_char, from_name
previous_from = from_char
# Computes the shift (to_char - from_char) in a mapping.
def shift(mapping):
return mapping[1] - mapping[0]
# Computes the stride (from_char2 - from_char1) of two mappings.
def stride2(mapping1, mapping2):
return mapping2[0] - mapping1[0]
# Computes the stride of a list of mappings. The list should have at least two
# mappings. All mappings in the list are assumed to have the same stride.
def stride(block):
return stride2(block[0], block[1])
# b is a list of mappings. All the mappings are assumed to have the same
# shift and the stride between adjecant mappings (if any) is constant.
def dump_block(b):
global body
if len(b) == 1:
# Special case for handling blocks of length 1. We don't even need to
# emit the "if (C < X) return C" check below as all characters in this
# range will be caught by the "C < X" check emitted by the first
# non-trivial block.
body += " // {2}\n if (C == {0:#06x})\n return {1:#06x};\n".format(*b[0])
return
first = b[0][0]
last = first + stride(b) * (len(b)-1)
modulo = first % stride(b)
# All characters before this block map to themselves.
body += " if (C < {0:#06x})\n return C;\n".format(first)
body += " // {0} characters\n".format(len(b))
# Generic pattern: check upper bound (lower bound is checked by the "if"
# above) and modulo of C, return C+shift.
pattern = " if (C <= {0:#06x} && C % {1} == {2})\n return C + {3};\n"
if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0:
# Special case:
# We can elide the modulo-check because the expression "C|1" will map
# the intervening characters to themselves.
pattern = " if (C <= {0:#06x})\n return C | 1;\n"
elif stride(b) == 1:
# Another special case: X % 1 is always zero, so don't emit the
# modulo-check.
pattern = " if (C <= {0:#06x})\n return C + {3};\n"
body += pattern.format(last, stride(b), modulo, shift(b[0]))
current_block = []
f = urlopen(sys.argv[1])
for m in mappings(f):
if len(current_block) == 0:
current_block.append(m)
continue
if shift(current_block[0]) != shift(m):
# Incompatible shift, start a new block.
dump_block(current_block)
current_block = [m]
continue
if len(current_block) == 1 or stride(current_block) == stride2(current_block[-1], m):
current_block.append(m)
continue
# Incompatible stride, start a new block.
dump_block(current_block)
current_block = [m]
f.close()
dump_block(current_block)
print('//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//')
print('//')
print('// This file was generated by utils/unicode-case-fold.py from the Unicode')
print('// case folding database at')
print('// ', sys.argv[1])
print('//')
print('// To regenerate this file, run:')
print('// utils/unicode-case-fold.py \\')
print('// "{}" \\'.format(sys.argv[1]))
print('// > lib/Support/UnicodeCaseFold.cpp')
print('//')
print('//===----------------------------------------------------------------------===//')
print('')
print('#include "llvm/Support/Unicode.h"')
print('')
print("int llvm::sys::unicode::foldCharSimple(int C) {")
print(body)
print(" return C;")
print("}")
|