1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
|
#!/usr/bin/env python3
"""
This script produces the file `ppc64_utf16_to_utf8_tables.h`, which
contains lookup tables used in UTF-16 -> UTF-8 translation. The lookup
tables provide shuffle patterns for packing words into 1-, 2-, or 3-byte
UTF-8 characters.
"""
import sys
from os.path import normpath
from pathlib import Path
from contextlib import redirect_stdout
def format_array(array):
    """Render *array* as a comma-separated string of integer literals.

    Negative values and the value 0x80 are both written as the literal
    ``0x80``; every other value is written in plain decimal. An empty
    input yields an empty string.
    """
    return ', '.join(
        '0x80' if (item < 0 or item == 0x80) else str(item)
        for item in array
    )
def assure_array_length(array, size, value=0x80):
    """Pad *array* in place with *value* until it has at least *size* items.

    A list that is already *size* items long (or longer) is left
    untouched. Nothing is returned; the caller's list is mutated.
    """
    shortfall = size - len(array)
    if shortfall > 0:
        array.extend([value] * shortfall)
# Byte-order selectors handed to shuffle_for_conversion_1_2_3_utf8_bytes().
# They are plain booleans: little-endian is True, big-endian is False.
LE, BE = True, False
def shuffle_for_conversion_1_2_3_utf8_bytes(endianess):
    """Yield one ``(shuffle, output_bytes)`` pair for each mask in 0..255.

    ``shuffle`` is a list of 16 byte indices; slots past the used part are
    filled with 0x80. ``output_bytes`` is the number of slots that were
    filled before padding.

    ``endianess`` selects the byte offsets: a value equal to ``BE`` picks
    the big-endian set, anything else picks the little-endian set.
    """
    # Two 8-bit bitmasks tell how many bytes each word produces (1, 2 or 3):
    #   mask1 = ddccbbaa -- output exactly one byte (d - MSB, a - LSB)
    #   mask2 = hhggffee -- output one or two bytes
    # Every bit is duplicated. In the final form the bits are interleaved:
    #   mask = (mask1 & 0x5555) | (mask2 & 0xaaaa)
    #        = hdgcfbea
    # Each two-bit subword decides how many bytes are copied for one word:
    #   | e | a | ea |
    #   +---+---+----+-------
    #   | 0 | 0 |  0 | 3 bytes
    #   | 0 | 1 |  1 | never produced by the C++ code, so nothing is emitted
    #   | 1 | 0 |  2 | 2 bytes
    #   | 1 | 1 |  3 | 1 byte
    #
    # Unlike SSE, the PPC64 shuffle reads from **two** 16-byte vectors; the
    # offsets written as ``16 + k`` below refer to the second one.
    if endianess == BE:
        offsets_by_subword = {
            0: (1, 0, 16 + 0),   # 3 bytes
            2: (0, 16 + 0),      # 2 bytes
            3: (16 + 1,),        # 1 byte
        }
    else:
        offsets_by_subword = {
            0: (0, 1, 16 + 1),   # 3 bytes
            2: (1, 16 + 1),      # 2 bytes
            3: (16 + 0,),        # 1 byte
        }

    padding = 0x80
    for mask in range(256):
        picked = []
        for word in range(4):
            # Two mask bits per word, lowest bits belong to word 0.
            code = (mask >> (2 * word)) & 0b11
            base = 2 * word
            # Subword 1 has no table entry and therefore contributes nothing.
            picked.extend(base + off for off in offsets_by_subword.get(code, ()))

        used = len(picked)
        assure_array_length(picked, 16, padding)
        yield (picked, used)
# Text printed at the top of the generated header: the "do not edit" banner,
# the opening of the include guard, and the opening of the nested namespaces.
CPP_HEADER = """// Code generated automatically; DO NOT EDIT
// file generated by scripts/ppc64_convert_utf16_to_utf8.py
#ifndef PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H
#define PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H
namespace simdutf {
namespace {
namespace tables {
namespace ppc64_utf16_to_utf8 {
"""
# Text printed at the bottom of the generated header: closes the namespaces
# opened by CPP_HEADER (innermost first) and then the include guard.
CPP_FOOTER = """} // ppc64_utf16_to_utf8 namespace
} // tables namespace
} // unnamed namespace
} // namespace simdutf
#endif // PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H"""
def lookup_3_byte_utf8_for_perm():
    """Print the shuffle pattern for eight words that each produce 3 bytes.

    For every word index 0..7 three byte indices are generated using the
    offsets (1, 0, 16), the same offsets the big-endian 3-byte case uses in
    shuffle_for_conversion_1_2_3_utf8_bytes(). The resulting 24 indices are
    printed to stdout as two lines: ``lo`` holds the first 16 and ``hi``
    the remaining 8. Nothing is returned.
    """
    offsets = (1, 0, 16 + 0)
    pattern = [2 * word + off for word in range(8) for off in offsets]

    print("lo =", pattern[:16])
    print("hi =", pattern[16:])
def main():
    """Write ``ppc64_utf16_to_utf8_tables.h`` into ``../src/ppc64``.

    The target directory is resolved relative to this script's location.
    The file receives CPP_HEADER, then the ``pack_1_2_3_utf8_bytes`` table
    twice (big-endian under ``#if SIMDUTF_IS_BIG_ENDIAN``, little-endian
    under ``#else``), then CPP_FOOTER. Each table row is the output length
    followed by the 16 shuffle bytes.
    """
    name = 'ppc64_utf16_to_utf8_tables.h'
    path = normpath(Path(__file__).parent / '..' / 'src' / 'ppc64' / name)
    print(f"creating {path}")

    # (preprocessor line that opens the branch, byte order for that branch)
    variants = (
        ("#if SIMDUTF_IS_BIG_ENDIAN", BE),
        ("#else", LE),
    )

    # Everything printed inside this block lands in the header file.
    with open(path, 'wt') as f, redirect_stdout(f):
        print(CPP_HEADER)
        for directive, endianess in variants:
            print(directive)
            print("// 1 byte for length, 16 bytes for mask")
            print("const uint8_t pack_1_2_3_utf8_bytes[256][17] = {")
            for shuffle, size in shuffle_for_conversion_1_2_3_utf8_bytes(endianess):
                print(" {%s}," % format_array([size] + shuffle))
            print("};")
        print("#endif // SIMDUTF_IS_BIG_ENDIAN")
        print(CPP_FOOTER)


# Generate the header only when run as a script, not when imported.
if __name__ == '__main__':
    main()
|