File: ppc64_convert_utf16_to_utf8.py

package info (click to toggle)
simdutf 7.7.1-3
links: PTS, VCS
area: main
in suites: forky, sid
size: 7,244 kB
sloc: cpp: 60,074; ansic: 14,226; python: 3,364; sh: 321; makefile: 12
file content (155 lines) | stat: -rwxr-xr-x 4,422 bytes
#!/usr/bin/env python3
"""
This script produces the file `ppc64_utf16_to_utf8_tables.h`, which
contains the lookup tables used in UTF-16 -> UTF-8 translation. The lookup
tables provide shuffle patterns for packing words into 1-, 2-, or 3-byte
UTF-8 characters.
"""

import sys
from os.path import normpath
from pathlib import Path
from contextlib import redirect_stdout


def format_array(array):
    """Render *array* as a comma-separated list of C integer literals.

    Negative values and the value 0x80 are both written as the literal
    ``0x80``; every other value is written in decimal.  An empty input
    yields an empty string.
    """
    return ', '.join(
        '0x80' if (item < 0 or item == 0x80) else str(item)
        for item in array
    )


def assure_array_length(array, size, value=0x80):
    """Pad *array* in place with *value* until it holds at least *size* items.

    An array that is already long enough is left untouched.  Nothing is
    returned; the caller's list is modified.
    """
    missing = size - len(array)
    # A non-positive repeat count produces an empty list, i.e. a no-op.
    array.extend([value] * missing)


# Endianness selectors accepted by shuffle_for_conversion_1_2_3_utf8_bytes.
LE = True
BE = False


def shuffle_for_conversion_1_2_3_utf8_bytes(endianess):
    """Yield ``(shuffle, output_bytes)`` for each of the 256 mask values.

    ``shuffle`` is a list of exactly 16 byte indices; the first
    ``output_bytes`` entries are real indices and the rest are 0x80
    padding.  ``endianess`` picks the byte offsets: a value equal to
    ``BE`` selects the big-endian set, anything else the little-endian one.
    """
    # The mask is built from two 8-bit bitmasks that tell how many bytes
    # each word produces (1, 2 or 3):
    #   mask1 = ddccbbaa -- output exactly one byte (d - MSB, a - LSB)
    #   mask2 = hhggffee -- output one or two bytes
    #
    # Every bit is duplicated in those masks.  The final mask interleaves them:
    #   mask = (mask1 & 0x5555) | (mask2 & 0xaaaa)
    #        = hdgcfbea
    #
    # Each two-bit subword then selects how many bytes are emitted for one
    # input word:
    # | e | a | ea |
    # +---+---+----+-------
    # | 0 | 0 |  0 |  3 bytes
    # | 0 | 1 |  1 |  -- never produced by the C++ code, it makes no sense
    # | 1 | 0 |  2 |  2 bytes
    # | 1 | 1 |  3 |  1 byte

    # Unlike SSE, the PPC64 shuffle reads from **two** 16-byte vectors, so
    # offsets of 16 and above address the second vector.
    if endianess == BE:
        picks_3bytes = (1, 0, 16 + 0)
        picks_2bytes = (0, 16 + 0)
        picks_1byte = (16 + 1,)
    else:
        picks_3bytes = (0, 1, 16 + 1)
        picks_2bytes = (1, 16 + 1)
        picks_1byte = (16 + 0,)

    # Subword value -> per-word byte offsets.  Subword 1 is deliberately
    # absent: it contributes no bytes at all.
    picks_by_subword = {
        0: picks_3bytes,
        2: picks_2bytes,
        3: picks_1byte,
    }
    padding = 0x80

    for mask in range(256):
        pattern = []
        for word in range(4):
            # Subwords are consumed starting from the least significant bits.
            subword = (mask >> (2 * word)) & 0b11
            base = word * 2
            for offset in picks_by_subword.get(subword, ()):
                pattern.append(base + offset)

        used = len(pattern)
        assure_array_length(pattern, 16, padding)

        yield (pattern, used)


# Preamble of the generated C++ header: the "do not edit" notice, the
# opening include guard, and the opening of the nested namespaces
# simdutf :: (unnamed) :: tables :: ppc64_utf16_to_utf8.
CPP_HEADER = """// Code generated automatically; DO NOT EDIT
// file generated by scripts/ppc64_convert_utf16_to_utf8.py
#ifndef PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H
#define PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H

namespace simdutf {
namespace {
namespace tables {
namespace ppc64_utf16_to_utf8 {
"""

# Epilogue of the generated C++ header: closes the four namespaces opened
# in CPP_HEADER (innermost first) and then the include guard.
CPP_FOOTER = """} // ppc64_utf16_to_utf8 namespace
} // tables namespace
} // unnamed namespace
} // namespace simdutf

#endif // PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H"""


def lookup_3_byte_utf8_for_perm():
    """Print the byte indices for expanding eight words to three bytes each.

    For every word 0..7 the indices ``2*i + 1``, ``2*i + 0`` and
    ``2*i + 16`` are produced, giving 24 indices in total.  The first 16
    are printed as ``lo`` and the remaining 8 as ``hi``.  This function is
    not invoked by ``main``.
    """
    per_word_offsets = (1, 0, 16 + 0)
    indices = [
        2 * word + offset
        for word in range(8)
        for offset in per_word_offsets
    ]

    lo, hi = indices[:16], indices[16:]
    print("lo =", lo)
    print("hi =", hi)


def main():
    """Generate ``ppc64_utf16_to_utf8_tables.h`` next to the library sources.

    The target path is ``../src/ppc64/`` relative to this script's
    directory.  The file receives CPP_HEADER, then the
    ``pack_1_2_3_utf8_bytes`` table twice -- the big-endian variant under
    ``#if SIMDUTF_IS_BIG_ENDIAN`` and the little-endian variant under
    ``#else`` -- and finally CPP_FOOTER.  Each table row is the output
    length followed by the 16 shuffle indices.
    """
    name = 'ppc64_utf16_to_utf8_tables.h'
    path = normpath(Path(__file__).parent / '..' / 'src' / 'ppc64' / name)
    print(f"creating {path}")

    # Preprocessor line that introduces each variant, paired with the
    # endianness whose table follows it.
    variants = (
        ("#if SIMDUTF_IS_BIG_ENDIAN", BE),
        ("#else", LE),
    )

    # Everything printed inside this block lands in the header file.
    with open(path, 'wt') as out, redirect_stdout(out):
        print(CPP_HEADER)

        for directive, endianess in variants:
            print(directive)
            print("// 1 byte for length, 16 bytes for mask")
            print("const uint8_t pack_1_2_3_utf8_bytes[256][17] = {")
            rows = shuffle_for_conversion_1_2_3_utf8_bytes(endianess)
            for shuffle, size in rows:
                print("  {%s}," % format_array([size] + shuffle))
            print("};")
        print("#endif // SIMDUTF_IS_BIG_ENDIAN")

        print(CPP_FOOTER)


# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()