File: ppc64_convert_utf16_to_utf8.py

package info (click to toggle)
simdutf 7.7.1-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,244 kB
  • sloc: cpp: 60,074; ansic: 14,226; python: 3,364; sh: 321; makefile: 12
file content (155 lines) | stat: -rwxr-xr-x 4,422 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python3
"""
This script generates the file `ppc64_utf16_to_utf8_tables.h`, which
contains the lookup tables used in UTF-16 -> UTF-8 translation. The
lookup tables provide shuffle patterns for packing words into 1-, 2-,
or 3-byte UTF-8 characters.
"""

import sys
from os.path import normpath
from pathlib import Path
from contextlib import redirect_stdout


def format_array(array):
    """Render *array* as a comma-separated list of C integer literals.

    Negative values and the value 0x80 are both written as the literal
    ``0x80``; every other value is written in decimal.
    """
    return ', '.join(
        '0x80' if (item < 0 or item == 0x80) else str(item)
        for item in array
    )


def assure_array_length(array, size, value=0x80):
    """Pad *array* in place with *value* until it holds at least *size* items.

    Arrays that are already long enough are left untouched. Nothing is
    returned; the caller's list is modified directly.
    """
    missing = size - len(array)
    if missing > 0:
        array.extend([value] * missing)


# Endianness selectors passed to shuffle_for_conversion_1_2_3_utf8_bytes():
# LE picks the little-endian byte offsets, BE the big-endian ones.
LE = True
BE = False


def shuffle_for_conversion_1_2_3_utf8_bytes(endianess):
    """Yield the shuffle pattern for every possible 8-bit mask.

    For each mask in ``range(256)`` a tuple ``(shuffle, output_bytes)`` is
    produced, where ``shuffle`` is a 16-entry list of byte indices padded
    with 0x80, and ``output_bytes`` is the number of entries set before
    padding.

    *endianess* selects the byte offsets: ``BE`` for the big-endian
    layout, anything else for the little-endian one.
    """
    # Two 8-bit bitmasks tell how many bytes each word produces (1, 2 or 3):
    #   mask1 = ddccbbaa -- output exactly one byte (d - MSB, a - LSB)
    #   mask2 = hhggffee -- output one or two bytes
    #
    # Every bit is duplicated; in the final form the bits are interleaved:
    #   mask  = (mask1 & 0x5555) | (mask2 & 0xaaaa)
    #         = hdgcfbea
    #
    # Each two-bit subword then selects how many bytes are emitted for a word:
    #   | e | a | ea |
    #   +---+---+----+-------
    #   | 0 | 0 |  0 |  3 bytes
    #   | 0 | 1 |  1 |  never produced by the C++ code (meaningless)
    #   | 1 | 0 |  2 |  2 bytes
    #   | 1 | 1 |  3 |  1 byte

    # Unlike SSE, the PPC64 shuffle reads from **two** 16-byte vectors, which
    # is why some offsets below are biased by 16.
    if endianess == BE:
        three_bytes = (1, 0, 16 + 0)
        two_bytes = (0, 16 + 0)
        one_byte = (16 + 1,)
    else:
        three_bytes = (0, 1, 16 + 1)
        two_bytes = (1, 16 + 1)
        one_byte = (16 + 0,)

    # Subword value -> byte offsets relative to the start of the word.
    # Subword 1 is deliberately absent: it contributes no bytes.
    offsets_by_subword = {0: three_bytes, 2: two_bytes, 3: one_byte}
    padding = 0x80

    for mask in range(256):
        pattern = []
        for word in range(4):
            subword = (mask >> (2 * word)) & 0b11
            for offset in offsets_by_subword.get(subword, ()):
                pattern.append(word * 2 + offset)

        # Record the real length before the pattern is padded to 16 entries.
        used = len(pattern)
        assure_array_length(pattern, 16, padding)

        yield (pattern, used)


# Text printed at the top of the generated header: the "generated file"
# notice, the include guard and the opening of the nested namespaces.
CPP_HEADER = """// Code generated automatically; DO NOT EDIT
// file generated by scripts/ppc64_convert_utf16_to_utf8.py
#ifndef PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H
#define PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H

namespace simdutf {
namespace {
namespace tables {
namespace ppc64_utf16_to_utf8 {
"""

# Text printed at the bottom of the generated header: closes the namespaces
# opened by CPP_HEADER and ends the include guard.
CPP_FOOTER = """} // ppc64_utf16_to_utf8 namespace
} // tables namespace
} // unnamed namespace
} // namespace simdutf

#endif // PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H"""


def lookup_3_byte_utf8_for_perm():
    """Print the shuffle indices for turning eight words into 3-byte outputs.

    For each of the eight words, three indices are generated using the
    fixed offsets (1, 0, 16). The resulting 24 indices are printed as two
    lists: ``lo`` holds the first 16 and ``hi`` the remaining 8.
    """
    offsets = (1, 0, 16 + 0)
    indices = [2 * word + offset for word in range(8) for offset in offsets]

    lo, hi = indices[:16], indices[16:]
    print("lo =", lo)
    print("hi =", hi)


def main():
    """Generate ``ppc64_utf16_to_utf8_tables.h`` under ``../src/ppc64``.

    The target path is resolved relative to this script's directory. The
    file receives CPP_HEADER, the big-endian and little-endian variants of
    the ``pack_1_2_3_utf8_bytes`` table wrapped in an
    ``#if SIMDUTF_IS_BIG_ENDIAN`` / ``#else`` / ``#endif`` block, and
    CPP_FOOTER.
    """
    header_name = 'ppc64_utf16_to_utf8_tables.h'
    target = normpath(Path(__file__).parent / '..' / 'src' / 'ppc64' / header_name)
    print(f"creating {target}")

    def emit_table(endianess):
        # Each row is the output length followed by the 16 shuffle entries.
        print("// 1 byte for length, 16 bytes for mask")
        print("const uint8_t pack_1_2_3_utf8_bytes[256][17] = {")
        for pattern, length in shuffle_for_conversion_1_2_3_utf8_bytes(endianess):
            print("  {%s}," % format_array([length] + pattern))
        print("};")

    # Everything printed inside this block goes to the header file.
    with open(target, 'wt') as out, redirect_stdout(out):
        print(CPP_HEADER)

        print("#if SIMDUTF_IS_BIG_ENDIAN")
        emit_table(BE)
        print("#else")
        emit_table(LE)
        print("#endif // SIMDUTF_IS_BIG_ENDIAN")

        print(CPP_FOOTER)


# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    main()