File: sse_validate_utf16le_testcases.py

package info (click to toggle)
simdutf 7.7.1-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,244 kB
  • sloc: cpp: 60,074; ansic: 14,226; python: 3,364; sh: 321; makefile: 12
file content (129 lines) | stat: -rw-r--r-- 2,787 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from itertools import product
from random import randint, seed
from sse_validate_utf16le_proof import bitmask


# This is a copy of the code from sse_validate_utf16le_proof.py, with
# the mask adjusted for the 16-bit base.
def mask(words):
    """Return a 16-bit mask of the positions in *words* considered valid.

    A position counts as valid when it holds neither 'L' nor 'H', or when
    it takes part in an 'L' immediately followed by an 'H'.

    NOTE(review): this presumes bitmask(words, state) returns an int with
    bit i set when words[i] == state -- confirm against
    sse_validate_utf16le_proof.py.
    """
    low = bitmask(words, 'L')
    high = bitmask(words, 'H')

    # Positions that are neither 'L' nor 'H', clipped to 16 bits.
    plain = ~(low | high) & 0xffff

    # Bit i is set when position i is 'L' and position i + 1 is 'H'.
    pair_first = low & (high >> 1)
    # The matching 'H' sits one position further.
    pair_second = pair_first << 1

    return plain | pair_first | pair_second


class Record:
    """A single test case: a list of word symbols plus its expected verdict.

    Printing a record gives 'T' or 'F' (the value of is_valid) followed by
    the concatenated symbols.
    """

    def __init__(self):
        # Symbols collected so far, in insertion order.
        self.words = []

    def add(self, word):
        """Append one symbol to the record."""
        self.words.append(word)

    @property
    def is_valid(self):
        """Tell whether the collected words should be reported as valid."""
        bits = mask(self.words)

        if bits == 0xffff:
            return True

        if bits != 0x7fff:
            return False

        # Only the topmost position failed: in test we reject cases when
        # 'L' or 'H' ends a chunk.
        return self.words[-1] not in ('L', 'H')

    def __str__(self):
        joined = ''.join(self.words)
        verdict = 'T' if self.is_valid else 'F'
        return verdict + joined


def test_words():
    """Return every distinct generated sequence as a sorted list of tuples."""
    # Sequences are converted to tuples right away so they can live in a set,
    # which drops the duplicates produced by the overlapping generators.
    unique = {tuple(sequence) for sequence in test_words_aux()}
    return sorted(unique)


def test_words_aux():
    """Yield 16-element lists of 'V', 'L' and 'H' symbols used as test input.

    Sequences may repeat; de-duplication is left to the caller. Every
    yielded list is a fresh object, so callers may keep or mutate it
    without affecting sequences produced later.
    """
    size = 16

    # 1. all valid
    yield ['V'] * size

    # 2. only low surrogates
    yield ['L'] * size

    # 3. only high surrogates
    yield ['H'] * size

    # 4. sole low surrogate, then 5. sole high surrogate
    for surrogate in 'LH':
        for i in range(size):
            seq = ['V'] * size
            seq[i] = surrogate
            yield seq

    # 6. scattered three surrogates
    # The indices may coincide; in that case the later assignment wins.
    for i, j, k in product(range(size), repeat=3):
        for a, b, c in product('LH', repeat=3):
            # Build a new list per case: yielding one shared list and
            # mutating it afterwards would alias all stored results.
            seq = ['V'] * size
            seq[i] = a
            seq[j] = b
            seq[k] = c
            yield seq

    # To cover all 16-element inputs we would need 3**16 cases (43'046'721)
    # Instead, we cover all possible 6-element combinations (3**6 = 729)
    # and move it within 16-element input. This yields 729 * 11 cases.
    k = 6
    for combination in product('VLH', repeat=k):
        # size - k + 1 start positions, so the window also reaches the
        # last element of the input.
        for position in range(size - k + 1):
            seq = ['V'] * size
            seq[position:position + k] = combination
            yield seq


# Header text that write_file() emits before the first test case.
TXT = """# generated by scripts/sse_validate_utf16le_testcases.py
"""

def write_file(file):
    """Write the header and one line per test case to *file*.

    *file* is any object with a write() method accepting str. Each line is
    the string form of a Record built from one sequence of test_words().
    """
    emit = file.write
    emit(TXT)

    for sequence in test_words():
        entry = Record()
        for symbol in sequence:
            entry.add(symbol)

        emit(str(entry))
        emit('\n')


def main():
    """Generate validate_utf16_testcases.txt in the current directory."""
    # Fix the random seed; the code visible here draws no random numbers.
    seed(0)
    with open('validate_utf16_testcases.txt', 'w') as output:
        write_file(output)


# Generate the test-case file only when run as a script, not on import.
if __name__ == '__main__':
    main()