File: validate_utf8_with_errors_tests.cpp

package info (click to toggle)
simdutf 7.7.1-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,244 kB
  • sloc: cpp: 60,074; ansic: 14,226; python: 3,364; sh: 321; makefile: 12
file content (179 lines) | stat: -rw-r--r-- 7,074 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#include "simdutf.h"

#include <tests/helpers/random_utf8.h>
#include <tests/helpers/test.h>

// Regression case cbf29ce4842223f0: 63 ASCII spaces followed by a single 0xff
// byte (64 bytes in total). The expected outcome is a HEADER_BITS error
// reported at index 63, the position of the 0xff byte.
TEST(validate_utf8_with_errors_cbf29ce4842223f0) {
  const unsigned char data[] = {
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // bytes  0..7
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // bytes  8..15
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // bytes 16..23
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // bytes 24..31
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // bytes 32..39
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // bytes 40..47
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // bytes 48..55
      0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xff, // bytes 56..63
  };
  // Elements are single bytes, so the element count equals the byte count.
  constexpr std::size_t data_len = sizeof(data) / sizeof(data[0]);
  const auto validation1 = implementation.validate_utf8_with_errors(
      reinterpret_cast<const char *>(data), data_len);
  // Returns previously observed for this input, per implementation:
  //   icelake  -> [count=64, error=SUCCESS]
  //   haswell  -> [count=63, error=HEADER_BITS]
  //   westmere -> [count=63, error=HEADER_BITS]
  //   fallback -> [count=63, error=HEADER_BITS]
  ASSERT_EQUAL(validation1.count, 63);
  ASSERT_EQUAL(validation1.error, simdutf::error_code::HEADER_BITS);
}

// https://github.com/nodejs/node/issues/48995
// A one-byte input consisting only of 0x80 must be reported as an error
// (any non-zero error code satisfies the check).
TEST(node48995) {
  const char lone_byte[] = {static_cast<char>(0x80)};
  const size_t lone_byte_len = sizeof(lone_byte);
  const simdutf::result res =
      implementation.validate_utf8_with_errors(lone_byte, lone_byte_len);
  ASSERT_TRUE(res.error);
}

// The two-byte sequence C2 A9 (the copyright sign, U+00A9) must validate
// successfully.
TEST(copyright) {
  const char copyright_sign[] = {'\xC2', '\xA9'};
  const size_t copyright_sign_len = sizeof(copyright_sign);
  const simdutf::result res = implementation.validate_utf8_with_errors(
      copyright_sign, copyright_sign_len);
  ASSERT_EQUAL(res.error, simdutf::error_code::SUCCESS);
}

// Unmodified generator output must validate with SUCCESS, and the reported
// count must equal the full input length.
// NOTE(review): the generator arguments {seed, 1, 1, 1, 1} presumably give
// equal weight to 1- to 4-byte sequences -- confirm against random_utf8.
TEST_LOOP(no_error) {
  simdutf::tests::helpers::random_utf8 source{seed, 1, 1, 1, 1};
  const auto utf8 = source.generate(512, seed);
  const char *const input = reinterpret_cast<const char *>(utf8.data());
  const simdutf::result res =
      implementation.validate_utf8_with_errors(input, utf8.size());
  ASSERT_EQUAL(res.error, simdutf::error_code::SUCCESS);
  ASSERT_EQUAL(res.count, utf8.size());
}

// For each non-continuation byte among the first 512 bytes, overwrite it with
// 0b11111000 and expect HEADER_BITS reported at exactly that index. The byte
// is put back afterwards so every iteration starts from the same buffer.
TEST_LOOP(header_bits_error) {
  simdutf::tests::helpers::random_utf8 source{seed, 1, 1, 1, 1};
  auto utf8 = source.generate(512, seed);

  for (unsigned int i = 0; i < 512; i++) {
    const bool is_continuation = (utf8[i] & 0b11000000) == 0b10000000;
    if (is_continuation) {
      continue; // Only leading bytes are corrupted
    }
    const unsigned char saved = utf8[i];
    utf8[i] = uint8_t(0b11111000);
    const simdutf::result res = implementation.validate_utf8_with_errors(
        reinterpret_cast<const char *>(utf8.data()), utf8.size());
    ASSERT_EQUAL(res.error, simdutf::error_code::HEADER_BITS);
    ASSERT_EQUAL(res.count, i);
    utf8[i] = saved;
  }
}

// For each continuation byte among the first 512 bytes, turn it into the
// leading byte 0b11100000. The sequence that owned it loses a continuation
// byte, so TOO_SHORT is expected at the index of the most recent
// non-continuation byte seen before it.
TEST_LOOP(too_short_error) {
  simdutf::tests::helpers::random_utf8 source{seed, 1, 1, 1, 1};
  auto utf8 = source.generate(512, seed);
  int leading_byte_pos = 0;
  for (int i = 0; i < 512; i++) {
    const bool is_continuation = (utf8[i] & 0b11000000) == 0b10000000;
    if (!is_continuation) {
      // Remember where the current sequence starts; nothing to corrupt here.
      leading_byte_pos = i;
      continue;
    }
    const unsigned char saved = utf8[i];
    utf8[i] = uint8_t(0b11100000);
    const simdutf::result res = implementation.validate_utf8_with_errors(
        reinterpret_cast<const char *>(utf8.data()), utf8.size());
    ASSERT_EQUAL(res.error, simdutf::error_code::TOO_SHORT);
    ASSERT_EQUAL(res.count, static_cast<unsigned>(leading_byte_pos));
    utf8[i] = saved;
  }
}

// For each non-continuation byte at indices 1..511, replace it with the
// continuation byte 0b10000000 and expect TOO_LONG at that index. Index 0 is
// skipped by the loop bounds.
TEST_LOOP(too_long_error) {
  simdutf::tests::helpers::random_utf8 source{seed, 1, 1, 1, 1};
  auto utf8 = source.generate(512, seed);
  for (unsigned int i = 1; i < 512; i++) {
    const bool is_continuation = (utf8[i] & 0b11000000) == 0b10000000;
    if (is_continuation) {
      continue; // Only leading bytes are turned into continuation bytes
    }
    const unsigned char saved = utf8[i];
    utf8[i] = uint8_t(0b10000000);
    const simdutf::result res = implementation.validate_utf8_with_errors(
        reinterpret_cast<const char *>(utf8.data()), utf8.size());
    ASSERT_EQUAL(res.error, simdutf::error_code::TOO_LONG);
    ASSERT_EQUAL(res.count, i);
    utf8[i] = saved;
  }
}

// For each non-ASCII leading byte at indices 1..511, rewrite the first two
// bytes of its sequence so the encoded value is too small for the sequence
// length, and expect OVERLONG at the leading byte's index. Both bytes are put
// back afterwards.
TEST_LOOP(overlong_error) {
  simdutf::tests::helpers::random_utf8 source{seed, 1, 1, 1, 1};
  auto utf8 = source.generate(512, seed);
  for (unsigned int i = 1; i < 512; i++) {
    if (utf8[i] < 0b11000000) {
      continue; // ASCII and continuation bytes cannot start an overlong form
    }
    const unsigned char lead = utf8[i];
    const unsigned char follower = utf8[i + 1];

    // Default: four-byte case, value forced to at most 0xffff.
    unsigned char forced_lead = 0b11110000;
    unsigned char follower_keep = 0b11001111;
    if ((lead & 0b11100000) == 0b11000000) {
      // Two-byte case, value forced to at most 0x7f; second byte untouched.
      forced_lead = 0b11000000;
      follower_keep = 0b11111111;
    } else if ((lead & 0b11110000) == 0b11100000) {
      // Three-byte case, value forced to at most 0x7ff.
      forced_lead = 0b11100000;
      follower_keep = 0b11011111;
    }
    utf8[i] = forced_lead;
    utf8[i + 1] = follower & follower_keep;

    const simdutf::result res = implementation.validate_utf8_with_errors(
        reinterpret_cast<const char *>(utf8.data()), utf8.size());
    ASSERT_EQUAL(res.error, simdutf::error_code::OVERLONG);
    ASSERT_EQUAL(res.count, i);
    utf8[i] = lead;
    utf8[i + 1] = follower;
  }
}

// For each four-byte leading byte (0b11110xxx) at indices 1..511, raise the
// byte while keeping the 0b11110 prefix and expect TOO_LARGE at that index.
// The original byte is saved and written back afterwards, so the buffer is
// exactly the generated data again before the next iteration.
TEST_LOOP(too_large_error) {
  simdutf::tests::helpers::random_utf8 generator{seed, 1, 1, 1, 1};
  auto utf8{generator.generate(512, seed)};
  for (unsigned int i = 1; i < 512; i++) {
    if ((utf8[i] & 0b11111000) ==
        0b11110000) { // Can only have too large error in 4-bytes case
      const unsigned char old = utf8[i];
      // Add 0b10 when bit 2 is already set, 0b100 otherwise: either way the
      // result stays of the form 0b11110xxx, so we get a too large error and
      // not a header bits error.
      utf8[i] += ((utf8[i] & 0b100) == 0b100) ? 0b10 : 0b100;
      simdutf::result res = implementation.validate_utf8_with_errors(
          reinterpret_cast<const char *>(utf8.data()), utf8.size());
      ASSERT_EQUAL(res.error, simdutf::error_code::TOO_LARGE);
      ASSERT_EQUAL(res.count, i);
      // Restore the saved byte rather than subtracting a fixed amount: the
      // increment above is not always 0b100.
      utf8[i] = old;
    }
  }
}

// For each three-byte leading byte (0b1110xxxx) at indices 1..511, force the
// leading byte to 0xED and sweep bits 2..5 of the second byte over every
// value that has bit 5 set (second byte 0xA0..0xBF). Each combination encodes
// a surrogate code point (U+D800..U+DFFF), so SURROGATE is expected at the
// leading byte's index. Both bytes are put back afterwards.
TEST_LOOP(surrogate_error) {
  simdutf::tests::helpers::random_utf8 generator{seed, 1, 1, 1, 1};
  auto utf8{generator.generate(512, seed)};
  for (unsigned int i = 1; i < 512; i++) {
    if ((utf8[i] & 0b11110000) ==
        0b11100000) { // Can only have surrogate error in 3-bytes case
      const unsigned char old = utf8[i];
      const unsigned char second_old = utf8[i + 1];
      utf8[i] = 0b11101101; // Leading byte is always the same
      // s supplies bits 2..5 of the second byte; 0x8..0xf are exactly the
      // values with bit 5 set. The bound is inclusive so that 0xf (second
      // byte 0xBC..0xBF, i.e. U+DF00..U+DFFF) is covered as well.
      for (int s = 0x8; s <= 0xf; s++) {
        utf8[i + 1] = (utf8[i + 1] & 0b11000011) | (s << 2);
        simdutf::result res = implementation.validate_utf8_with_errors(
            reinterpret_cast<const char *>(utf8.data()), utf8.size());
        ASSERT_EQUAL(res.error, simdutf::error_code::SURROGATE);
        ASSERT_EQUAL(res.count, i);
      }
      utf8[i] = old;
      utf8[i + 1] = second_old;
    }
  }
}

TEST_MAIN