File: utf8_length_from_utf16_tests.cpp

package info (click to toggle)
simdutf 8.0.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,524 kB
  • sloc: cpp: 64,498; ansic: 15,347; python: 3,592; sh: 366; makefile: 12
file content (202 lines) | stat: -rw-r--r-- 7,084 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
// The goal of these tests is to trigger the case where a high surrogate (the
// first unit of a surrogate pair) is the last char16_t handled by the
// vectorized code, and the single remaining char16_t (the low surrogate) is
// passed to the scalar code.

#include "simdutf.h"

#include <tests/helpers/compiletime_conversions.h>
#include <tests/helpers/fixed_string.h>
#include <tests/helpers/test.h>

TEST(utf16le_surrogate_pair) {
  // Place one well-formed surrogate pair after 0..511 zero code units so that
  // the pair lands at every possible offset of the UTF-16LE input.
  for (size_t zero_units = 0; zero_units < 512; ++zero_units) {
    // Two zero bytes per leading U+0000 code unit.
    std::vector<uint8_t> input(zero_units * 2, 0);

    // U+D801 (high surrogate) followed by U+DC01 (low surrogate), each stored
    // least-significant byte first.
    input.insert(input.end(), {0x01, 0xd8, 0x01, 0xdc});

    const auto *units = reinterpret_cast<const char16_t *>(input.data());
    const size_t unit_count = input.size() / 2;

    // One UTF-8 byte per U+0000 plus four bytes for the surrogate pair.
    const size_t want = zero_units + 4;
    const size_t got =
        implementation.utf8_length_from_utf16le(units, unit_count);

    ASSERT_EQUAL(want, got);

    // The replacement-aware variant must report the same byte count; the test
    // expects the SURROGATE code even though the pair is well formed.
    const simdutf::result got_with_replacement =
        implementation.utf8_length_from_utf16le_with_replacement(units,
                                                                 unit_count);
    ASSERT_EQUAL(want, got_with_replacement.count);
    ASSERT_EQUAL(simdutf::SURROGATE, got_with_replacement.error);
  }
}

TEST(utf16be_surrogate_pair) {
  // Place one well-formed surrogate pair after 0..511 zero code units so that
  // the pair lands at every possible offset of the UTF-16BE input.
  for (size_t zero_units = 0; zero_units < 512; ++zero_units) {
    // Two zero bytes per leading U+0000 code unit.
    std::vector<uint8_t> input(zero_units * 2, 0);

    // U+D801 (high surrogate) followed by U+DC01 (low surrogate), each stored
    // most-significant byte first.
    input.insert(input.end(), {0xd8, 0x01, 0xdc, 0x01});

    const auto *units = reinterpret_cast<const char16_t *>(input.data());
    const size_t unit_count = input.size() / 2;

    // One UTF-8 byte per U+0000 plus four bytes for the surrogate pair.
    const size_t want = zero_units + 4;
    const size_t got =
        implementation.utf8_length_from_utf16be(units, unit_count);

    ASSERT_EQUAL(want, got);

    // The replacement-aware variant must report the same byte count; the test
    // expects the SURROGATE code even though the pair is well formed.
    const simdutf::result got_with_replacement =
        implementation.utf8_length_from_utf16be_with_replacement(units,
                                                                 unit_count);
    ASSERT_EQUAL(want, got_with_replacement.count);
    ASSERT_EQUAL(simdutf::SURROGATE, got_with_replacement.error);
  }
}

TEST(issue001) {
  // Eight code units in native byte order. 0xdbba 0xdd90 is a correctly
  // ordered (high, low) surrogate pair; every other unit is a BMP character.
  const std::vector<char16_t> input{0x004e, 0x000e, 0xdbba, 0xdd90,
                                    0x030b, 0x0035, 0x004f, 0x0045};
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();

  // Expected UTF-8 size: 1 + 1 + 4 (pair) + 2 (U+030B) + 1 + 1 + 1 = 11 bytes.
  // Only the entry points matching the host byte order are exercised, because
  // the literals above are stored in native order.
#if SIMDUTF_IS_BIG_ENDIAN
  const size_t standard =
      implementation.utf8_length_from_utf16be(units, unit_count);
  ASSERT_EQUAL(standard, 11);
  const simdutf::result result1 =
      implementation.utf8_length_from_utf16be_with_replacement(units,
                                                               unit_count);
  ASSERT_EQUAL(result1.count, 11);
  ASSERT_EQUAL(simdutf::SURROGATE, result1.error);
#else
  const size_t standard =
      implementation.utf8_length_from_utf16le(units, unit_count);
  ASSERT_EQUAL(standard, 11);
  const simdutf::result result2 =
      implementation.utf8_length_from_utf16le_with_replacement(units,
                                                               unit_count);
  ASSERT_EQUAL(result2.count, 11);
  ASSERT_EQUAL(simdutf::SURROGATE, result2.error);
#endif
}

TEST(issue002) {
  // Three code units in native byte order: a correctly ordered (high, low)
  // surrogate pair followed by U+002D.
  const std::vector<char16_t> input{0xd950, 0xdd9a, 0x002d};
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();

  // Expected UTF-8 size: 4 (pair) + 1 = 5 bytes. Only the entry points
  // matching the host byte order are exercised, because the literals above
  // are stored in native order.
#if SIMDUTF_IS_BIG_ENDIAN
  const size_t standard =
      implementation.utf8_length_from_utf16be(units, unit_count);
  ASSERT_EQUAL(standard, 5);
  const simdutf::result result1 =
      implementation.utf8_length_from_utf16be_with_replacement(units,
                                                               unit_count);
  ASSERT_EQUAL(result1.count, 5);
  ASSERT_EQUAL(simdutf::SURROGATE, result1.error);
#else
  const size_t standard =
      implementation.utf8_length_from_utf16le(units, unit_count);
  ASSERT_EQUAL(standard, 5);
  const simdutf::result result2 =
      implementation.utf8_length_from_utf16le_with_replacement(units,
                                                               unit_count);
  ASSERT_EQUAL(result2.count, 5);
  ASSERT_EQUAL(simdutf::SURROGATE, result2.error);
#endif
}

TEST(bug_found_in_release_7_7_0) {
  // Regression test. In native byte order the input below is invalid UTF-16:
  // a stray low surrogate (0xDFFF) sits between two well-formed pairs. Per
  // the original report, utf8_length_from_utf16_with_replacement and
  // utf8_length_from_utf16 happen to disagree on this input, and the value
  // returned by utf8_length_from_utf16 for invalid input is implementation
  // defined. No particular length is therefore asserted; the test only
  // requires the native-endian entry point to agree with the explicit-endian
  // entry point that matches the host byte order.
  const std::vector<char16_t> input{0xD800, 0xDC00, 0xDFFF, 0xD800, 0xDC00};
  const char16_t *const units = input.data();
  const size_t unit_count = input.size();

  const bool valid = simdutf::validate_utf16(units, unit_count);
  ASSERT_FALSE(valid);

  const size_t native_length =
      simdutf::utf8_length_from_utf16(units, unit_count);
  const size_t be_length =
      simdutf::utf8_length_from_utf16be(units, unit_count);
  const size_t le_length =
      simdutf::utf8_length_from_utf16le(units, unit_count);
#if SIMDUTF_IS_BIG_ENDIAN
  // The opposite-endian length is computed but deliberately not compared.
  (void)le_length;
  ASSERT_EQUAL(native_length, be_length);
#else
  // The opposite-endian length is computed but deliberately not compared.
  (void)be_length;
  ASSERT_EQUAL(native_length, le_length);
#endif
}

#if SIMDUTF_CPLUSPLUS23

namespace {
// makes a malformed string in the requested endianness
template <simdutf::endianness e> constexpr auto make_malformed() {
  simdutf::tests::helpers::CTString<
      char16_t, 5,
      e == simdutf::endianness::BIG ? std::endian::big : std::endian::little>
      data{};
  data[2] = simdutf::scalar::utf16::swap_if_needed<e>(0xD800);
  return data;
}
} // namespace

TEST(compile_time_utf8_length_from_utf16_with_replacement) {
  using namespace simdutf::tests::helpers;
  using enum simdutf::endianness;

  // Every check is a static_assert, so this test passes by compiling.
  constexpr auto bad_input = make_malformed<NATIVE>();
  constexpr auto repaired_input = to_wellformed(bad_input);

  constexpr simdutf::result with_replacement =
      simdutf::utf8_length_from_utf16_with_replacement(bad_input);
  constexpr size_t reference_length =
      simdutf::utf8_length_from_utf16(repaired_input);

  static_assert(with_replacement.error == simdutf::SURROGATE);
  // Two bytes more than the unit count -- consistent with the lone surrogate
  // being counted as a 3-byte replacement and the other units as 1 byte each.
  static_assert(with_replacement.count == bad_input.size() + 2);
  // Must agree with the plain length of the to_wellformed() version.
  static_assert(with_replacement.count == reference_length);
}

TEST(compile_time_utf8_length_from_utf16le_with_replacement) {
  using namespace simdutf::tests::helpers;
  using enum simdutf::endianness;

  // Every check is a static_assert, so this test passes by compiling.
  constexpr auto bad_input = make_malformed<LITTLE>();
  constexpr auto repaired_input = to_wellformed(bad_input);

  constexpr simdutf::result with_replacement =
      simdutf::utf8_length_from_utf16le_with_replacement(bad_input);
  constexpr size_t reference_length =
      simdutf::utf8_length_from_utf16le(repaired_input);

  static_assert(with_replacement.error == simdutf::SURROGATE);
  // Two bytes more than the unit count -- consistent with the lone surrogate
  // being counted as a 3-byte replacement and the other units as 1 byte each.
  static_assert(with_replacement.count == bad_input.size() + 2);
  // Must agree with the plain length of the to_wellformed() version.
  static_assert(with_replacement.count == reference_length);
}

TEST(compile_time_utf8_length_from_utf16be_with_replacement) {
  using namespace simdutf::tests::helpers;
  using enum simdutf::endianness;

  // Every check is a static_assert, so this test passes by compiling.
  constexpr auto bad_input = make_malformed<BIG>();
  constexpr auto repaired_input = to_wellformed(bad_input);

  constexpr simdutf::result with_replacement =
      simdutf::utf8_length_from_utf16be_with_replacement(bad_input);
  constexpr size_t reference_length =
      simdutf::utf8_length_from_utf16be(repaired_input);

  static_assert(with_replacement.error == simdutf::SURROGATE);
  // Two bytes more than the unit count -- consistent with the lone surrogate
  // being counted as a 3-byte replacement and the other units as 1 byte each.
  static_assert(with_replacement.count == bad_input.size() + 2);
  // Must agree with the plain length of the to_wellformed() version.
  static_assert(with_replacement.count == reference_length);
}

#endif

// Test-harness entry point macro; presumably defines main() and runs the TEST
// cases declared above -- confirm in tests/helpers/test.h.
TEST_MAIN