// Scalar reference implementation of UTF-8 validation (simdutf::tests::reference).
#include "validate_utf8.h"
namespace simdutf {
namespace tests {
namespace reference {
// credit: based on code from Google Fuchsia (Apache Licensed)
//
// Scalar, byte-at-a-time UTF-8 validator.
//
// Walks the `len` bytes starting at `buf` and returns true only if every
// byte belongs to a well-formed sequence:
//   - 1 byte : 0xxxxxxx
//   - 2 bytes: 110xxxxx 10xxxxxx                    (U+0080  .. U+07FF)
//   - 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx           (U+0800  .. U+FFFF,
//                                                    surrogates excluded)
//   - 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  (U+10000 .. U+10FFFF)
// Truncated sequences, stray continuation bytes, lead bytes of the form
// 11111xxx, overlong encodings and out-of-range code points all yield false.
// An empty input (len == 0) is valid.
simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  uint64_t idx = 0;

  while (idx < len) {
    const uint8_t lead = bytes[idx];

    // Fast path: a byte with the high bit clear is a complete code point.
    if ((lead & 0x80) == 0) {
      idx += 1;
      continue;
    }

    // Classify the lead byte: total sequence length, the payload bits it
    // contributes, and the code point range that this length may encode.
    uint64_t seq_len = 0;
    uint32_t cp = 0;
    uint32_t lowest = 0;
    uint32_t highest = 0;
    if ((lead >> 5) == 0x06) { // 110xxxxx
      seq_len = 2;
      cp = lead & 0x1F;
      lowest = 0x80;
      highest = 0x7FF;
    } else if ((lead >> 4) == 0x0E) { // 1110xxxx
      seq_len = 3;
      cp = lead & 0x0F;
      lowest = 0x800;
      highest = 0xFFFF;
    } else if ((lead >> 3) == 0x1E) { // 11110xxx
      seq_len = 4;
      cp = lead & 0x07;
      lowest = 0x10000;
      highest = 0x10FFFF;
    } else {
      // Either a continuation byte in lead position or a 11111xxx byte.
      return false;
    }

    // The whole sequence must fit in the buffer; this is checked before any
    // continuation byte is read.
    const uint64_t end = idx + seq_len;
    if (end > len) {
      return false;
    }

    // Every trailing byte must look like 10xxxxxx; fold its six payload bits
    // into the code point as we go.
    for (uint64_t k = idx + 1; k < end; k++) {
      const uint8_t trail = bytes[k];
      if ((trail >> 6) != 0x02) {
        return false;
      }
      cp = (cp << 6) | (trail & 0x3F);
    }

    // Reject overlong encodings (below `lowest`) and values above `highest`.
    if (cp < lowest || cp > highest) {
      return false;
    }
    // Reject UTF-16 surrogates. Only a three-byte sequence can reach this
    // range once the check above has passed.
    if (cp >= 0xD800 && cp <= 0xDFFF) {
      return false;
    }

    idx = end;
  }

  return true;
}
} // namespace reference
} // namespace tests
} // namespace simdutf
|