1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
|
#include "random_utf8.h"
#include <utility>
#include <vector>
namespace simdutf {
namespace tests {
namespace helpers {
// Seeds the generator with `seed` and hands the four prob_* values, converted
// to double and in 1-, 2-, 3-, 4-byte order, to `bytes_count`.
// NOTE(review): `bytes_count` is declared in the header; presumably it is a
// discrete distribution that treats these values as relative weights -- confirm.
random_utf8::random_utf8(uint32_t seed, int prob_1byte, int prob_2bytes,
                         int prob_3bytes, int prob_4bytes)
    : gen(seed),
      bytes_count({static_cast<double>(prob_1byte),
                   static_cast<double>(prob_2bytes),
                   static_cast<double>(prob_3bytes),
                   static_cast<double>(prob_4bytes)}) {}
std::pair<std::vector<uint8_t>, size_t>
random_utf8::generate_counted(size_t output_bytes) {
std::vector<uint8_t> result;
result.reserve(output_bytes);
uint8_t candidate, head;
size_t count{0};
while (result.size() < output_bytes) {
count++;
switch (bytes_count(gen)) {
case 0: // 1 byte
candidate = uint8_t(val_7bit(gen));
while (candidate == 0) { // though strictly speaking, a stream of nulls is
// UTF8, it tends to break some code
candidate = uint8_t(val_7bit(gen));
}
result.push_back(candidate);
break;
case 1: // 2 bytes
candidate = 0xc0 | uint8_t(val_5bit(gen));
while (candidate < 0xC2) {
candidate = 0xc0 | uint8_t(val_5bit(gen));
}
result.push_back(candidate);
result.push_back(0x80 | uint8_t(val_6bit(gen)));
break;
case 2: // 3 bytes
head = 0xe0 | uint8_t(val_4bit(gen));
result.push_back(head);
candidate = 0x80 | uint8_t(val_6bit(gen));
if (head == 0xE0) {
while (candidate < 0xA0) {
candidate = 0x80 | uint8_t(val_6bit(gen));
}
} else if (head == 0xED) {
while (candidate > 0x9F) {
candidate = 0x80 | uint8_t(val_6bit(gen));
}
}
result.push_back(candidate);
result.push_back(0x80 | uint8_t(val_6bit(gen)));
break;
case 3: // 4 bytes
head = 0xf0 | uint8_t(val_3bit(gen));
while (head > 0xF4) {
head = 0xf0 | uint8_t(val_3bit(gen));
}
result.push_back(head);
candidate = 0x80 | uint8_t(val_6bit(gen));
if (head == 0xF0) {
while (candidate < 0x90) {
candidate = 0x80 | uint8_t(val_6bit(gen));
}
} else if (head == 0xF4) {
while (candidate > 0x8F) {
candidate = 0x80 | uint8_t(val_6bit(gen));
}
}
result.push_back(candidate);
result.push_back(0x80 | uint8_t(val_6bit(gen)));
result.push_back(0x80 | uint8_t(val_6bit(gen)));
break;
}
}
result.push_back(0); // EOS for scalar code
return make_pair(result, count);
}
std::vector<uint8_t> random_utf8::generate(size_t output_bytes) {
return generate_counted(output_bytes).first;
}
std::vector<uint8_t> random_utf8::generate(size_t output_bytes, long seed) {
gen.seed(uint32_t(seed));
return generate(output_bytes);
}
} // namespace helpers
} // namespace tests
} // namespace simdutf
|