1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
|
#ifndef MISC_UTF8ITER_HPP
#define MISC_UTF8ITER_HPP
#include <cstdint>
#include <cstring>
#include <string>
#include <string_view>
#include <tuple>
// Forward-only decoder over a range of bytes holding UTF-8 text, plus static
// helpers for decoding a single code point and for lower-casing a string.
//
// The stream only stores pointers into the caller's buffer; it never copies or
// owns the bytes, so the buffer must outlive the stream.
class Utf8Stream
{
public:
    using UnicodeChar = uint32_t;
    using Point = unsigned char const*;

    // static const unicode_char sBadChar = 0xFFFFFFFF; gcc can't handle this
    // Sentinel value reported whenever a code point cannot be decoded.
    static UnicodeChar sBadChar() { return UnicodeChar(0xFFFFFFFF); }

    // Reads the bytes in [begin, end).
    Utf8Stream(Point begin, Point end)
        : cur(begin)
        , nxt(begin)
        , end(end)
        , val(Utf8Stream::sBadChar())
    {
    }

    // Reads a NUL-terminated C string (the terminator itself is not part of the range).
    Utf8Stream(const char* str)
        : Utf8Stream(reinterpret_cast<Point>(str), reinterpret_cast<Point>(str) + std::strlen(str))
    {
    }

    // Reads the bytes in [range.first, range.second).
    Utf8Stream(std::pair<Point, Point> range)
        : Utf8Stream(range.first, range.second)
    {
    }

    // Reads the bytes viewed by str.
    Utf8Stream(std::string_view str)
        : Utf8Stream(reinterpret_cast<Point>(str.data()), reinterpret_cast<Point>(str.data() + str.size()))
    {
    }

    // True once every byte of the range has been consumed.
    bool eof() const { return cur == end; }

    // Pointer to the first byte that has not been consumed yet.
    Point current() const { return cur; }

    // Returns the next code point without consuming it.
    // Yields sBadChar() for a malformed sequence or when the stream is at eof().
    UnicodeChar peek()
    {
        if (cur == nxt)
            next();
        return val;
    }

    // Returns the next code point and moves past it.
    // Yields sBadChar() for a malformed sequence or when the stream is at eof();
    // at eof() the position does not move.
    UnicodeChar consume()
    {
        if (cur == nxt)
            next();
        cur = nxt;
        return val;
    }

    // True when value is a single-byte (7-bit) character.
    static bool isAscii(unsigned char value) { return (value & 0x80) == 0; }

    // Decodes one code point starting at cur, never reading at or beyond end.
    // Returns the decoded value (or sBadChar()) and the position decoding stopped at.
    // An empty range yields sBadChar() with the position unchanged.
    static std::pair<UnicodeChar, Point> decode(Point cur, Point end)
    {
        // Nothing left to read: do not touch *cur.
        if (cur >= end)
            return std::make_pair(sBadChar(), cur);

        const unsigned char lead = *cur++;
        if (isAscii(lead))
            return std::make_pair(UnicodeChar(lead), cur);

        const std::pair<std::size_t, UnicodeChar> header = getOctetCount(lead);
        return decode(cur, end, header.second, header.first);
    }

    // Decodes the continuation bytes of a multi-byte sequence.
    // cur points just past the lead byte, chr holds the payload bits taken from
    // the lead byte and octets is the number of continuation bytes expected.
    // Returns sBadChar() when octets is out of range, when the sequence is cut
    // short by end, or when a byte lacks the 10xxxxxx continuation mark; in the
    // last case the returned position is that offending byte.
    static std::pair<UnicodeChar, Point> decode(Point cur, Point end, UnicodeChar chr, std::size_t octets)
    {
        if (octets > 5)
            return std::make_pair(sBadChar(), cur);

        // Compare lengths instead of forming cur + octets, which could point
        // past the end of the buffer.
        const auto remaining = end - cur;
        if (remaining < 0 || static_cast<std::size_t>(remaining) < octets)
            return std::make_pair(sBadChar(), cur);

        const Point eoc = cur + octets;
        while (cur != eoc)
        {
            if ((*cur & 0xC0) != 0x80) // check continuation mark
                return std::make_pair(sBadChar(), cur);
            chr = (chr << 6) | UnicodeChar((*cur++) & 0x3F);
        }
        return std::make_pair(chr, cur);
    }

    // Maps an upper-case code point to its lower-case counterpart for the
    // handful of alphabets handled here; every other value is returned unchanged.
    static UnicodeChar toLowerUtf8(UnicodeChar ch)
    {
        // Russian alphabet
        if (ch >= 0x0410 && ch < 0x0430)
            return ch + 0x20;
        // Cyrillic IO character
        if (ch == 0x0401)
            return ch + 0x50;
        // Latin alphabet: 'A'..'Z' only. The punctuation that follows 'Z'
        // (0x5B..0x5F: [ \ ] ^ _) must stay untouched.
        if (ch >= 0x41 && ch <= 0x5A)
            return ch + 0x20;
        // German characters
        if (ch == 0xc4 || ch == 0xd6 || ch == 0xdc)
            return ch + 0x20;
        if (ch == 0x1e9e)
            return 0xdf;
        // TODO: probably we will need to support characters from other languages
        return ch;
    }

    // Decodes str as UTF-8, lower-cases each code point with toLowerUtf8() and
    // re-encodes the result. Undecodable input is encoded from sBadChar(),
    // which produces the bytes F7 BF BF BF.
    static std::string lowerCaseUtf8(std::string_view str)
    {
        std::string out;
        out.reserve(str.length());

        Utf8Stream stream(str);
        while (!stream.eof())
            appendUtf8(out, toLowerUtf8(stream.consume()));
        return out;
    }

    // Inspects the lead byte of a multi-byte sequence.
    // Returns the number of continuation bytes announced by the lead byte
    // (1..5) together with the payload bits of the lead byte. A byte that
    // matches none of the lead patterns yields a count of 6, which decode() rejects.
    static std::pair<std::size_t, UnicodeChar> getOctetCount(unsigned char octet)
    {
        std::size_t octets;
        unsigned char mark = 0xC0;
        unsigned char mask = 0xE0;
        for (octets = 1; octets <= 5; ++octets)
        {
            if ((octet & mask) == mark)
                break;
            // Extend both patterns by one leading 1 bit: 110xxxxx -> 1110xxxx -> ...
            mark = (mark >> 1) | 0x80;
            mask = (mask >> 1) | 0x80;
        }
        return std::make_pair(octets, octet & ~mask);
    }

private:
    // Appends the 1- to 4-byte UTF-8 encoding of character to out.
    // Values above 0xffff always use the 4-byte form.
    static void appendUtf8(std::string& out, UnicodeChar character)
    {
        if (character <= 0x7f)
        {
            out.push_back(static_cast<char>(character));
        }
        else if (character <= 0x7ff)
        {
            out.push_back(static_cast<char>(0xc0 | ((character >> 6) & 0x1f)));
            out.push_back(static_cast<char>(0x80 | (character & 0x3f)));
        }
        else if (character <= 0xffff)
        {
            out.push_back(static_cast<char>(0xe0 | ((character >> 12) & 0x0f)));
            out.push_back(static_cast<char>(0x80 | ((character >> 6) & 0x3f)));
            out.push_back(static_cast<char>(0x80 | (character & 0x3f)));
        }
        else
        {
            out.push_back(static_cast<char>(0xf0 | ((character >> 18) & 0x07)));
            out.push_back(static_cast<char>(0x80 | ((character >> 12) & 0x3f)));
            out.push_back(static_cast<char>(0x80 | ((character >> 6) & 0x3f)));
            out.push_back(static_cast<char>(0x80 | (character & 0x3f)));
        }
    }

    // Decodes the code point at nxt into val and advances nxt past it.
    void next() { std::tie(val, nxt) = decode(nxt, end); }

    Point cur; // first byte not consumed yet
    Point nxt; // equals cur until a code point has been decoded, then points past it
    Point end; // one past the last byte of the range
    UnicodeChar val; // most recently decoded code point
};
#endif
|