1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
|
// Copyright Maarten L. Hekkelman, Radboud University 2010-2011.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#ifndef ZEEP_XML_UNICODE_SUPPORT_HPP
#define ZEEP_XML_UNICODE_SUPPORT_HPP
#include <zeep/config.hpp>
#include <boost/cstdint.hpp>
#include <string>
namespace zeep { namespace xml {

/// We use our own unicode type since wchar_t might be too small.
/// This type should be able to contain a UCS4 encoded character.
typedef uint32_t unicode;

/// The supported encodings. Perhaps we should extend this list a bit?
enum encoding_type
{
    enc_UTF8,       ///< UTF-8
    enc_UTF16BE,    ///< UTF-16 Big Endian
    enc_UTF16LE,    ///< UTF-16 Little Endian
//  enc_ISO88591
};

/// Some character classification routines. Only is_char is defined in this
/// header; the others are declared here and defined elsewhere.
bool is_name_start_char(unicode uc);
bool is_name_char(unicode uc);
bool is_char(unicode uc);
bool is_valid_system_literal_char(unicode uc);
bool is_valid_system_literal(const std::string& s);
bool is_valid_public_id_char(unicode uc);
bool is_valid_public_id(const std::string& s);

/// Convert a string from UCS4 to UTF-8
std::string wstring_to_string(const std::wstring& s);

/// Manipulate UTF-8 encoded strings
void append(std::string& s, unicode ch);
unicode pop_last_char(std::string& s);

// inlines

/// \brief Test whether \a uc lies in one of the accepted character ranges.
///
/// The ranges mirror the XML 1.0 `Char` production: tab, line feed,
/// carriage return, 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
/// Surrogates (0xD800-0xDFFF), 0xFFFE and 0xFFFF are therefore rejected.
///
/// \param uc  the code point to classify
/// \return    true if \a uc falls inside one of the ranges above
inline bool is_char(unicode uc)
{
    return
        uc == 0x09 or
        uc == 0x0A or
        uc == 0x0D or
        (uc >= 0x020 and uc <= 0x0D7FF) or
        (uc >= 0x0E000 and uc <= 0x0FFFD) or
        (uc >= 0x010000 and uc <= 0x010FFFF);
}

/// \brief Append the UTF-8 encoding of \a uc to \a s.
///
/// One byte is written for values below 0x80, two below 0x800, three below
/// 0x10000 and four otherwise. No validation is performed: surrogate values
/// are encoded as three bytes, and values above 0x10FFFF are not rejected
/// and yield bytes that are not valid UTF-8.
///
/// \param s   the string to extend
/// \param uc  the code point to encode
inline void append(std::string& s, unicode uc)
{
    if (uc < 0x080)
        s += (static_cast<char>(uc));
    else if (uc < 0x0800)
    {
        // 110xxxxx 10xxxxxx
        char ch[2] = {
            static_cast<char>(0x0c0 | (uc >> 6)),
            static_cast<char>(0x080 | (uc & 0x3f))
        };
        s.append(ch, 2);
    }
    else if (uc < 0x00010000)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        char ch[3] = {
            static_cast<char>(0x0e0 | (uc >> 12)),
            static_cast<char>(0x080 | ((uc >> 6) & 0x3f)),
            static_cast<char>(0x080 | (uc & 0x3f))
        };
        s.append(ch, 3);
    }
    else
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        char ch[4] = {
            static_cast<char>(0x0f0 | (uc >> 18)),
            static_cast<char>(0x080 | ((uc >> 12) & 0x3f)),
            static_cast<char>(0x080 | ((uc >> 6) & 0x3f)),
            static_cast<char>(0x080 | (uc & 0x3f))
        };
        s.append(ch, 4);
    }
}

/// \brief Remove the last UTF-8 encoded character from \a s and return it.
///
/// For well-formed UTF-8 the complete trailing sequence (one to four bytes)
/// is erased and its code point returned.
///
/// Malformed tails never cause out-of-range access: the scan stops at the
/// start of the string and after at most three continuation bytes. When the
/// last byte has its high bit set but is not a continuation byte preceded by
/// at least one more byte, just that byte is erased and 0 is returned.
///
/// \param s  the string to shorten; left untouched when already empty
/// \return   the decoded code point, or 0 when \a s is empty
inline unicode pop_last_char(std::string& s)
{
    unicode result = 0;

    if (not s.empty())
    {
        std::string::iterator ch = s.end() - 1;

        if ((*ch & 0x0080) == 0)
        {
            // plain ASCII, a single byte
            result = *ch;
            s.erase(ch);
        }
        else
        {
            int o = 0;

            // Walk backwards over the continuation bytes (10xxxxxx),
            // collecting six payload bits from each. The test comes before
            // the decrement so the iterator can never move in front of
            // begin(), and the cap of 18 bits (three continuation bytes)
            // keeps the shift count within range for any input.
            while (o < 18 and ch != s.begin() and (*ch & 0x0C0) == 0x080)
            {
                result |= static_cast<unicode>(*ch & 0x03F) << o;
                o += 6;
                --ch;
            }

            // ch now designates the lead byte; how many payload bits it
            // contributes depends on the number of continuation bytes seen.
            switch (o)
            {
                case 6:  result |= static_cast<unicode>(*ch & 0x01F) << 6;  break;
                case 12: result |= static_cast<unicode>(*ch & 0x00F) << 12; break;
                case 18: result |= static_cast<unicode>(*ch & 0x007) << 18; break;
                default: break;     // o == 0: stray byte, nothing to decode
            }

            s.erase(ch, s.end());
        }
    }

    return result;
}

}
}
#endif
|