1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
|
/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// utf8_codecvt_facet.ipp
// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
// Use, modification and distribution is subject to the Boost Software
// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
// Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to
// learn how this file should be used.
#include <boost/detail/utf8_codecvt_facet.hpp>
#include <cstdlib> // for multi-byte converson routines
#include <cassert>
#include <boost/limits.hpp>
#include <boost/config.hpp>
// If we don't have wstring, then Unicode support
// is not available anyway, so we don't need to even
// compiler this file. This also fixes the problem
// with mingw, which can compile this file, but will
// generate link error when building DLL.
#ifndef BOOST_NO_STD_WSTRING
BOOST_UTF8_BEGIN_NAMESPACE
/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// implementation for wchar_t
// Translate incoming UTF-8 into UCS-4
std::codecvt_base::result utf8_codecvt_facet::do_in(
std::mbstate_t& /*state*/,
const char * from,
const char * from_end,
const char * & from_next,
wchar_t * to,
wchar_t * to_end,
wchar_t * & to_next
) const {
// Basic algorithm: The first octet determines how many
// octets total make up the UCS-4 character. The remaining
// "continuing octets" all begin with "10". To convert, subtract
// the amount that specifies the number of octets from the first
// octet. Subtract 0x80 (1000 0000) from each continuing octet,
// then mash the whole lot together. Note that each continuing
// octet only uses 6 bits as unique values, so only shift by
// multiples of 6 to combine.
while (from != from_end && to != to_end) {
// Error checking on the first octet
if (invalid_leading_octet(*from)){
from_next = from;
to_next = to;
return std::codecvt_base::error;
}
// The first octet is adjusted by a value dependent upon
// the number of "continuing octets" encoding the character
const int cont_octet_count = get_cont_octet_count(*from);
const wchar_t octet1_modifier_table[] = {
0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
};
// The unsigned char conversion is necessary in case char is
// signed (I learned this the hard way)
wchar_t ucs_result =
(unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
// Invariants :
// 1) At the start of the loop, 'i' continuing characters have been
// processed
// 2) *from points to the next continuing character to be processed.
int i = 0;
while(i != cont_octet_count && from != from_end) {
// Error checking on continuing characters
if (invalid_continuing_octet(*from)) {
from_next = from;
to_next = to;
return std::codecvt_base::error;
}
ucs_result *= (1 << 6);
// each continuing character has an extra (10xxxxxx)b attached to
// it that must be removed.
ucs_result += (unsigned char)(*from++) - 0x80;
++i;
}
// If the buffer ends with an incomplete unicode character...
if (from == from_end && i != cont_octet_count) {
// rewind "from" to before the current character translation
from_next = from - (i+1);
to_next = to;
return std::codecvt_base::partial;
}
*to++ = ucs_result;
}
from_next = from;
to_next = to;
// Were we done converting or did we run out of destination space?
if(from == from_end) return std::codecvt_base::ok;
else return std::codecvt_base::partial;
}
std::codecvt_base::result utf8_codecvt_facet::do_out(
std::mbstate_t& /*state*/,
const wchar_t * from,
const wchar_t * from_end,
const wchar_t * & from_next,
char * to,
char * to_end,
char * & to_next
) const
{
// RG - consider merging this table with the other one
const wchar_t octet1_modifier_table[] = {
0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
};
wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
while (from != from_end && to != to_end) {
// Check for invalid UCS-4 character
if (*from > max_wchar) {
from_next = from;
to_next = to;
return std::codecvt_base::error;
}
int cont_octet_count = get_cont_octet_out_count(*from);
// RG - comment this formula better
int shift_exponent = (cont_octet_count) * 6;
// Process the first character
*to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
(unsigned char)(*from / (1 << shift_exponent)));
// Process the continuation characters
// Invariants: At the start of the loop:
// 1) 'i' continuing octets have been generated
// 2) '*to' points to the next location to place an octet
// 3) shift_exponent is 6 more than needed for the next octet
int i = 0;
while (i != cont_octet_count && to != to_end) {
shift_exponent -= 6;
*to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
++i;
}
// If we filled up the out buffer before encoding the character
if(to == to_end && i != cont_octet_count) {
from_next = from;
to_next = to - (i+1);
return std::codecvt_base::partial;
}
++from;
}
from_next = from;
to_next = to;
// Were we done or did we run out of destination space
if(from == from_end) return std::codecvt_base::ok;
else return std::codecvt_base::partial;
}
// How many char objects can I process to get <= max_limit
// wchar_t objects?
int utf8_codecvt_facet::do_length(
const std::mbstate_t &,
const char * from,
const char * from_end,
std::size_t max_limit
) const
#if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
throw()
#endif
{
// RG - this code is confusing! I need a better way to express it.
// and test cases.
// Invariants:
// 1) last_octet_count has the size of the last measured character
// 2) char_count holds the number of characters shown to fit
// within the bounds so far (no greater than max_limit)
// 3) from_next points to the octet 'last_octet_count' before the
// last measured character.
int last_octet_count=0;
std::size_t char_count = 0;
const char* from_next = from;
// Use "<" because the buffer may represent incomplete characters
while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
from_next += last_octet_count;
last_octet_count = (get_octet_count(*from_next));
++char_count;
}
return static_cast<int>(from_next-from_end);
}
unsigned int utf8_codecvt_facet::get_octet_count(
unsigned char lead_octet
){
// if the 0-bit (MSB) is 0, then 1 character
if (lead_octet <= 0x7f) return 1;
// Otherwise the count number of consecutive 1 bits starting at MSB
// assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
else return 6;
}
namespace detail {
template<std::size_t s>
int get_cont_octet_out_count_impl(wchar_t word){
if (word < 0x80) {
return 0;
}
if (word < 0x800) {
return 1;
}
return 2;
}
template<>
int get_cont_octet_out_count_impl<4>(wchar_t word){
if (word < 0x80) {
return 0;
}
if (word < 0x800) {
return 1;
}
// Note that the following code will generate warnings on some platforms
// where wchar_t is defined as UCS2. The warnings are superfluous as the
// specialization is never instantitiated with such compilers, but this
// can cause problems if warnings are being treated as errors, so we guard
// against that. Including <boost/detail/utf8_codecvt_facet.hpp> as we do
// should be enough to get WCHAR_MAX defined.
#if !defined(WCHAR_MAX)
# error WCHAR_MAX not defined!
#endif
// cope with VC++ 7.1 or earlier having invalid WCHAR_MAX
#if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier
return 2;
#elif WCHAR_MAX > 0x10000
if (word < 0x10000) {
return 2;
}
if (word < 0x200000) {
return 3;
}
if (word < 0x4000000) {
return 4;
}
return 5;
#else
return 2;
#endif
}
} // namespace detail
// How many "continuing octets" will be needed for this word
// == total octets - 1.
int utf8_codecvt_facet::get_cont_octet_out_count(
wchar_t word
) const {
return detail::get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
}
BOOST_UTF8_END_NAMESPACE
#endif
|