1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363
|
/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// utf8_codecvt_facet.cpp
// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
// Distributed under the Boost Software License, Version 1.0. (See accompany-
// ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// See http://www.boost.org/libs/iostreams for documentation.
//#include <cstdlib> // for multi-byte converson routines
// Jonathan Turkanis:
// - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for
// BOOST_IOSTREAMS_NO_WIDE_STREAMS;
// - Derived from codecvt_helper instead of codecvt.
#include <boost/config.hpp>
#include <boost/iostreams/detail/config/wide_streams.hpp>
#include <boost/numeric/conversion/cast.hpp>
#ifdef BOOST_IOSTREAMS_NO_LOCALES
# error "C++ locales not supported on this platform"
#else
#include <cassert>
#include <cstddef>
#include <boost/detail/workaround.hpp>
#include "./utf8_codecvt_facet.hpp"
#if BOOST_WORKAROUND(BOOST_BORLANDC, <= 0x600)
# pragma warn -sig // Conversion may lose significant digits
# pragma warn -rng // Constant is out of range in comparison
#endif
/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// implementation for wchar_t
// Translate incoming UTF-8 into UCS-4
std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in(
std::mbstate_t&,
const char * from,
const char * from_end,
const char * & from_next,
wchar_t * to,
wchar_t * to_end,
wchar_t * & to_next
) const {
// Basic algorithm: The first octet determines how many
// octets total make up the UCS-4 character. The remaining
// "continuing octets" all begin with "10". To convert, subtract
// the amount that specifies the number of octets from the first
// octet. Subtract 0x80 (1000 0000) from each continuing octet,
// then mash the whole lot together. Note that each continuing
// octet only uses 6 bits as unique values, so only shift by
// multiples of 6 to combine.
while (from != from_end && to != to_end) {
// Error checking on the first octet
if (invalid_leading_octet(*from)){
from_next = from;
to_next = to;
return std::codecvt_base::error;
}
// The first octet is adjusted by a value dependent upon
// the number of "continuing octets" encoding the character
const int cont_octet_count = get_cont_octet_count(*from);
const wchar_t octet1_modifier_table[] = {
0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
};
// The unsigned char conversion is necessary in case char is
// signed (I learned this the hard way)
wchar_t ucs_result =
(unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
// Invariants :
// 1) At the start of the loop, 'i' continuing characters have been
// processed
// 2) *from points to the next continuing character to be processed.
int i = 0;
while(i != cont_octet_count && from != from_end) {
// Error checking on continuing characters
if (invalid_continuing_octet(*from)) {
from_next = from;
to_next = to;
return std::codecvt_base::error;
}
ucs_result *= (1 << 6);
// each continuing character has an extra (10xxxxxx)b attached to
// it that must be removed.
ucs_result += (unsigned char)(*from++) - 0x80;
++i;
}
// If the buffer ends with an incomplete unicode character...
if (from == from_end && i != cont_octet_count) {
// rewind "from" to before the current character translation
from_next = from - (i+1);
to_next = to;
return std::codecvt_base::partial;
}
*to++ = ucs_result;
}
from_next = from;
to_next = to;
// Were we done converting or did we run out of destination space?
if(from == from_end) return std::codecvt_base::ok;
else return std::codecvt_base::partial;
}
std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out(
std::mbstate_t &,
const wchar_t * from,
const wchar_t * from_end,
const wchar_t * & from_next,
char * to,
char * to_end,
char * & to_next
) const
{
// RG - consider merging this table with the other one
const wchar_t octet1_modifier_table[] = {
0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
};
while (from != from_end && to != to_end) {
#define BOOST_NULL // Prevent macro expansion
// Check for invalid UCS-4 character
if (*from > std::numeric_limits<wchar_t>::max BOOST_NULL ()) {
from_next = from;
to_next = to;
return std::codecvt_base::error;
}
#undef BOOST_NULL
int cont_octet_count = get_cont_octet_out_count(*from);
// RG - comment this formula better
int shift_exponent = (cont_octet_count) * 6;
// Process the first character
*to++ = octet1_modifier_table[cont_octet_count] +
(unsigned char)(*from / (1 << shift_exponent));
// Process the continuation characters
// Invariants: At the start of the loop:
// 1) 'i' continuing octets have been generated
// 2) '*to' points to the next location to place an octet
// 3) shift_exponent is 6 more than needed for the next octet
int i = 0;
while (i != cont_octet_count && to != to_end) {
shift_exponent -= 6;
*to++ = 0x80 + ((*from / (1 << shift_exponent)) % (1 << 6));
++i;
}
// If we filled up the out buffer before encoding the character
if(to == to_end && i != cont_octet_count) {
from_next = from;
to_next = to - (i+1);
return std::codecvt_base::partial;
}
++from;
}
from_next = from;
to_next = to;
// Were we done or did we run out of destination space
if(from == from_end) return std::codecvt_base::ok;
else return std::codecvt_base::partial;
}
// How many char objects can I process to get <= max_limit
// wchar_t objects?
int utf8_codecvt_facet_wchar_t::do_length(
BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
const char * from,
const char * from_end,
std::size_t max_limit
) const throw()
{
const char * from_next = from;
for (std::size_t char_count = 0u; char_count < max_limit && from_next < from_end; ++char_count) {
unsigned int octet_count = get_octet_count(*from_next);
// The buffer may represent incomplete characters, so terminate early if one is found
if (octet_count > static_cast<std::size_t>(from_end - from_next))
break;
from_next += octet_count;
}
return static_cast<int>(from_next - from);
}
unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
unsigned char lead_octet
){
// if the 0-bit (MSB) is 0, then 1 character
if (lead_octet <= 0x7f) return 1;
// Otherwise the count number of consecutive 1 bits starting at MSB
assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
else return 6;
}
namespace {
template<std::size_t s>
int get_cont_octet_out_count_impl(wchar_t word){
if (word < 0x80) {
return 0;
}
if (word < 0x800) {
return 1;
}
return 2;
}
// note the following code will generate on some platforms where
// wchar_t is defined as UCS2. The warnings are superfluous as
// the specialization is never instantitiated with such compilers.
template<>
int get_cont_octet_out_count_impl<4>(wchar_t word)
{
if (word < 0x80) {
return 0;
}
if (word < 0x800) {
return 1;
}
if (word < 0x10000) {
return 2;
}
if (word < 0x200000) {
return 3;
}
if (word < 0x4000000) {
return 4;
}
return 5;
}
} // namespace anonymous
// How many "continuing octets" will be needed for this word
// == total octets - 1.
int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
wchar_t word
) const {
return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
}
#if 0 // not used?
/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// implementation for char
std::codecvt_base::result utf8_codecvt_facet_char::do_in(
std::mbstate_t & state,
const char * from,
const char * from_end,
const char * & from_next,
char * to,
char * to_end,
char * & to_next
) const
{
while(from_next < from_end){
wchar_t w;
wchar_t *wnext = & w;
utf8_codecvt_facet_wchar_t::result ucs4_result;
ucs4_result = base_class::do_in(
state,
from, from_end, from_next,
wnext, wnext + 1, wnext
);
if(codecvt_base::ok != ucs4_result)
return ucs4_result;
// if the conversion succeeds.
int length = std::wctomb(to_next, w);
assert(-1 != length);
to_next += length;
}
return codecvt_base::ok;
}
std::codecvt_base::result utf8_codecvt_facet_char::do_out(
mbstate_t & state,
const char * from,
const char * from_end,
const char * & from_next,
char * to,
char * to_end,
char * & to_next
) const
{
while(from_next < from_end){
wchar_t w;
int result = std::mbtowc(&w, from_next, MB_LENGTH_MAX);
assert(-1 != result);
from_next += result;
utf8_codecvt_facet_wchar_t::result ucs4_result;
const wchar_t *wptr = & w;
ucs4_result = base_class::do_out(
state,
wptr, wptr+1, wptr,
to_next, to_end, to_next
);
if(codecvt_base::ok != ucs4_result)
return ucs4_result;
}
return codecvt_base::ok;
}
// How many bytes objects can I process to get <= max_limit
// char objects?
int utf8_codecvt_facet_char::do_length(
// it seems that the standard doesn't use const so these librarires
// would be in error
BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
utf8_codecvt_facet_wchar_t::mbstate_t & initial_state,
const char * from_next,
const char * from_end,
std::size_t max_limit
) const
{
int total_length = 0;
const char *from = from_next;
mbstate_t state = initial_state;
while(from_next < from_end){
wchar_t w;
wchar_t *wnext = & w;
utf8_codecvt_facet_wchar_t::result ucs4_result;
ucs4_result = base_class::do_in(
state,
from_next, from_end, from_next,
wnext, wnext + 1, wnext
);
if(codecvt_base::ok != ucs4_result)
break;
char carray[MB_LENGTH_MAX];
std::size_t count = wctomb(carray, w);
if(count > max_limit)
break;
max_limit -= count;
total_length = from_next - from;
}
return total_length;
}
#endif
#endif //BOOST_IOSTREAMS_NO_WIDE_STREAMS
|