1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
|
#ifndef BOOST_UTF8_CODECVT_FACET_HPP
#define BOOST_UTF8_CODECVT_FACET_HPP
#include <boost/iostreams/detail/config/wide_streams.hpp>
#ifdef BOOST_IOSTREAMS_NO_WIDE_STREAMS
# error wide streams not supported on this platform
#endif
// MS compatible compilers support #pragma once
#if defined(_MSC_VER)
# pragma once
#endif
/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// utf8_codecvt_facet.hpp
// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
// Copyright (c) 2001 Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
// Distributed under the Boost Software License, Version 1.0. (See accompany-
// ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// Note:(Robert Ramey). I have made the following alterations in the original
// code.
// a) Rendered utf8_codecvt<wchar_t, char> with using templates
// b) Move longer functions outside class definition to prevent inlining
// and make code smaller
// c) added on a derived class to permit translation to/from current
// locale to utf8
// See http://www.boost.org for updates, documentation, and revision history.
// archives stored as text - note these ar templated on the basic
// stream templates to accommodate wide (and other?) kind of characters
//
// note the fact that on libraries without wide characters, ostream is
// is not a specialization of basic_ostream which in fact is not defined
// in such cases. So we can't use basic_ostream<OStream::char_type> but rather
// use two template parameters
//
// utf8_codecvt_facet
// This is an implementation of a std::codecvt facet for translating
// from UTF-8 externally to UCS-4. Note that this is not tied to
// any specific types in order to allow customization on platforms
// where wchar_t is not big enough.
//
// NOTES: The current implementation jumps through some unpleasant hoops in
// order to deal with signed character types. As a std::codecvt_base::result,
// it is necessary for the ExternType to be convertible to unsigned char.
// I chose not to tie the extern_type explicitly to char. But if any combination
// of types other than <wchar_t,char_t> is used, then std::codecvt must be
// specialized on those types for this to work.
#include <locale>
#include <cstddef> // size_t
#include <cwchar> // mbstate_t
#include <boost/integer_traits.hpp>
#include <boost/iostreams/detail/config/wide_streams.hpp>
#include <boost/iostreams/detail/codecvt_helper.hpp>
// maximum lenght of a multibyte string
#define MB_LENGTH_MAX 8
struct utf8_codecvt_facet_wchar_t
: public boost::iostreams::detail::codecvt_helper<wchar_t, char, std::mbstate_t>
{
public:
explicit utf8_codecvt_facet_wchar_t(std::size_t no_locale_manage = 0)
: boost::iostreams::detail::codecvt_helper<wchar_t, char, std::mbstate_t>
(no_locale_manage)
{ }
protected:
virtual std::codecvt_base::result do_in(
std::mbstate_t& state,
const char * from,
const char * from_end,
const char * & from_next,
wchar_t * to,
wchar_t * to_end,
wchar_t*& to_next
) const;
virtual std::codecvt_base::result do_out(
std::mbstate_t & state, const wchar_t * from,
const wchar_t * from_end, const wchar_t* & from_next,
char * to, char * to_end, char * & to_next
) const;
bool invalid_continuing_octet(unsigned char octet_1) const {
return (octet_1 < 0x80|| 0xbf< octet_1);
}
bool invalid_leading_octet(unsigned char octet_1) const {
return (0x7f < octet_1 && octet_1 < 0xc0) ||
(octet_1 > 0xfd);
}
// continuing octets = octets except for the leading octet
static unsigned int get_cont_octet_count(unsigned char lead_octet) {
return get_octet_count(lead_octet) - 1;
}
static unsigned int get_octet_count(unsigned char lead_octet);
// How many "continuing octets" will be needed for this word
// == total octets - 1.
int get_cont_octet_out_count(wchar_t word) const ;
virtual bool do_always_noconv() const throw() { return false; }
// UTF-8 isn't really stateful since we rewind on partial conversions
virtual std::codecvt_base::result do_unshift(
std::mbstate_t&,
char * from,
char * /* to */,
char * & next
) const{
next = from;
return ok;
}
virtual int do_encoding() const throw() {
const int variable_byte_external_encoding=0;
return variable_byte_external_encoding;
}
// How many char objects can I process to get <= max_limit
// wchar_t objects?
virtual int do_length(
BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
const char * from,
const char * from_end,
std::size_t max_limit
) const throw();
// Largest possible value do_length(state,from,from_end,1) could return.
virtual int do_max_length() const throw () {
return 6; // largest UTF-8 encoding of a UCS-4 character
}
};
#if 0 // not used - incorrect in any case
// Robert Ramey - use the above to make a code converter from multi-byte
// char strings to utf8 encoding
struct utf8_codecvt_facet_char : public utf8_codecvt_facet_wchar_t
{
typedef utf8_codecvt_facet_wchar_t base_class;
public:
explicit utf8_codecvt_facet_char(std::size_t no_locale_manage=0)
: base_class(no_locale_manage)
{}
protected:
virtual std::codecvt_base::result do_in(
std::mbstate_t & state,
const char * from,
const char * from_end,
const char * & from_next,
char * to,
char * to_end,
char * & to_next
) const;
virtual std::codecvt_base::result do_out(
std::mbstate_t & state,
const char * from,
const char * from_end,
const char* & from_next,
char * to,
char * to_end,
char * & to_next
) const;
// How many char objects can I process to get <= max_limit
// char objects?
virtual int do_length(
const std::mbstate_t&,
const char * from,
const char * from_end,
std::size_t max_limit
) const;
};
#endif
template<class Internal, class External>
struct utf8_codecvt_facet
{};
template<>
struct utf8_codecvt_facet<wchar_t, char>
: public utf8_codecvt_facet_wchar_t
{};
#if 0
template<>
struct utf8_codecvt_facet<char, char>
: public utf8_codecvt_facet_char
{};
#endif
#endif // BOOST_UTF8_CODECVT_FACET_HPP
|