1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363
|
// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/strings/utf_string_conversions.h"
#include <limits.h>
#include <stdint.h>
#include <ostream>
#include <type_traits>
#include "base/strings/string_piece.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_ostream_operators.h"
#include "base/strings/utf_string_conversion_utils.h"
#include "base/third_party/icu/icu_utf.h"
#include "build/build_config.h"
namespace base {
namespace {
// Code point written to the output in place of any input that fails
// validation during conversion (0xFFFD is U+FFFD REPLACEMENT CHARACTER).
constexpr base_icu::UChar32 kErrorCodePoint = 0xFFFD;
// Size coefficient ----------------------------------------------------------
// Upper bound on the number of destination code units that one source code
// unit can turn into. The conversion driver multiplies the source length by
// this value to size the output buffer before converting.
//
// The primary template only covers conversions from a narrower code unit to a
// wider one (enforced by the static_assert below); every other direction must
// provide an explicit specialization.
template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  // ASCII symbols are encoded by one code unit in all encodings.
  static constexpr int value{1};

  static_assert(sizeof(SrcChar) < sizeof(DestChar),
                "Default case: from a smaller encoding to the bigger one");
};
// UTF-16 -> UTF-8: one UTF-16 code unit corresponds to at most 3 code units
// in UTF-8.
template <>
struct SizeCoefficient<char16_t, char> {
  static constexpr int value{3};
};
#if defined(WCHAR_T_IS_UTF32)
// 32-bit wide -> UTF-8: UTF-8 uses at most 4 code units per character.
template <>
struct SizeCoefficient<wchar_t, char> {
  static constexpr int value{4};
};
// 32-bit wide -> UTF-16: UTF-16 uses at most 2 code units per character.
template <>
struct SizeCoefficient<wchar_t, char16_t> {
  static constexpr int value{2};
};
#endif // defined(WCHAR_T_IS_UTF32)
// Shorthand for SizeCoefficient<...>::value. Both character types are passed
// through std::decay_t first, so cv-qualified or reference types select the
// same coefficient as the plain character type.
template <typename SrcChar, typename DestChar>
constexpr int size_coefficient_v =
SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
// UnicodeAppendUnsafe --------------------------------------------------------
// Function overloads that write code_point to the output string. Output string
// has to have enough space for the codepoint.
// SFINAE helper: names the type |bool| when |Char| is an integral type that
// is exactly |N| bits wide, and is ill-formed otherwise. It is used as the
// type of a defaulted non-type template parameter to select an overload by
// code unit width.
template <typename Char, size_t N>
using EnableIfBitsAre =
    std::enable_if_t<std::is_integral_v<Char> && (sizeof(Char) * CHAR_BIT == N),
                     bool>;
// 8-bit code units: writes |code_point| into |out| starting at index |*size|
// through ICU's CBU8_APPEND_UNSAFE, which also moves |*size| past what it
// wrote. No bounds checking is done here.
template <typename Char, EnableIfBitsAre<Char, 8> = true>
void UnicodeAppendUnsafe(Char* out,
                         size_t* size,
                         base_icu::UChar32 code_point) {
  // The macro is handed uint8_t storage, so view the buffer as bytes.
  auto* const bytes = reinterpret_cast<uint8_t*>(out);
  size_t& length = *size;
  CBU8_APPEND_UNSAFE(bytes, length, code_point);
}
// 16-bit code units: writes |code_point| into |out| starting at index |*size|
// through ICU's CBU16_APPEND_UNSAFE, which also moves |*size| past what it
// wrote. No bounds checking is done here.
template <typename Char, EnableIfBitsAre<Char, 16> = true>
void UnicodeAppendUnsafe(Char* out,
                         size_t* size,
                         base_icu::UChar32 code_point) {
  size_t& length = *size;
  CBU16_APPEND_UNSAFE(out, length, code_point);
}
// 32-bit code units: a code point fits in one code unit, so it is stored
// directly at index |*size| and |*size| is incremented by one. No bounds
// checking is done here.
template <typename Char, EnableIfBitsAre<Char, 32> = true>
void UnicodeAppendUnsafe(Char* out,
                         size_t* size,
                         base_icu::UChar32 code_point) {
  const size_t slot = (*size)++;
  out[slot] = static_cast<Char>(code_point);
}
// DoUTFConversion ------------------------------------------------------------
// Main driver of UTFConversion specialized for different Src encodings.
// dest has to have enough room for the converted text.
template <typename DestChar>
bool DoUTFConversion(const char* src,
size_t src_len,
DestChar* dest,
size_t* dest_len) {
bool success = true;
for (size_t i = 0; i < src_len;) {
base_icu::UChar32 code_point;
CBU8_NEXT(reinterpret_cast<const uint8_t*>(src), i, src_len, code_point);
if (!IsValidCodepoint(code_point)) {
success = false;
code_point = kErrorCodePoint;
}
UnicodeAppendUnsafe(dest, dest_len, code_point);
}
return success;
}
template <typename DestChar>
bool DoUTFConversion(const char16_t* src,
size_t src_len,
DestChar* dest,
size_t* dest_len) {
bool success = true;
auto ConvertSingleChar = [&success](char16_t in) -> base_icu::UChar32 {
if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
success = false;
return kErrorCodePoint;
}
return in;
};
size_t i = 0;
// Always have another symbol in order to avoid checking boundaries in the
// middle of the surrogate pair.
while (i + 1 < src_len) {
base_icu::UChar32 code_point;
if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
if (!IsValidCodepoint(code_point)) {
code_point = kErrorCodePoint;
success = false;
}
i += 2;
} else {
code_point = ConvertSingleChar(src[i]);
++i;
}
UnicodeAppendUnsafe(dest, dest_len, code_point);
}
if (i < src_len) {
UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
}
return success;
}
#if defined(WCHAR_T_IS_UTF32)
// 32-bit wide source. Each element of |src| is taken as one code point and
// appended to |dest|, advancing |*dest_len|. Elements rejected by
// IsValidCodepoint() are written as kErrorCodePoint. Returns false if any
// such substitution happened, true otherwise.
template <typename DestChar>
bool DoUTFConversion(const wchar_t* src,
                     size_t src_len,
                     DestChar* dest,
                     size_t* dest_len) {
  bool all_valid = true;

  for (size_t pos = 0; pos != src_len; ++pos) {
    const auto raw = static_cast<base_icu::UChar32>(src[pos]);
    const bool valid = IsValidCodepoint(raw);
    all_valid = all_valid && valid;
    UnicodeAppendUnsafe(dest, dest_len, valid ? raw : kErrorCodePoint);
  }

  return all_valid;
}
#endif // defined(WCHAR_T_IS_UTF32)
// UTFConversion --------------------------------------------------------------
// Function template for generating all UTF conversions.
template <typename InputString, typename DestString>
bool UTFConversion(const InputString& src_str, DestString* dest_str) {
if (IsStringASCII(src_str)) {
dest_str->assign(src_str.begin(), src_str.end());
return true;
}
dest_str->resize(src_str.length() *
size_coefficient_v<typename InputString::value_type,
typename DestString::value_type>);
// Empty string is ASCII => it OK to call operator[].
auto* dest = &(*dest_str)[0];
// ICU requires 32 bit numbers.
size_t src_len = src_str.length();
size_t dest_len = 0;
bool res = DoUTFConversion(src_str.data(), src_len, dest, &dest_len);
dest_str->resize(dest_len);
dest_str->shrink_to_fit();
return res;
}
} // namespace
// UTF16 <-> UTF8 --------------------------------------------------------------
bool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) {
return UTFConversion(StringPiece(src, src_len), output);
}
// Returns |utf8| converted to UTF-16.
std::u16string UTF8ToUTF16(StringPiece utf8) {
  std::u16string utf16;
  // The success flag is deliberately dropped: the conversion does the best it
  // can for invalid input, which is what we want here.
  UTFConversion(utf8, &utf16);
  return utf16;
}
bool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) {
return UTFConversion(StringPiece16(src, src_len), output);
}
// Returns |utf16| converted to UTF-8.
std::string UTF16ToUTF8(StringPiece16 utf16) {
  std::string utf8;
  // The success flag is deliberately dropped: the conversion does the best it
  // can for invalid input, which is what we want here.
  UTFConversion(utf16, &utf8);
  return utf8;
}
// UTF-16 <-> Wide -------------------------------------------------------------
#if defined(WCHAR_T_IS_UTF16)
// When wide == UTF-16 the conversions are a NOP.
// Copies the |src_len| wide characters at |src| into |*output| one code unit
// at a time, replacing its previous contents. No validation is performed, so
// this always returns true.
bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
  const wchar_t* const src_end = src + src_len;
  output->assign(src, src_end);
  return true;
}
// Returns a UTF-16 string built by copying |wide| one code unit at a time.
std::u16string WideToUTF16(WStringPiece wide) {
  std::u16string utf16(wide.begin(), wide.end());
  return utf16;
}
// Copies the |src_len| UTF-16 code units at |src| into |*output| one code
// unit at a time, replacing its previous contents. No validation is
// performed, so this always returns true.
bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
  const char16_t* const src_end = src + src_len;
  output->assign(src, src_end);
  return true;
}
// Returns a wide string built by copying |utf16| one code unit at a time.
std::wstring UTF16ToWide(StringPiece16 utf16) {
  std::wstring wide(utf16.begin(), utf16.end());
  return wide;
}
#elif defined(WCHAR_T_IS_UTF32)
bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
return UTFConversion(base::WStringPiece(src, src_len), output);
}
// Returns |wide| converted to UTF-16.
std::u16string WideToUTF16(WStringPiece wide) {
  std::u16string utf16;
  // The success flag is deliberately dropped: the conversion does the best it
  // can for invalid input, which is what we want here.
  UTFConversion(wide, &utf16);
  return utf16;
}
bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
return UTFConversion(StringPiece16(src, src_len), output);
}
// Returns |utf16| converted to a wide string.
std::wstring UTF16ToWide(StringPiece16 utf16) {
  std::wstring wide;
  // The success flag is deliberately dropped: the conversion does the best it
  // can for invalid input, which is what we want here.
  UTFConversion(utf16, &wide);
  return wide;
}
#endif // defined(WCHAR_T_IS_UTF32)
// UTF-8 <-> Wide --------------------------------------------------------------
// UTF8ToWide is the same code, regardless of whether wide is 16 or 32 bits
bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
return UTFConversion(StringPiece(src, src_len), output);
}
// Returns |utf8| converted to a wide string.
std::wstring UTF8ToWide(StringPiece utf8) {
  std::wstring wide;
  // The success flag is deliberately dropped: the conversion does the best it
  // can for invalid input, which is what we want here.
  UTFConversion(utf8, &wide);
  return wide;
}
#if defined(WCHAR_T_IS_UTF16)
// Easy case since we can use the "utf" versions we already wrote above.
// Converts the |src_len| wide characters at |src| to UTF-8 by viewing them
// as UTF-16 (via as_u16cstr()) and forwarding to UTF16ToUTF8(), whose result
// is returned.
bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
  const char16_t* const utf16 = as_u16cstr(src);
  return UTF16ToUTF8(utf16, src_len, output);
}
// Returns |wide| converted to UTF-8 by viewing it as UTF-16 (via
// as_u16cstr()) and forwarding to the StringPiece16 overload of
// UTF16ToUTF8().
std::string WideToUTF8(WStringPiece wide) {
  const StringPiece16 utf16(as_u16cstr(wide), wide.size());
  return UTF16ToUTF8(utf16);
}
#elif defined(WCHAR_T_IS_UTF32)
bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
return UTFConversion(WStringPiece(src, src_len), output);
}
// Returns |wide| converted to UTF-8.
std::string WideToUTF8(WStringPiece wide) {
  std::string utf8;
  // The success flag is deliberately dropped: the conversion does the best it
  // can for invalid input, which is what we want here.
  UTFConversion(wide, &utf8);
  return utf8;
}
#endif // defined(WCHAR_T_IS_UTF32)
// Returns |ascii| with every character copied into a char16_t. The input is
// DCHECKed to be ASCII; on failure the offending string is streamed into the
// check message.
std::u16string ASCIIToUTF16(StringPiece ascii) {
  DCHECK(IsStringASCII(ascii)) << ascii;
  std::u16string utf16(ascii.begin(), ascii.end());
  return utf16;
}
// Returns |utf16| with every code unit copied into a char. The input is
// DCHECKed to be ASCII; on failure its UTF-8 form is streamed into the check
// message.
std::string UTF16ToASCII(StringPiece16 utf16) {
  DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
  std::string ascii(utf16.begin(), utf16.end());
  return ascii;
}
#if defined(WCHAR_T_IS_UTF16)
// Returns |ascii| with every character copied into a wchar_t. The input is
// DCHECKed to be ASCII; on failure the offending string is streamed into the
// check message.
std::wstring ASCIIToWide(StringPiece ascii) {
  DCHECK(IsStringASCII(ascii)) << ascii;
  std::wstring wide(ascii.begin(), ascii.end());
  return wide;
}
// Returns |wide| with every wide character copied into a char. The input is
// DCHECKed to be ASCII; on failure the offending string is streamed into the
// check message.
std::string WideToASCII(WStringPiece wide) {
  DCHECK(IsStringASCII(wide)) << wide;
  std::string ascii(wide.begin(), wide.end());
  return ascii;
}
#endif // defined(WCHAR_T_IS_UTF16)
} // namespace base
|