1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
|
// Copyright (c) 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/strings/utf_string_conversions.h"
#include <limits.h>
#include <stdint.h>
#include <type_traits>
#include "base/strings/string_piece.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversion_utils.h"
#include <unicode/utf.h>
#include "build/build_config.h"
namespace base {
namespace {
// Code point substituted into the output for any input sequence that fails
// validation (U+FFFD, the Unicode replacement character).
constexpr int32_t kErrorCodePoint = 0xFFFD;
// Size coefficient ----------------------------------------------------------
// Upper bound on the number of destination code units that a single source
// code unit can expand to during a conversion.
template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  // When widening, one source code unit never needs more than one destination
  // code unit; ASCII symbols, for example, take a single code unit in every
  // encoding.
  static constexpr int value = 1;

  // The primary template only covers widening conversions. Narrowing ones
  // must be provided as explicit specializations.
  static_assert(sizeof(SrcChar) < sizeof(DestChar),
                "Default case: from a smaller encoding to the bigger one");
};
// Narrowing case UTF-16 -> UTF-8. It needs its own specialization because the
// primary template's static_assert only admits widening conversions.
template <>
struct SizeCoefficient<char16, char> {
// One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8.
static constexpr int value = 3;
};
#if defined(WCHAR_T_IS_UTF32)
// Narrowing cases out of a UTF-32 wchar_t. These are only compiled when
// WCHAR_T_IS_UTF32 is defined; here each source code unit is a whole
// character, so the bound is the longest encoding of one character.
template <>
struct SizeCoefficient<wchar_t, char> {
// UTF-8 uses at most 4 codeunits per character.
static constexpr int value = 4;
};
template <>
struct SizeCoefficient<wchar_t, char16> {
// UTF-16 uses at most 2 codeunits per character.
static constexpr int value = 2;
};
#endif // defined(WCHAR_T_IS_UTF32)
template <typename SrcChar, typename DestChar>
constexpr int size_coefficient_v =
SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
// UnicodeAppendUnsafe --------------------------------------------------------
// Function overloads that write code_point to the output string. Output string
// has to have enough space for the codepoint.

// SFINAE helper used to pick an overload by character width. It names `bool`
// when |Char| is an integral type (bool, char, int or their extended
// versions) that is exactly |N| bits wide; otherwise substitution fails and
// the overload drops out of consideration.
template <typename Char, size_t N>
using EnableIfBitsAre =
    typename std::enable_if<(std::is_integral<Char>::value &&
                             sizeof(Char) * CHAR_BIT == N),
                            bool>::type;
// 8-bit destination: appends |code_point| to |out| through ICU's
// U8_APPEND_UNSAFE, which writes at index |*size| and advances |*size| past
// what it wrote. No bounds checking is performed; the caller must have
// reserved enough room.
template <typename Char, EnableIfBitsAre<Char, 8> = true>
void UnicodeAppendUnsafe(Char* out, int32_t* size, uint32_t code_point) {
  int32_t& offset = *size;
  U8_APPEND_UNSAFE(out, offset, code_point);
}
// 16-bit destination: appends |code_point| to |out| through ICU's
// U16_APPEND_UNSAFE, which writes at index |*size| and advances |*size| past
// what it wrote. No bounds checking is performed; the caller must have
// reserved enough room.
template <typename Char, EnableIfBitsAre<Char, 16> = true>
void UnicodeAppendUnsafe(Char* out, int32_t* size, uint32_t code_point) {
  int32_t& offset = *size;
  U16_APPEND_UNSAFE(out, offset, code_point);
}
// 32-bit destination: the code point is stored as a single element at index
// |*size|, and |*size| is then incremented by one. No bounds checking is
// performed; the caller must have reserved enough room.
template <typename Char, EnableIfBitsAre<Char, 32> = true>
void UnicodeAppendUnsafe(Char* out, int32_t* size, uint32_t code_point) {
  const int32_t index = *size;
  out[index] = code_point;
  *size = index + 1;
}
// DoUTFConversion ------------------------------------------------------------
// Main driver of UTFConversion specialized for different Src encodings.
// dest has to have enough room for the converted text.

// UTF-8 source. Decodes |src_len| bytes from |src| one code point at a time
// and appends each to |dest|, advancing |*dest_len|. Anything that does not
// decode to a valid code point is written as kErrorCodePoint. Returns false
// if at least one such substitution took place, true otherwise.
template <typename DestChar>
bool DoUTFConversion(const char* src,
                     int32_t src_len,
                     DestChar* dest,
                     int32_t* dest_len) {
  bool all_valid = true;

  int32_t offset = 0;
  while (offset < src_len) {
    int32_t code_point;
    // U8_NEXT reads the next sequence and moves |offset| past it.
    U8_NEXT(src, offset, src_len, code_point);

    const bool valid = IsValidCodepoint(code_point);
    if (!valid)
      all_valid = false;

    UnicodeAppendUnsafe(dest, dest_len, valid ? code_point : kErrorCodePoint);
  }

  return all_valid;
}
template <typename DestChar>
bool DoUTFConversion(const char16* src,
int32_t src_len,
DestChar* dest,
int32_t* dest_len) {
bool success = true;
auto ConvertSingleChar = [&success](char16 in) -> int32_t {
if (!U16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
success = false;
return kErrorCodePoint;
}
return in;
};
int32_t i = 0;
// Always have another symbol in order to avoid checking boundaries in the
// middle of the surrogate pair.
while (i < src_len - 1) {
int32_t code_point;
if (U16_IS_LEAD(src[i]) && U16_IS_TRAIL(src[i + 1])) {
code_point = U16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
if (!IsValidCodepoint(code_point)) {
code_point = kErrorCodePoint;
success = false;
}
i += 2;
} else {
code_point = ConvertSingleChar(src[i]);
++i;
}
UnicodeAppendUnsafe(dest, dest_len, code_point);
}
if (i < src_len)
UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
return success;
}
#if defined(WCHAR_T_IS_UTF32)
// UTF-32 (wchar_t) source. Each of the |src_len| elements of |src| is taken
// as one code point and appended to |dest|, advancing |*dest_len|. Elements
// that are not valid code points are written as kErrorCodePoint. Returns
// false if any substitution took place, true otherwise.
template <typename DestChar>
bool DoUTFConversion(const wchar_t* src,
                     int32_t src_len,
                     DestChar* dest,
                     int32_t* dest_len) {
  bool all_valid = true;

  int32_t index = 0;
  while (index < src_len) {
    const int32_t candidate = src[index];
    ++index;

    const bool valid = IsValidCodepoint(candidate);
    if (!valid)
      all_valid = false;

    UnicodeAppendUnsafe(dest, dest_len, valid ? candidate : kErrorCodePoint);
  }

  return all_valid;
}
#endif  // defined(WCHAR_T_IS_UTF32)
// UTFConversion --------------------------------------------------------------
// Function template for generating all UTF conversions.
//
// Converts |src_str| to the encoding implied by |DestString|'s character type
// and stores the result in |*dest_str|, replacing its previous contents.
// Returns true if the whole input was valid. Invalid sequences are replaced
// with kErrorCodePoint in the output and make the function return false.
//
// DoUTFConversion() and the ICU macros it relies on index with int32_t. An
// input whose worst-case output length does not fit in an int32_t therefore
// cannot be converted: for such input |*dest_str| is cleared and false is
// returned, rather than letting the length be truncated or the output index
// overflow.
template <typename InputString, typename DestString>
bool UTFConversion(const InputString& src_str, DestString* dest_str) {
  // ASCII is one code unit in every encoding, so a plain copy is enough.
  if (IsStringASCII(src_str)) {
    dest_str->assign(src_str.begin(), src_str.end());
    return true;
  }

  constexpr size_t kCoefficient = static_cast<size_t>(
      size_coefficient_v<typename InputString::value_type,
                         typename DestString::value_type>);
  // Largest source length whose worst-case output still fits in an int32_t.
  constexpr size_t kMaxSrcLength =
      static_cast<size_t>(INT32_MAX) / kCoefficient;

  const size_t src_len = src_str.length();
  if (src_len > kMaxSrcLength) {
    dest_str->clear();
    return false;
  }

  // Reserve the worst case up front; the buffer is trimmed after conversion.
  dest_str->resize(src_len * kCoefficient);

  // Empty string is ASCII => it is OK to call operator[].
  auto* dest = &(*dest_str)[0];

  // ICU requires 32 bit numbers. The guard above makes this cast lossless.
  const int32_t src_len32 = static_cast<int32_t>(src_len);
  int32_t dest_len32 = 0;

  const bool res =
      DoUTFConversion(src_str.data(), src_len32, dest, &dest_len32);

  dest_str->resize(static_cast<size_t>(dest_len32));
  dest_str->shrink_to_fit();

  return res;
}
} // namespace
// UTF16 <-> UTF8 --------------------------------------------------------------
bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
return UTFConversion(StringPiece(src, src_len), output);
}
// Returns |utf8| converted to UTF-16.
string16 UTF8ToUTF16(StringPiece utf8) {
  string16 utf16;
  // The success flag is deliberately dropped: for invalid input the
  // conversion still does the best it can, and that output is what callers of
  // this overload want.
  UTF8ToUTF16(utf8.data(), utf8.size(), &utf16);
  return utf16;
}
bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
return UTFConversion(StringPiece16(src, src_len), output);
}
// Returns |utf16| converted to UTF-8.
std::string UTF16ToUTF8(StringPiece16 utf16) {
  std::string utf8;
  // The success flag is deliberately dropped: for invalid input the
  // conversion still does the best it can, and that output is what callers of
  // this overload want.
  UTF16ToUTF8(utf16.data(), utf16.length(), &utf8);
  return utf8;
}
// UTF-16 <-> Wide -------------------------------------------------------------
#if defined(WCHAR_T_IS_UTF16)
// When wide == UTF-16 the conversions are a NOP.
// wchar_t already holds UTF-16 in this configuration, so the |src_len| code
// units at |src| are copied to |*output| unchanged. Always returns true.
bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
  const wchar_t* const src_end = src + src_len;
  output->assign(src, src_end);
  return true;
}
// Returns a string16 holding a copy of the code units of |wide|.
string16 WideToUTF16(WStringPiece wide) {
  string16 utf16(wide.begin(), wide.end());
  return utf16;
}
bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
output->assign(src, src + src_len);
return true;
}
// Returns a std::wstring holding a copy of the code units of |utf16|.
std::wstring UTF16ToWide(StringPiece16 utf16) {
  std::wstring wide(utf16.begin(), utf16.end());
  return wide;
}
#elif defined(WCHAR_T_IS_UTF32)
bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
return UTFConversion(base::WStringPiece(src, src_len), output);
}
// Returns |wide| converted to UTF-16.
string16 WideToUTF16(WStringPiece wide) {
  string16 utf16;
  // The success flag is deliberately dropped: for invalid input the
  // conversion still does the best it can, and that output is what callers of
  // this overload want.
  WideToUTF16(wide.data(), wide.length(), &utf16);
  return utf16;
}
bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
return UTFConversion(StringPiece16(src, src_len), output);
}
// Returns |utf16| converted to a wide string.
std::wstring UTF16ToWide(StringPiece16 utf16) {
  std::wstring wide;
  // The success flag is deliberately dropped: for invalid input the
  // conversion still does the best it can, and that output is what callers of
  // this overload want.
  UTF16ToWide(utf16.data(), utf16.length(), &wide);
  return wide;
}
#endif // defined(WCHAR_T_IS_UTF32)
// UTF-8 <-> Wide --------------------------------------------------------------
// UTF8ToWide is the same code, regardless of whether wide is 16 or 32 bits
bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
return UTFConversion(StringPiece(src, src_len), output);
}
// Returns |utf8| converted to a wide string.
std::wstring UTF8ToWide(StringPiece utf8) {
  std::wstring wide;
  // The success flag is deliberately dropped: for invalid input the
  // conversion still does the best it can, and that output is what callers of
  // this overload want.
  UTF8ToWide(utf8.data(), utf8.length(), &wide);
  return wide;
}
#if defined(WCHAR_T_IS_UTF16)
// Easy case since we can use the "utf" versions we already wrote above.
// wchar_t holds UTF-16 in this configuration, so |src| is viewed as UTF-16
// through as_u16cstr() and handed to UTF16ToUTF8(). Forwards that call's
// return value.
bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
  const auto* utf16 = as_u16cstr(src);
  return UTF16ToUTF8(utf16, src_len, output);
}
// Returns |wide| converted to UTF-8 by viewing its contents as UTF-16
// through as_u16cstr() and delegating to UTF16ToUTF8().
std::string WideToUTF8(WStringPiece wide) {
  const StringPiece16 utf16(as_u16cstr(wide), wide.size());
  return UTF16ToUTF8(utf16);
}
#elif defined(WCHAR_T_IS_UTF32)
bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
return UTFConversion(WStringPiece(src, src_len), output);
}
// Returns |wide| converted to UTF-8.
std::string WideToUTF8(WStringPiece wide) {
  std::string utf8;
  // The success flag is deliberately dropped: for invalid input the
  // conversion still does the best it can, and that output is what callers of
  // this overload want.
  WideToUTF8(wide.data(), wide.length(), &utf8);
  return utf8;
}
#endif // defined(WCHAR_T_IS_UTF32)
// Returns a string16 built by copying |ascii| element by element. The input
// is expected to be pure ASCII, which is DCHECKed (the offending string is
// streamed into the failure message).
string16 ASCIIToUTF16(StringPiece ascii) {
  DCHECK(IsStringASCII(ascii)) << ascii;
  string16 utf16(ascii.begin(), ascii.end());
  return utf16;
}
// Returns a std::string built by copying |utf16| element by element. The
// input is expected to be pure ASCII, which is DCHECKed (the UTF-8 form of
// the offending string is streamed into the failure message).
std::string UTF16ToASCII(StringPiece16 utf16) {
  DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
  std::string ascii(utf16.begin(), utf16.end());
  return ascii;
}
} // namespace base
|