1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
|
// © 2025 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html
// utfstring.h
// created: 2025jul18 Markus W. Scherer
#ifndef __UTFSTRING_H__
#define __UTFSTRING_H__
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
#include "unicode/utf16.h"
/**
* \file
* \brief C++ header-only API: C++ string helper functions.
*/
#ifndef U_HIDE_DRAFT_API
namespace U_HEADER_ONLY_NAMESPACE {
namespace utfstring {
// Write code points to strings -------------------------------------------- ***
#ifndef U_IN_DOXYGEN
namespace prv {
// Shared implementation behind the public append/encode wrappers.
// Appends the code units for c to s and returns s.
// The encoding form is chosen at compile time from sizeof(StringClass::value_type):
// 1 byte selects UTF-8, 2 bytes select UTF-16, anything else is treated as UTF-32.
// With validate=true, a c that is not a scalar value is replaced by U+FFFD before encoding;
// with validate=false, no such check is made.
//
// This function, and the public wrappers,
// want to be U_FORCE_INLINE but the gcc-debug-build-and-test CI check failed with
// error: ‘always_inline’ function might not be inlinable [-Werror=attributes]
template<typename StringClass, bool validate>
inline StringClass &appendCodePoint(StringClass &s, uint32_t c) {
    using Unit = typename StringClass::value_type;
    constexpr auto unitSize = sizeof(Unit);
    if constexpr (unitSize == 1) {
        // UTF-8: Similar to U8_APPEND().
        if (c <= 0x7f) {
            // ASCII: a single byte, no scratch buffer needed.
            s.push_back(static_cast<Unit>(c));
            return s;
        }
        // Multi-byte sequence. The bytes are written right-aligned into the scratch array
        // so that the last trail byte always lands at index 3,
        // and the sequence starts at index 4 - count.
        Unit bytes[4];
        uint8_t count;
        if (c <= 0x7ff) {
            count = 2;
            bytes[2] = (c >> 6) | 0xc0;
        } else {
            if constexpr (validate) {
                // Values up to 0x7ff were handled above, so only
                // surrogates and values above U+10FFFF need to be replaced here.
                if (c >= 0xd800 && (c < 0xe000 || c > 0x10ffff)) {
                    c = 0xfffd;
                }
            }
            if (c <= 0xffff) {
                count = 3;
                bytes[1] = (c >> 12) | 0xe0;
            } else {
                count = 4;
                bytes[0] = (c >> 18) | 0xf0;
                bytes[1] = ((c >> 12) & 0x3f) | 0x80;
            }
            // Second-to-last byte, shared by the three- and four-byte forms.
            bytes[2] = ((c >> 6) & 0x3f) | 0x80;
        }
        // Last trail byte, shared by all multi-byte forms.
        bytes[3] = (c & 0x3f) | 0x80;
        s.append(bytes + 4 - count, count);
    } else if constexpr (unitSize == 2) {
        // UTF-16: Similar to U16_APPEND().
        if constexpr (validate) {
            // Replace surrogates and values above U+10FFFF.
            if (c >= 0xd800 && (c < 0xe000 || c > 0x10ffff)) {
                c = 0xfffd;
            }
        }
        if (c <= 0xffff) {
            // One code unit.
            s.push_back(static_cast<Unit>(c));
        } else {
            // Two code units: lead and trail surrogate.
            Unit pair[2] = { U16_LEAD(c), U16_TRAIL(c) };
            s.append(pair, 2);
        }
    } else {
        // UTF-32
        if constexpr (validate) {
            if (!U_IS_SCALAR_VALUE(c)) {
                c = 0xfffd;
            }
        }
        s.push_back(c);
    }
    return s;
}
} // namespace prv
#endif // U_IN_DOXYGEN
#ifndef U_HIDE_DRAFT_API
/**
 * Appends the code units of a code point to the end of a string.
 * If c is not a scalar value, then U+FFFD (the replacement character) is appended in its place.
 * See https://www.unicode.org/glossary/#unicode_scalar_value
 *
 * @tparam StringClass A version of std::basic_string (or a compatible type)
 * @param s The string to append to
 * @param c The code point to append
 * @return s
 * @draft ICU 78
 * @see U_IS_SCALAR_VALUE
 */
template<typename StringClass>
inline StringClass &appendOrFFFD(StringClass &s, UChar32 c) {
    constexpr bool kValidate = true;
    // The shared helper takes a uint32_t; spell out the conversion from UChar32.
    return prv::appendCodePoint<StringClass, kValidate>(s, static_cast<uint32_t>(c));
}
/**
 * Appends the code units of a code point to the end of a string, without validation.
 * The caller must pass a scalar value; the behavior is undefined for any other input.
 * See https://www.unicode.org/glossary/#unicode_scalar_value
 *
 * @tparam StringClass A version of std::basic_string (or a compatible type)
 * @param s The string to append to
 * @param c The code point to append (must be a scalar value)
 * @return s
 * @draft ICU 78
 * @see U_IS_SCALAR_VALUE
 */
template<typename StringClass>
inline StringClass &appendUnsafe(StringClass &s, UChar32 c) {
    constexpr bool kValidate = false;
    // The shared helper takes a uint32_t; spell out the conversion from UChar32.
    return prv::appendCodePoint<StringClass, kValidate>(s, static_cast<uint32_t>(c));
}
/**
 * Creates a new string that holds the code units of a single code point.
 * If c is not a scalar value, then the string holds U+FFFD (the replacement character) instead.
 * See https://www.unicode.org/glossary/#unicode_scalar_value
 *
 * @tparam StringClass A version of std::basic_string (or a compatible type)
 * @param c The code point
 * @return the string of c's code units
 * @draft ICU 78
 * @see U_IS_SCALAR_VALUE
 */
template<typename StringClass>
inline StringClass encodeOrFFFD(UChar32 c) {
    constexpr bool kValidate = true;
    // Start from a default-constructed string and append into it.
    StringClass result;
    prv::appendCodePoint<StringClass, kValidate>(result, static_cast<uint32_t>(c));
    return result;
}
/**
 * Creates a new string that holds the code units of a single code point, without validation.
 * The caller must pass a scalar value; the behavior is undefined for any other input.
 * See https://www.unicode.org/glossary/#unicode_scalar_value
 *
 * @tparam StringClass A version of std::basic_string (or a compatible type)
 * @param c The code point
 * @return the string of c's code units
 * @draft ICU 78
 * @see U_IS_SCALAR_VALUE
 */
template<typename StringClass>
inline StringClass encodeUnsafe(UChar32 c) {
    constexpr bool kValidate = false;
    // Start from a default-constructed string and append into it.
    StringClass result;
    prv::appendCodePoint<StringClass, kValidate>(result, static_cast<uint32_t>(c));
    return result;
}
#endif // U_HIDE_DRAFT_API
} // namespace utfstring
} // namespace U_HEADER_ONLY_NAMESPACE
#endif // U_HIDE_DRAFT_API
#endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
#endif // __UTFSTRING_H__
|