File: utf8.cc

package info (click to toggle)
chromium 138.0.7204.183-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 6,071,908 kB
  • sloc: cpp: 34,937,088; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,953; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,806; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (148 lines) | stat: -rw-r--r-- 5,882 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// UTF8 utilities, implemented to reduce dependencies.

#include "absl/strings/internal/utf8.h"

#include <cstddef>
#include <cstdint>
#include <limits>

#include "absl/base/config.h"

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace strings_internal {

// Encodes `utf8_char` as UTF-8 into `buffer` and returns the number of bytes
// written (1 to 4). The encoded length is chosen purely from the magnitude of
// the value: up to 0x7F -> 1 byte, up to 0x7FF -> 2 bytes, up to 0xFFFF ->
// 3 bytes, anything larger -> 4 bytes. No validation of the value is
// performed, and `buffer` must have room for the bytes written.
size_t EncodeUTF8Char(char* buffer, char32_t utf8_char) {
  // Single-byte values are stored verbatim.
  if (utf8_char <= 0x7F) {
    buffer[0] = static_cast<char>(utf8_char);
    return 1;
  }

  // Pick the sequence length and the matching lead-byte marker
  // (110xxxxx, 1110xxxx or 11110xxx).
  const size_t length = utf8_char <= 0x7FF ? 2 : (utf8_char <= 0xFFFF ? 3 : 4);
  const char32_t lead_marker =
      length == 2 ? 0xC0 : (length == 3 ? 0xE0 : 0xF0);

  // Fill the continuation bytes (10xxxxxx) from the last one backwards,
  // consuming six payload bits per byte.
  char32_t remaining = utf8_char;
  for (size_t index = length - 1; index > 0; --index) {
    buffer[index] = static_cast<char>(0x80 | (remaining & 0x3F));
    remaining >>= 6;
  }

  // Whatever bits are left belong in the lead byte.
  buffer[0] = static_cast<char>(lead_marker | remaining);
  return length;
}

// Converts one `wchar_t` unit to UTF-8 bytes written at `buf`, using `s` to
// carry surrogate-pair state between calls.
//
// Returns the number of bytes written by this call, or `static_cast<size_t>(-1)`
// on error:
//   * U+0000..U+007F                  -> 1 byte
//   * U+0080..U+07FF                  -> 2 bytes
//   * U+0800..U+FFFF (non-surrogate)  -> 3 bytes
//   * high surrogate U+D800..U+DBFF   -> first 2 bytes of a 4-byte sequence;
//                                        `s` records that a high surrogate was
//                                        seen plus its two lowest bits
//   * low surrogate U+DC00..U+DFFF    -> last 2 bytes of that sequence if `s`
//                                        says a high surrogate preceded it,
//                                        otherwise an error with `s` untouched
//   * U+10000..U+10FFFF               -> 4 bytes (only where `wchar_t` can
//                                        hold values above 0xFFFF)
// Every other outcome, including values outside all of these ranges, resets
// `s` to its value-initialized state.
size_t WideToUtf8(wchar_t wc, char* buf, ShiftState& s) {
  // All byte arithmetic is done through an `unsigned char` view of the output
  // so that the bit patterns are not affected by the signedness of `char`.
  auto* const out = reinterpret_cast<unsigned char*>(buf);
  const uint32_t cp = static_cast<uint32_t>(wc);
  constexpr size_t kFailure = static_cast<size_t>(-1);

  // Builds a continuation byte (10xxxxxx) from the low six bits of `bits`.
  const auto trail = [](uint32_t bits) {
    return static_cast<unsigned char>(0x80 | (bits & 0x3F));
  };

  // One byte: 0xxxxxxx.
  if (cp <= 0x7F) {
    out[0] = static_cast<unsigned char>(cp & 0x7F);
    s = {};
    return 1;
  }

  // Two bytes: 110xxxxx 10xxxxxx.
  if (cp <= 0x7FF) {
    out[0] = static_cast<unsigned char>(0xC0 | ((cp >> 6) & 0x1F));
    out[1] = trail(cp);
    s = {};
    return 2;
  }

  const bool is_high_surrogate = 0xD800 <= cp && cp <= 0xDBFF;
  const bool is_low_surrogate = 0xDC00 <= cp && cp <= 0xDFFF;

  // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. Surrogate code points are
  // excluded here and handled separately below.
  if (cp <= 0xFFFF && !is_high_surrogate && !is_low_surrogate) {
    out[0] = static_cast<unsigned char>(0xE0 | ((cp >> 12) & 0x0F));
    out[1] = trail(cp >> 6);
    out[2] = trail(cp);
    s = {};
    return 3;
  }

  if (is_high_surrogate) {
    // Bits 6..9 of the surrogate, plus one, give the top five bits of the
    // final code point (the +1 accounts for the 0x10000 offset of
    // surrogate-encoded code points).
    const unsigned char top_bits =
        static_cast<unsigned char>(((cp >> 6) & 0x0F) + 1);

    // Lead byte of the 4-byte sequence: 11110xxx.
    out[0] = static_cast<unsigned char>(0xF0 | ((top_bits >> 2) & 0x07));
    // Second byte: 10xxxxxx, made of the two low bits of `top_bits` followed
    // by bits 2..5 of the surrogate.
    out[1] = static_cast<unsigned char>(0x80 | ((top_bits << 4) & 0x30) |
                                        ((cp >> 2) & 0x0F));
    // The two lowest bits of the surrogate do not fit in the second byte;
    // stash them for the low-surrogate call.
    s = {true, static_cast<unsigned char>(cp & 0x03)};
    return 2;
  }

  if (is_low_surrogate) {
    // A low surrogate is only meaningful right after a high surrogate.
    // On failure `s` is deliberately left as it was.
    if (!s.saw_high_surrogate) {
      return kFailure;
    }

    // Third byte: 10xxxxxx, made of the two stashed bits followed by bits
    // 6..9 of the low surrogate.
    out[0] = static_cast<unsigned char>(0x80 | ((s.bits << 4) & 0x30) |
                                        ((cp >> 6) & 0x0F));
    // Fourth byte: 10xxxxxx, the low six bits of the low surrogate.
    out[1] = trail(cp);
    s = {};
    return 2;
  }

  // Direct four-byte encoding is compiled in only when `wchar_t` can hold
  // values above 0xFFFF.
  if constexpr (0xFFFF < std::numeric_limits<wchar_t>::max()) {
    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    if (0x10000 <= cp && cp <= 0x10FFFF) {
      out[0] = static_cast<unsigned char>(0xF0 | ((cp >> 18) & 0x07));
      out[1] = trail(cp >> 12);
      out[2] = trail(cp >> 6);
      out[3] = trail(cp);
      s = {};
      return 4;
    }
  }

  // Nothing matched: the value is not encodable. Clear any pending surrogate
  // state and report the error.
  s = {};
  return kFailure;
}

}  // namespace strings_internal
ABSL_NAMESPACE_END
}  // namespace absl