File: generalized-utf8-decoder.h

package info (click to toggle)

chromium 139.0.7258.127-1

links: PTS, VCS
area: main
in suites:
size: 6,122,068 kB
sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36

file content (104 lines) | stat: -rw-r--r-- 4,727 bytes

parent folder | download | duplicates (14)

// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ and the sibling file
// utf8-decoder.h for details.
//
// This file decodes "generalized UTF-8", which is the same as UTF-8 except that
// it allows surrogates: https://simonsapin.github.io/wtf-8/#generalized-utf8

#include <stdint.h>

#ifndef __GENERALIZED_UTF8_DFA_DECODER_H
#define __GENERALIZED_UTF8_DFA_DECODER_H

// Table-driven DFA that decodes "generalized UTF-8" (UTF-8 that additionally
// permits surrogate code points) one byte at a time.
struct GeneralizedUtf8DfaDecoder {
  // Decoder states.  Each enumerator's value is the offset of that state's row
  // in the next-state table inside Decode(); every row is 11 entries wide.
  enum State : uint8_t {
    kReject = 0,            // Invalid input seen; every byte stays here.
    kAccept = 11,           // Between code points; a lead byte may follow.
    kTwoByte = 22,          // One continuation byte (80-BF) still expected.
    kThreeByte = 33,        // Two continuation bytes still expected.
    kFourByte = 44,         // Three continuation bytes still expected.
    kFourByteLow = 55,      // After F4: next byte must be in 80-8F.
    kThreeByteHigh = 66,    // After E0: next byte must be in A0-BF.
    kFourByteMidHigh = 77,  // After F0: next byte must be in 90-BF.
  };

  // Consumes one input byte.
  //
  //   byte   - the next byte of the encoded stream.
  //   state  - in/out: current DFA state, replaced by the successor state.
  //            Must hold one of the State enumerators on entry; the largest
  //            index formed is then 77 + 10 = 87, the last table entry.
  //   buffer - in/out: code point accumulator.  It is shifted left by six bits
  //            and OR-ed with the payload bits of `byte` on every call,
  //            including calls that end in kReject.
  //
  // NOTE(review): because the accumulator is always shifted rather than
  // cleared, callers presumably reset *buffer to 0 after each completed code
  // point -- confirm against the call sites.
  static inline void Decode(uint8_t byte, State* state, uint32_t* buffer) {
    // First table: input byte -> transition value.
    //
    // A transition value selects the column of the next-state table, and it
    // also fixes which bits of the input byte are merged into the code point
    // (the mask is 0x7F >> (transition / 2)):
    //
    //   Transition | Bits of the current byte kept for the code point
    //   --------------------------------------------------------------
    //    0, 1      | 0b01111111
    //    2, 3      | 0b00111111
    //    4, 5      | 0b00011111
    //    6, 7      | 0b00001111
    //    8, 9      | 0b00000111
    //    10        | 0b00000011
    //
    // The WTF-8 byte layout therefore constrains the assignment as follows:
    //
    //   1. Single-byte encodings need transition 0 or 1, so that all of the
    //      low 7 bits survive.
    //   2. Continuation bytes (0x80 to 0xBF, i.e. 0b10xxxxxx) need a
    //      transition between 0 and 3.
    //   3. Lead bytes of 2-byte encodings (0b110yyyyy) may use a transition
    //      between 2 and 5.
    //   4. Lead bytes of 3-byte encodings (0b1110zzzz) need a transition
    //      between 4 and 7.
    //   5. Lead bytes of 4-byte encodings (0b11110uuu) need a transition
    //      between 6 and 9.
    //   6. Extra states are required to enforce the irregular constraints.
    //      When it is known that some of the high payload bits are zero (e.g.
    //      the top bits of xxxx in 0b1110xxxx), a higher transition value,
    //      which masks off more bits, may be used.
    //   7. A transition that leads to the reject state may use any value.
    static constexpr uint8_t kByteToTransition[] = {
        0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00-0x0F
        0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10-0x1F
        0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20-0x2F
        0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30-0x3F
        0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40-0x4F
        0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50-0x5F
        0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60-0x6F
        0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70-0x7F
        1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x80-0x8F
        2,  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0x90-0x9F
        3,  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xA0-0xAF
        3,  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xB0-0xBF
        8,  8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xC0-0xCF
        4,  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xD0-0xDF
        9,  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,  // 0xE0-0xEF
        10, 6, 6, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  // 0xF0-0xFF
    };

    // Second table: (current state + transition) -> next state.  Rows are the
    // states, columns are the transition values 0..10, which correspond to
    // these input byte classes:
    //
    //  col 0: 00-7F
    //  |   col 1: 80-8F
    //  |   |   col 2: 90-9F
    //  |   |   |   col 3: A0-BF
    //  |   |   |   |   col 4: C2-DF
    //  |   |   |   |   |   col 5: E1-EF
    //  |   |   |   |   |   |   col 6: F1-F3
    //  |   |   |   |   |   |   |   col 7: F4
    //  |   |   |   |   |   |   |   |   col 8: C0, C1, F5-FF
    //  |   |   |   |   |   |   |   |   |  col 9: E0
    //  |   |   |   |   |   |   |   |   |  |   col 10: F0
    static constexpr uint8_t kNextState[] = {
        0,  0,  0,  0,  0,  0,  0,  0,  0, 0,  0,   // row  0: kReject
        11, 0,  0,  0,  22, 33, 44, 55, 0, 66, 77,  // row 11: kAccept
        0,  11, 11, 11, 0,  0,  0,  0,  0, 0,  0,   // row 22: kTwoByte
        0,  22, 22, 22, 0,  0,  0,  0,  0, 0,  0,   // row 33: kThreeByte
        0,  33, 33, 33, 0,  0,  0,  0,  0, 0,  0,   // row 44: kFourByte
        0,  33, 0,  0,  0,  0,  0,  0,  0, 0,  0,   // row 55: kFourByteLow
        0,  0,  0,  22, 0,  0,  0,  0,  0, 0,  0,   // row 66: kThreeByteHigh
        0,  0,  33, 33, 0,  0,  0,  0,  0, 0,  0,   // row 77: kFourByteMidHigh
    };

    const uint8_t transition = kByteToTransition[byte];

    // The state value is already the row offset, so adding the transition
    // yields the flat index of the successor state.
    const unsigned next_index = static_cast<unsigned>(*state) + transition;
    *state = static_cast<State>(kNextState[next_index]);

    // Two consecutive transition values share one mask; each pair drops one
    // more high bit from the 7-bit base mask.
    const uint32_t payload_mask = 0x7Fu >> (transition >> 1);
    const uint32_t payload = byte & payload_mask;
    *buffer = (*buffer << 6) | payload;
  }
};

#endif  // __GENERALIZED_UTF8_DFA_DECODER_H