File: unescape_string.c

package info (click to toggle)
haskell-aeson 2.1.2.1-6
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 8,988 kB
  • sloc: haskell: 11,933; ansic: 123; makefile: 11
file content (149 lines) | stat: -rw-r--r-- 4,720 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
// Copyright (c) 2008-2009 Bjoern Hoehrmann
// Copyright (c) 2015, Ondrej Palkovsky
// Copyright (c) 2016, Winterland

#include <string.h>
#include <stdio.h>
#include <stdint.h>


#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

// Lookup table driving the UTF-8 decoding automaton used by decode().
//
// Layout:
//   [0, 256)   - character class (0..11) of every possible byte value.
//   [256, ...) - transition table, 12 entries per state, indexed by
//                256 + state + class. States are stored pre-multiplied
//                by 12 (0, 12, 24, ...), so no multiplication is needed
//                at lookup time. State 0 is UTF8_ACCEPT and state 12 is
//                UTF8_REJECT.
static const uint8_t utf8d[] = {
  // The first part of the table maps bytes to character classes, which
  // reduces the size of the transition table and creates bitmasks.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};

// Advance the UTF-8 decoding automaton by one input byte.
//
// *state carries the automaton state between calls (UTF8_ACCEPT before the
// first byte of a sequence) and *codep accumulates the code point being
// assembled. byte indexes the 256-entry class table, so it must be < 256.
//
// Returns the new state, which is also stored in *state. UTF8_ACCEPT means
// *codep now holds a complete code point; any other value means the
// sequence is either still in progress or has hit UTF8_REJECT.
static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  const uint32_t cls = utf8d[byte];

  if (*state == UTF8_ACCEPT) {
    // Start of a sequence: 0xff shifted right by the byte's class gives
    // the mask applied to the byte to seed the code point.
    *codep = (0xff >> cls) & byte;
  } else {
    // Inside a sequence: shift in the low six bits of this byte.
    *codep = (*codep << 6) | (byte & 0x3fu);
  }

  // States are stored pre-multiplied, so state + class is the row/column
  // offset into the transition part of the table.
  *state = utf8d[256 + *state + cls];
  return *state;
}

// Convert one ASCII hexadecimal digit (either case) to its value 0..15.
// Any other input yields the sentinel 0xFFFF.
static inline uint16_t decode_hex(uint32_t c)
{
  // Unsigned subtraction wraps for characters below the range start, so a
  // single "<=" comparison covers both ends of each range.
  const uint32_t as_digit = c - '0';
  const uint32_t as_lower = c - 'a';
  const uint32_t as_upper = c - 'A';

  if (as_digit <= 9) {
    return (uint16_t) as_digit;
  }
  if (as_lower <= 5) {
    return (uint16_t) (as_lower + 10);
  }
  if (as_upper <= 5) {
    return (uint16_t) (as_upper + 10);
  }
  return 0xFFFF; // not a hex digit
}

// Unescape the body of a JSON string into 16-bit code units.
//
// Reads the bytes in [s, srcend), checks them with the UTF-8 automaton and
// resolves backslash escapes, appending the resulting units to dest
// starting at index *destoff:
//   - code points up to 0xFFFF are stored as one unit, larger ones as a
//     surrogate pair;
//   - \" \\ \/ \b \f \n \r \t produce the corresponding single unit;
//   - \uXXXX produces the unit XXXX; a high surrogate (D800..DBFF) must be
//     followed immediately by a \uXXXX low surrogate (DC00..DFFF), and a
//     low surrogate on its own is an error.
//
// The function receives no capacity for dest and performs no bounds check
// on it.
//
// Returns 0 on success with *destoff advanced past the written units.
// Returns -1 on a rejected UTF-8 byte, an unknown or truncated escape, a
// non-hex digit in \u, or a bad surrogate sequence; *destoff is then left
// unchanged, although dest may already have been partially written.
// Returns 1, with *destoff updated, if the input ends in the middle of a
// multi-byte UTF-8 sequence.
int _js_decode_string(uint16_t *const dest, size_t *destoff,
                  const uint8_t *s, const uint8_t *const srcend)
{
  uint16_t *out = dest + *destoff;
  uint32_t state = UTF8_ACCEPT;
  uint32_t cp;

  while (s < srcend) {
    // Feed bytes to the automaton until a whole code point is available.
    if (decode(&state, &cp, *s++) != UTF8_ACCEPT) {
      if (state == UTF8_REJECT) {
        return -1;
      }
      continue;
    }

    if (cp != '\\') {
      if (cp <= 0xffff) {
        *out++ = (uint16_t) cp;
      } else {
        // Split into a surrogate pair; 0xD7C0 is 0xD800 - (0x10000 >> 10).
        *out++ = (uint16_t) (0xD7C0 + (cp >> 10));
        *out++ = (uint16_t) (0xDC00 + (cp & 0x3FF));
      }
      continue;
    }

    // Backslash seen: everything that follows in the escape is read as a
    // raw byte, bypassing the UTF-8 automaton.
    if (s >= srcend) {
      return -1;
    }
    const uint8_t esc = *s++;

    if (esc != 'u') {
      uint16_t unit;
      switch (esc) {
        case '"':
        case '\\':
        case '/':
          unit = esc;
          break;
        case 'b': unit = '\b'; break;
        case 'f': unit = '\f'; break;
        case 'n': unit = '\n'; break;
        case 'r': unit = '\r'; break;
        case 't': unit = '\t'; break;
        default:
          return -1;
      }
      *out++ = unit;
      continue;
    }

    // \uXXXX, possibly followed by a second \uXXXX completing a pair.
    int expect_low = 0;
    for (;;) {
      uint16_t unit = 0;
      for (int shift = 12; shift >= 0; shift -= 4) {
        if (s >= srcend) {
          return -1;
        }
        const uint16_t nibble = decode_hex(*s++);
        if (nibble == 0xFFFF) {
          return -1;
        }
        unit |= (uint16_t) (nibble << shift);
      }
      // The unit is stored before it is validated; on failure the caller
      // never sees it because *destoff is not updated.
      *out++ = unit;

      const int is_high = (unit >= 0xD800 && unit <= 0xDBFF);
      const int is_low = (unit >= 0xDC00 && unit <= 0xDFFF);

      if (expect_low) {
        if (!is_low) {
          return -1;
        }
        break;
      }
      if (is_low) {
        return -1; // low surrogate without a preceding high one
      }
      if (!is_high) {
        break; // ordinary unit, nothing more to read
      }

      // High surrogate: the next two bytes must be exactly "\u".
      if (s >= srcend) {
        return -1;
      }
      if (*s++ != '\\') {
        return -1;
      }
      if (s >= srcend) {
        return -1;
      }
      if (*s++ != 'u') {
        return -1;
      }
      expect_low = 1;
    }
  }

  *destoff = (size_t) (out - dest);
  // Non-zero when the input stopped part-way through a UTF-8 sequence.
  return (state != UTF8_ACCEPT);
}