1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
|
// Copyright (c) 2008-2009 Bjoern Hoehrmann
// Copyright (c) 2015, Ondrej Palkovsky
// Copyright (c) 2016, Winterland
#include <string.h>
#include <stdio.h>
#include <stdint.h>
// States of the UTF-8 decoding automaton, as stored in and returned by
// decode(). A state value is the offset of its row in the transition part of
// utf8d (12 entries per row), which is why the values are multiples of 12.
// UTF8_ACCEPT: start state; also reached whenever a code point is complete.
#define UTF8_ACCEPT 0
// UTF8_REJECT: the input is not valid UTF-8. Every transition out of this
// state leads back to it.
#define UTF8_REJECT 12
// Lookup table driving the UTF-8 decoding automaton used by decode().
// Layout:
//   utf8d[0..255]   - character class (0..11) of each possible byte value
//   utf8d[256..363] - transition table, 9 rows of 12 entries each; the row
//                     for a state starts at offset 256 + state, and the
//                     column is the character class of the incoming byte.
static const uint8_t utf8d[] = {
// The first part of the table maps bytes to character classes, which
// reduces the size of the transition table and is also used to build the
// bitmask for the payload bits of a leading byte (0xff >> class).
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
// The second part is a transition table that maps a combination
// of a state of the automaton and a character class to a state.
// Each group of 12 values is one row; the first row belongs to state 0
// (UTF8_ACCEPT) and the second, all 12s, to state 12 (UTF8_REJECT).
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
};
// Feeds one input byte to the UTF-8 automaton.
//   state - in/out: current automaton state (start with UTF8_ACCEPT)
//   codep - in/out: code point being assembled; only meaningful to the
//           caller when the returned state is UTF8_ACCEPT
//   byte  - next input byte; must be in 0..255 since it indexes utf8d
// Returns the new state, which is also stored in *state.
static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
    const uint32_t cls = utf8d[byte];

    if (*state == UTF8_ACCEPT) {
        // First byte of a sequence: the class doubles as the number of
        // leading marker bits to mask off.
        *codep = (0xff >> cls) & (byte);
    } else {
        // Continuation byte: append its low six payload bits.
        *codep = (byte & 0x3fu) | (*codep << 6);
    }

    const uint32_t next = utf8d[256 + *state + cls];
    *state = next;
    return next;
}
// Converts one hexadecimal digit character ('0'-'9', 'a'-'f', 'A'-'F') to
// its numeric value 0..15. Any other input yields the sentinel 0xFFFF,
// which callers test for to detect a malformed escape.
static inline uint16_t decode_hex(uint32_t c)
{
    // Unsigned subtraction wraps for characters below the range start, so a
    // single comparison checks both ends of each range.
    const uint32_t as_digit = c - '0';
    const uint32_t as_lower = c - 'a';
    const uint32_t as_upper = c - 'A';

    if (as_digit < 10u) {
        return (uint16_t) as_digit;
    }
    if (as_lower < 6u) {
        return (uint16_t) (as_lower + 10u);
    }
    if (as_upper < 6u) {
        return (uint16_t) (as_upper + 10u);
    }
    return 0xFFFF;
}
// Reads exactly four hexadecimal digits starting at *sp, never touching
// srcend or anything after it, and combines them most significant digit
// first into *out. On success advances *sp past the digits and returns 0;
// returns -1 if the input ends early or a byte is not a hex digit.
static inline int js_read_hex4(const uint8_t **sp, const uint8_t *const srcend,
                               uint16_t *out)
{
    const uint8_t *p = *sp;
    uint16_t value = 0;

    for (int i = 0; i < 4; i++) {
        if (p >= srcend) {
            return -1;
        }
        const uint16_t digit = decode_hex(*p++);
        if (digit == 0xFFFF) {
            return -1;
        }
        value = (uint16_t) ((value << 4) | digit);
    }

    *sp = p;
    *out = value;
    return 0;
}

// Decodes the bytes in [s, srcend) - UTF-8 text with backslash escapes
// (\" \\ \/ \b \f \n \r \t and \uXXXX) - into UTF-16 code units.
//
//   dest    - output buffer. No bounds checking is done; the caller must
//             provide enough room. The decoder never writes more code units
//             than the number of input bytes it has consumed.
//   destoff - in: index in dest at which writing starts;
//             out: index one past the last code unit written. It is updated
//             only when the whole input has been consumed (return 0 or 1).
//   s       - first input byte.
//   srcend  - one past the last input byte.
//
// Returns:
//    0 - success.
//    1 - the input ended in the middle of a multi-byte UTF-8 sequence.
//   -1 - invalid input: malformed UTF-8, unknown or truncated escape,
//        non-hex digit in \uXXXX, a \u high surrogate not immediately
//        followed by a \u low surrogate, or a lone \u low surrogate.
//        *destoff is left untouched, but dest may already have been
//        partially written.
int _js_decode_string(uint16_t *const dest, size_t *destoff,
                      const uint8_t *s, const uint8_t *const srcend)
{
    uint16_t *d = dest + *destoff;
    uint32_t state = UTF8_ACCEPT;
    uint32_t codepoint = 0;

    while (s < srcend) {
        if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
            if (state == UTF8_REJECT) {
                return -1;
            }
            continue; // still inside a multi-byte sequence
        }

        if (codepoint != '\\') {
            if (codepoint <= 0xffff) {
                *d++ = (uint16_t) codepoint;
            } else {
                // Split into a surrogate pair. 0xD7C0 is 0xD800 minus
                // (0x10000 >> 10), folding the 0x10000 bias into the base.
                *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
                *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
            }
            continue;
        }

        // Backslash escape: the selector is read as a raw byte, so any
        // non-ASCII byte here falls through to the default (error) case.
        if (s >= srcend) {
            return -1;
        }
        const uint8_t esc = *s++;
        switch (esc) {
        case '"':
        case '\\':
        case '/':
            *d++ = esc;
            break;
        case 'b':
            *d++ = '\b';
            break;
        case 'f':
            *d++ = '\f';
            break;
        case 'n':
            *d++ = '\n';
            break;
        case 'r':
            *d++ = '\r';
            break;
        case 't':
            *d++ = '\t';
            break;
        case 'u': {
            uint16_t unit;
            if (js_read_hex4(&s, srcend, &unit) != 0) {
                return -1;
            }
            *d++ = unit;

            if (unit >= 0xD800 && unit <= 0xDBFF) {
                // High surrogate: a "\uXXXX" low surrogate must follow
                // immediately.
                if (s >= srcend || *s++ != '\\') {
                    return -1;
                }
                if (s >= srcend || *s++ != 'u') {
                    return -1;
                }
                if (js_read_hex4(&s, srcend, &unit) != 0) {
                    return -1;
                }
                *d++ = unit;
                if (unit < 0xDC00 || unit > 0xDFFF) {
                    return -1;
                }
            } else if (unit >= 0xDC00 && unit <= 0xDFFF) {
                // Low surrogate without a preceding high surrogate.
                return -1;
            }
            break;
        }
        default:
            return -1;
        }
    }

    *destoff = (size_t) (d - dest);
    // Non-zero when the input stopped part-way through a UTF-8 sequence.
    return (state != UTF8_ACCEPT);
}
|