File: unescape_string.c

package info (click to toggle)
haskell-aeson 2.1.2.1-6
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 8,988 kB
sloc: haskell: 11,933; ansic: 123; makefile: 11
file content (149 lines) | stat: -rw-r--r-- 4,720 bytes
parent folder | download | duplicates (4)
// Copyright (c) 2008-2009 Bjoern Hoehrmann
// Copyright (c) 2015, Ondrej Palkovsky
// Copyright (c) 2016, Winterland

#include <string.h>
#include <stdio.h>
#include <stdint.h>


// States of the UTF-8 decoding automaton driven by decode(). A state value is
// an offset into the transition part of utf8d, so states are multiples of 12.
//
// UTF8_ACCEPT: the start state; decode() returns to it each time a complete
// code point has been assembled.
#define UTF8_ACCEPT 0
// UTF8_REJECT: entered on malformed input. Its row in the transition table
// contains only UTF8_REJECT, so the automaton never leaves it.
#define UTF8_REJECT 12

// Lookup table for the UTF-8 decoding automaton used by decode().
// Layout: 256 byte-to-class entries followed by 9 rows of 12 state
// transitions (one row per state, one column per character class).
static const uint8_t utf8d[] = {
  // The first part of the table maps each byte value (0x00-0xFF) to a
  // character class in the range 0-11. Working with classes instead of raw
  // bytes keeps the transition table small, and decode() also uses the class
  // as a shift count to build the bitmask that extracts the payload bits of
  // a leading byte.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  // The second part is the transition table. The entry at index
  // 256 + state + class holds the next state. States are stored
  // pre-multiplied by 12 (the number of classes), so each group of 12
  // values below is the row for one state: 0 (UTF8_ACCEPT), 12 (UTF8_REJECT),
  // 24, 36, ... 96.
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};

// Feed one input byte to the UTF-8 automaton.
//
//   state - in/out: current automaton state (UTF8_ACCEPT before the first
//           byte of a sequence); updated to the state after `byte`.
//   codep - in/out: code point being assembled. It is overwritten when a new
//           sequence starts and extended by six bits for every further byte.
//   byte  - the next input byte (0-255).
//
// Returns the new state: UTF8_ACCEPT once *codep holds a complete code point,
// UTF8_REJECT on malformed input, any other value while more bytes are needed.
static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  const uint32_t cls = utf8d[byte];

  if (*state == UTF8_ACCEPT) {
    // Leading byte: the class doubles as the shift count that strips the
    // length-marker bits and leaves only the payload.
    *codep = byte & (0xffu >> cls);
  } else {
    // Further byte of a sequence: append its low six bits.
    *codep = (*codep << 6) | (byte & 0x3fu);
  }

  *state = utf8d[256 + *state + cls];
  return *state;
}

// Convert one ASCII hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
// value 0-15. Any other input yields the sentinel 0xFFFF, which callers
// test for to detect an invalid digit.
static inline uint16_t decode_hex(uint32_t c)
{
  // Unsigned subtraction wraps for values below the range start, so a
  // single comparison covers both bounds.
  const uint32_t decimal = c - '0';
  if (decimal <= 9u) {
    return (uint16_t) decimal;
  }

  // Setting bit 0x20 folds 'A'-'F' onto 'a'-'f' and leaves 'a'-'f' unchanged.
  const uint32_t alpha = (c | 0x20u) - 'a';
  if (alpha <= 5u) {
    return (uint16_t) (alpha + 10u);
  }

  return 0xFFFF; // not a hexadecimal digit
}

// Unescape the body of a JSON string: read UTF-8 bytes from [s, srcend),
// resolve backslash escapes, and append the result as UTF-16 code units to
// dest, starting at index *destoff.
//
//   dest    - output buffer of UTF-16 code units. Its capacity is not checked
//             here; the caller must provide enough room.
//   destoff - in: index in dest at which to start writing.
//             out: index one past the last unit written. Only updated when
//             the whole input has been consumed (return value 0 or 1).
//   s       - first input byte.
//   srcend  - one past the last input byte.
//
// Returns 0 on success, 1 if the input ends in the middle of a multi-byte
// UTF-8 sequence, and -1 on malformed UTF-8, an unknown escape, a bad hex
// digit, input ending inside an escape, or an unpaired \u surrogate.
int _js_decode_string(uint16_t *const dest, size_t *destoff,
                  const uint8_t *s, const uint8_t *const srcend)
{
  uint16_t *out = dest + *destoff;
  uint32_t utf8_state = UTF8_ACCEPT;
  uint32_t cp;

  while (s < srcend) {
    if (decode(&utf8_state, &cp, *s++) != UTF8_ACCEPT) {
      if (utf8_state == UTF8_REJECT) {
        return -1;
      }
      continue; // in the middle of a multi-byte sequence
    }

    if (cp != '\\') {
      if (cp <= 0xffff) {
        *out++ = (uint16_t) cp;
      } else {
        // Outside the BMP: emit a surrogate pair. 0xD7C0 is
        // 0xD800 - (0x10000 >> 10), which folds in the offset subtraction.
        *out++ = (uint16_t) (0xD7C0 + (cp >> 10));
        *out++ = (uint16_t) (0xDC00 + (cp & 0x3FF));
      }
      continue;
    }

    // A backslash: the byte naming the escape must follow.
    if (s >= srcend) {
      return -1;
    }
    const uint8_t esc = *s++;

    if (esc != 'u') {
      uint16_t plain;
      switch (esc) {
        case '"':
        case '\\':
        case '/':
          plain = esc;
          break;
        case 'b': plain = '\b'; break;
        case 'f': plain = '\f'; break;
        case 'n': plain = '\n'; break;
        case 'r': plain = '\r'; break;
        case 't': plain = '\t'; break;
        default:
          return -1;
      }
      *out++ = plain;
      continue;
    }

    // \uXXXX escape. A high surrogate must be followed immediately by a
    // second \uXXXX escape holding a low surrogate, so this loop runs
    // either once or twice.
    int expecting_low = 0;
    for (;;) {
      uint16_t unit = 0;
      for (int shift = 12; shift >= 0; shift -= 4) {
        if (s >= srcend) {
          return -1;
        }
        const uint16_t nibble = decode_hex(*s++);
        if (nibble == 0xFFFF) {
          return -1;
        }
        unit = (uint16_t) (unit | (nibble << shift));
      }

      // The unit is stored before the surrogate rules are checked.
      *out++ = unit;

      const int is_high = (unit >= 0xD800 && unit <= 0xDBFF);
      const int is_low  = (unit >= 0xDC00 && unit <= 0xDFFF);

      if (expecting_low) {
        if (!is_low) {
          return -1;
        }
        break; // surrogate pair complete
      }
      if (is_low) {
        return -1; // low surrogate without a preceding high one
      }
      if (!is_high) {
        break; // ordinary BMP code unit
      }

      // High surrogate: the next two bytes must be "\u".
      expecting_low = 1;
      if (s >= srcend) {
        return -1;
      }
      if (*s++ != '\\') {
        return -1;
      }
      if (s >= srcend) {
        return -1;
      }
      if (*s++ != 'u') {
        return -1;
      }
    }
  }

  *destoff = (size_t) (out - dest);
  // Non-zero when the input stopped inside a multi-byte UTF-8 sequence.
  return (utf8_state != UTF8_ACCEPT);
}