/* * Copyright (C) 2020 Linux Studio Plugins Project * (C) 2020 Vladimir Sadovnikov * * This file is part of lsp-runtime-lib * Created on: 14 окт. 2019 г. * * lsp-runtime-lib is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * any later version. * * lsp-runtime-lib is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with lsp-runtime-lib. If not, see . */ #include #include #include #define IO_BUF_SIZE 0x1000 namespace lsp { namespace json { static const char *ecma_reserved[] = { "Infinity", "abstract", "arguments", "await", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "continue", "debugger", "default", "delete", "do", "double", "instanceof", "interface", "else", "enum", "eval", "export", "extends", "false", "final", "finally", "float", "for", "function", "goto", "if", "implements", "import", "in", "int", "let", "long", "native", "new", "null", "package", "private", "protected", "public", "return", "short", "static", "super", "switch", "synchronized", "this", "throw", "transient", "try", "true", "typeof", "var", "void", "volatile", "while", "with", "yield" }; Tokenizer::Tokenizer(io::IInSequence *in) { pIn = in; cCurrent = -1; enToken = JT_UNKNOWN; nError = STATUS_OK; fValue = 0; iValue = 0; nUnget = 0; bEscaping = false; nIoSize = 0; nIoOffset = 0; pIoBuf = NULL; } Tokenizer::~Tokenizer() { pIn = NULL; if (pIoBuf != NULL) { ::free(pIoBuf); pIoBuf = NULL; } } token_t Tokenizer::set_error(status_t code) { nError = code; return enToken = JT_ERROR; } lsp_swchar_t Tokenizer::skip_whitespace() { if (cCurrent < 0) cCurrent = get_char(); while (true) { // Skip whitespace if (::iswspace(cCurrent) || (::iswblank(cCurrent))) cCurrent = get_char(); else return cCurrent; } } status_t Tokenizer::fill_buffer() { // Check that we need to shift the buffer if (nIoOffset > 0) { size_t tail = nIoSize - nIoOffset; if (tail > 0) ::memmove(pIoBuf, &pIoBuf[nIoOffset], tail * sizeof(lsp_wchar_t)); nIoSize = tail; nIoOffset = 0; } // Check that we can read something ssize_t to_read = IO_BUF_SIZE - nIoSize; if (to_read <= 0) return STATUS_OK; // Lazy buffer allocation if (pIoBuf == NULL) { pIoBuf = static_cast(::malloc(IO_BUF_SIZE * sizeof(lsp_wchar_t))); if (pIoBuf == NULL) return STATUS_NO_MEM; } // Read data to buffer while (to_read > 0) { ssize_t nread = pIn->read(&pIoBuf[nIoSize], to_read); if (nread < 0) return (nIoOffset < nIoSize) ? STATUS_OK : -nread; nIoSize += nread; to_read -= nread; } return STATUS_OK; } lsp_swchar_t Tokenizer::get_char() { status_t res; int digit; size_t tail = nIoSize - nIoOffset; // Escaping mode? if (bEscaping) { if (tail > 0) return pIoBuf[nIoOffset++]; if ((res = fill_buffer()) != STATUS_OK) return -res; return pIoBuf[nIoOffset++]; } // There should be at least 2 characters to lookup for '\u' and '\x' sequence size_t off = nIoOffset; if (tail < 2) { if ((res = fill_buffer()) != STATUS_OK) return -res; tail = nIoSize - nIoOffset; if (tail < 2) return pIoBuf[nIoOffset++]; off = nIoOffset; } // Read first character lsp_swchar_t c = pIoBuf[off++]; if ((c != '\\') || (tail < 2)) return pIoBuf[nIoOffset++]; // Got "\", lookup for next character c = pIoBuf[off++]; if ((c == 'x') || (c == 'X')) { // Got "\x" sequence, should be at least 4 chars for '\x12' sequence if (tail < 4) { if ((res = fill_buffer()) != STATUS_OK) return -res; tail = nIoSize - nIoOffset; if (tail < 4) return pIoBuf[nIoOffset++]; off = nIoOffset; } c = 0; for (size_t i=0; i<2; ++i) { if (!parse_digit(&digit, pIoBuf[off++], 16)) return pIoBuf[nIoOffset++]; c = (c << 4) + digit; } // Commit offset nIoOffset = off; return c; } else if ((c == 'u') || (c == 'U')) { // Got "\x" sequence, should be at least 6 chars for '\u1234' sequence if (tail < 6) { if ((res = fill_buffer()) != STATUS_OK) return -res; tail = nIoSize - nIoOffset; if (tail < 6) return pIoBuf[nIoOffset++]; off = nIoOffset; } // Read UTF-16 sequence c = 0; int digit = 0; for (size_t i=0; i<4; ++i) { if (!parse_digit(&digit, pIoBuf[off++], 16)) return pIoBuf[nIoOffset++]; c = (c << 4) + digit; } // All is OK, commit position nIoOffset = off; } else return pIoBuf[nIoOffset++]; // Check for surrogate if ((c < 0xd800) || (c >= 0xde00)) return c; // The character pretends to be a surrogate pair, lookup for next character tail = nIoSize - nIoOffset; if (tail < 6) // 6 characters for "\U1234" { if ((res = fill_buffer()) != STATUS_OK) return -res; tail = nIoSize - nIoOffset; if (tail < 6) return 0xfffd; // Invalid character in incomplete surrogate pair off = nIoOffset; } // Check that character matches lsp_swchar_t c2 = pIoBuf[off++]; if (c2 != '\\') return 0xfffd; c2 = pIoBuf[off++]; if ((c2 != 'u') && (c2 != 'U')) return 0xfffd; c2 = 0; for (size_t i=0; i<4; ++i) { if (!parse_digit(&digit, pIoBuf[off++], 16)) return 0xfffd; c2 = (c << 4) + digit; } // Finally, move surrogate pair into UTF-32 sequence if ((c & 0xfc00) == 0xd800) // surrogate high { if ((c2 & 0xfc00) != 0xdc00) // not surrogate low? return 0xfffd; c = 0x10000 + (((c & 0x3ff) << 10) | (c2 & 0x3ff)); } else // Surrogate low { if ((c2 & 0xfc00) != 0xd800) // not surrogate high? return 0xfffd; c = 0x10000 + (((c2 & 0x3ff) << 10) | (c & 0x3ff)); } // Commit position nIoOffset = off; // Return UTF-32 character return c; } lsp_swchar_t Tokenizer::lookup() { if (cCurrent < 0) cCurrent = get_char(); return cCurrent; } token_t Tokenizer::commit(token_t token) { if (cCurrent < 0) return set_error(STATUS_BAD_STATE); if (!sValue.append(cCurrent)) return set_error(STATUS_NO_MEM); enToken = token; cCurrent = -1; return token; } lsp_swchar_t Tokenizer::commit_lookup(token_t token) { token_t x = commit(token); return (x != JT_ERROR) ? lookup() : -1; } bool Tokenizer::is_identifier_start(lsp_wchar_t ch) { if (::iswupper(ch)) return true; if (::iswlower(ch)) return true; return (ch == '_') || (ch == '$') || (ch == '\\'); } bool Tokenizer::is_identifier(lsp_wchar_t ch) { if (::iswupper(ch)) return true; if (::iswlower(ch)) return true; if (::iswdigit(ch)) return true; return (ch == '_') || (ch == '$'); } bool Tokenizer::is_reserved_word(const LSPString *text) { ssize_t first = 0, last = sizeof(ecma_reserved) / sizeof (const char *) - 1; while (first <= last) { ssize_t center = (first + last) >> 1; int cmp = text->compare_to_ascii(ecma_reserved[center]); if (cmp < 0) last = center - 1; else if (cmp > 0) first = center + 1; else return true; } return false; } bool Tokenizer::is_valid_identifier(const LSPString *text) { size_t len = text->length(); if (len <= 0) return false; if (!is_identifier_start(text->char_at(0))) return false; for (size_t i=1; ichar_at(i))) return false; } return !is_reserved_word(text); } bool Tokenizer::parse_digit(int *digit, lsp_wchar_t ch, int radix) { int res; if ((ch >= '0') && (ch <= '9')) res = ch - '0'; else if ((ch >= 'a') && (ch <= 'f')) res = ch - 'a' + 10; else if ((ch >= 'A') && (ch <= 'F')) res = ch - 'A' + 10; else return false; if (res >= radix) return false; *digit = res; return true; } token_t Tokenizer::parse_string(token_t type) { skip(type); bEscaping = false; // Parse string while (true) { // Read character lsp_swchar_t c = lookup(); if (c < 0) { bEscaping = false; return set_error(-c); } if (bEscaping) { switch (c) { // Escaped characters: ' " \ b f n r t v case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'v': c = '\v'; break; case '0': c = '\0'; break; case '\n': skip(type); // Skip end of line c = lookup(); if (c < 0) { if (c != -STATUS_EOF) { bEscaping = false; return set_error(-c); } } else if (c == '\r') skip(type); c = -1; break; case '\r': case 0x2028: case 0x2029: skip(type); c = -1; // Line terminator break; default: // Any other characters just omit the protector character in ECMA script break; } // Reset escaping flag bEscaping = false; // Append character (if it is present) if (c >= 0) { if (!sValue.append(c)) { bEscaping = false; return set_error(STATUS_NO_MEM); } skip(type); } } else if (c == '\\') { skip(type); bEscaping = true; } else { switch (c) { case '\n': bEscaping = false; return set_error(STATUS_BAD_TOKEN); case '\"': bEscaping = false; if (type == JT_DQ_STRING) return skip(type); break; case '\'': bEscaping = false; if (type == JT_SQ_STRING) return skip(type); break; default: break; } if ((type = commit(type)) == JT_ERROR) { bEscaping = false; return type; } } } bEscaping = false; return enToken = type; } token_t Tokenizer::parse_identifier() { // Commit identifier's start character token_t tok; while (true) { // Read character lsp_swchar_t c = lookup(); if (c < 0) { if (c != -STATUS_EOF) return set_error(-c); break; } // Check that character is an identifier if (!is_identifier(c)) break; if ((tok = commit(JT_IDENTIFIER)) == JT_ERROR) return tok; } // Analyze identifier if (sValue.equals_ascii("true")) return enToken = JT_TRUE; else if (sValue.equals_ascii("false")) return enToken = JT_FALSE; else if (sValue.equals_ascii("null")) return enToken = JT_NULL; else if (sValue.equals_ascii("NaN")) { fValue = NAN; return enToken = JT_DOUBLE; } else if (sValue.equals_ascii("Infinity")) { fValue = INFINITY; return enToken = JT_DOUBLE; } else if (is_reserved_word(&sValue)) return enToken = JT_RESERVED; return enToken = JT_IDENTIFIER; } token_t Tokenizer::parse_single_line_comment() { token_t tok; // Reset the length of string literal sValue.set_length(0); skip(JT_SL_COMMENT); while (true) { // Read character lsp_swchar_t c = lookup(); if (c < 0) return (c == -STATUS_EOF) ? JT_SL_COMMENT : set_error(-c); // Analyze character switch (c) { case '\n': return skip(JT_SL_COMMENT); default: { if ((tok = commit(JT_SL_COMMENT)) == JT_ERROR) return tok; break; } } } return JT_UNKNOWN; } token_t Tokenizer::parse_multiline_comment() { // Reset the length of string literal sValue.set_length(0); skip(JT_SL_COMMENT); lsp_swchar_t last = -1; while (true) { // Read character lsp_swchar_t c = lookup(); if (c < 0) return set_error(-c); // Check state if (last == '*') { if (c == '/') { // Remove last stored character (asterisk) sValue.remove_last(); return skip(JT_ML_COMMENT); } } else if (last == '\n') { if (c == '\r') { skip(JT_ML_COMMENT); continue; } last = -1; } // Append current character to the comment switch (c) { case '\\': { skip(JT_ML_COMMENT); break; } default: { token_t tok = commit(JT_ML_COMMENT); if (tok == JT_ERROR) return tok; } } // Remember last character last = c; } return JT_UNKNOWN; } token_t Tokenizer::parse_number() { enum flags_t { F_NEGATIVE = 1 << 0, F_SIGN = 1 << 1, F_INTEGER = 1 << 2, F_INT = 1 << 3, F_DOT = 1 << 4, F_FRAC = 1 << 5, F_EXP = 1 << 6, F_ESIGN = 1 << 7, F_ENEGATIVE = 1 << 8 }; lsp_swchar_t c = lookup(); size_t flags = 0; int radix = 10; int digit = 0; ssize_t ivalue = 0; double ifrac = 0; double ifpow = 1.0; double rradix = 0.1; ssize_t iexp = 0; // Has sign? if (c == '-') { flags |= F_NEGATIVE | F_SIGN; c = commit_lookup(JT_UNKNOWN); } else if (c == '+') { flags |= F_SIGN; c = commit_lookup(JT_UNKNOWN); } // Has prefix ? if (c == '0') { c = commit_lookup(JT_UNKNOWN); switch (c) { case 'x': // hexadecimal case 'X': radix = 16; rradix = 0.0625; c = commit_lookup(JT_UNKNOWN); flags |= F_INTEGER; break; default: flags |= F_INT; break; } } else if ((c == 'I') || (c == 'N')) // Infinity or NaN? { LSPString tmp; tmp.swap(&sValue); token_t tok = parse_identifier(); if (!tmp.append(&sValue)) return set_error(STATUS_NO_MEM); sValue.swap(&tmp); if (tok != JT_DOUBLE) return enToken = JT_UNKNOWN; if (flags & F_NEGATIVE) fValue = -fValue; return tok; } // Read the integer part while (parse_digit(&digit, c, radix)) { ivalue = ivalue*radix + digit; flags |= F_INT; c = commit_lookup(JT_DECIMAL); } // Is integer only? if (flags & F_INTEGER) { if (!(flags & F_INT)) // There should be at least one integer character return enToken = JT_UNKNOWN; iValue = (flags & F_NEGATIVE) ? -ivalue : ivalue; return enToken = (radix != 16) ? JT_DECIMAL : JT_HEXADECIMAL; } // Has a fraction part? if (c == '.') { flags |= F_DOT; c = commit_lookup(JT_DOUBLE); while (parse_digit(&digit, c, radix)) { ifpow *= rradix; ifrac += digit * ifpow; flags |= F_FRAC; c = commit_lookup(JT_DOUBLE); } } // Is there at least INT or FRAC part defined? if ((flags & (F_INT | F_FRAC)) == 0) return enToken = JT_UNKNOWN; // Has an exponent part? if ((c == 'e') || (c == 'E')) { c = commit_lookup(JT_DOUBLE); // Has sign? if (c == '-') { flags |= F_ENEGATIVE | F_ESIGN; c = commit_lookup(JT_UNKNOWN); } else if (c == '+') { flags |= F_ESIGN; c = commit_lookup(JT_UNKNOWN); } // Parse exponent while (parse_digit(&digit, c, radix)) { iexp = iexp*radix + digit; flags |= F_EXP; c = commit_lookup(JT_DOUBLE); } // Analyze post-condition: if exponent sign is defined, // the exponent value also should be defined if ((flags & (F_ESIGN | F_EXP)) == F_ESIGN) return enToken = JT_UNKNOWN; else if (flags & F_ENEGATIVE) iexp = -iexp; } // Ensure that the next character is not an identifier character (ECMA) c = lookup(); if (c < 0) { if (c != (-STATUS_EOF)) return set_error(-c); } else if (is_identifier_start(c)) return set_error(STATUS_BAD_FORMAT); // Now analyze parsing state if ((flags & (F_INT | F_FRAC | F_EXP | F_DOT)) == F_INT) { iValue = (flags & F_NEGATIVE) ? -ivalue : ivalue; return enToken = JT_DECIMAL; } // Form the floating-point value double fv = (double(ivalue) + ifrac) * pow(radix, iexp); fValue = (flags & F_NEGATIVE) ? -fv : fv; return enToken = JT_DOUBLE; } token_t Tokenizer::get_token(bool get) { // Pre-checks if (!get) return enToken; else if (nUnget > 0) { --nUnget; return enToken; } // Skip whitespaces lsp_swchar_t c = skip_whitespace(); if (c < 0) { enToken = (c == -STATUS_EOF) ? JT_EOF : JT_ERROR; nError = -c; return enToken; } sValue.set_length(0); switch (c) { case '[': return commit(JT_LQ_BRACE); case ']': return commit(JT_RQ_BRACE); case '{': return commit(JT_LC_BRACE); case '}': return commit(JT_RC_BRACE); case ':': return commit(JT_COLON); case ',': return commit(JT_COMMA); case '\'': return parse_string(JT_SQ_STRING); case '\"': return parse_string(JT_DQ_STRING); case '/': c = commit_lookup(JT_UNKNOWN); if (c == '/') return parse_single_line_comment(); else if (c == '*') return parse_multiline_comment(); commit(JT_UNKNOWN); break; default: if (is_identifier_start(c)) return parse_identifier(); return parse_number(); } return enToken = JT_UNKNOWN; } } /* namespace json */ } /* namespace lsp */