File: Tokenizer.cpp.txt

package info (click to toggle)
lsp-plugins 1.2.5-1
links: PTS, VCS
area: main
in suites: bookworm
size: 91,856 kB
sloc: cpp: 427,831; xml: 57,779; makefile: 9,961; php: 1,005; sh: 18
file content (881 lines) | stat: -rw-r--r-- 27,387 bytes
parent folder | download | duplicates (2)
/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-runtime-lib
 * Created on: 14 Oct 2019
 *
 * lsp-runtime-lib is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-runtime-lib is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-runtime-lib. If not, see <https://www.gnu.org/licenses/>.
 */

#include <lsp-plug.in/stdlib/math.h>
#include <lsp-plug.in/fmt/json/Tokenizer.h>
#include <wctype.h>

#define IO_BUF_SIZE     0x1000

namespace lsp
{
    namespace json
    {
        /**
         * Words that can not be used as identifiers.
         *
         * The table MUST be kept sorted in ascending ASCII (strcmp) order:
         * is_reserved_word() performs a binary search over it, and a single
         * misplaced entry makes that entry (and possibly others) unreachable.
         */
        static const char *ecma_reserved[] =
        {
            "Infinity",
            "abstract",
            "arguments",
            "await",
            "boolean",
            "break",
            "byte",
            "case",
            "catch",
            "char",
            "class",
            "const",
            "continue",
            "debugger",
            "default",
            "delete",
            "do",
            "double",
            "else",
            "enum",
            "eval",
            "export",
            "extends",
            "false",
            "final",
            "finally",
            "float",
            "for",
            "function",
            "goto",
            "if",
            "implements",
            "import",
            "in",
            "instanceof",
            "int",
            "interface",
            "let",
            "long",
            "native",
            "new",
            "null",
            "package",
            "private",
            "protected",
            "public",
            "return",
            "short",
            "static",
            "super",
            "switch",
            "synchronized",
            "this",
            "throw",
            "transient",
            "true",
            "try",
            "typeof",
            "var",
            "void",
            "volatile",
            "while",
            "with",
            "yield"
        };
        
        /**
         * Create a tokenizer that reads characters from the passed input sequence.
         * The read-ahead buffer is not allocated here; fill_buffer() allocates it
         * on first use.
         *
         * @param in input character sequence; the destructor neither closes nor
         *           deletes it
         */
        Tokenizer::Tokenizer(io::IInSequence *in)
        {
            // Input source and read-ahead buffer
            pIn         = in;
            pIoBuf      = NULL;
            nIoSize     = 0;
            nIoOffset   = 0;
            bEscaping   = false;

            // Lookahead character and current token state
            cCurrent    = -1;       // negative means "no lookahead character"
            enToken     = JT_UNKNOWN;
            nError      = STATUS_OK;
            nUnget      = 0;

            // Values of the last parsed numeric token
            iValue      = 0;
            fValue      = 0;
        }
        
        /**
         * Release the read-ahead buffer and forget the input sequence pointer.
         */
        Tokenizer::~Tokenizer()
        {
            // free(NULL) is a no-op, so the buffer can be released unconditionally
            ::free(pIoBuf);
            pIoBuf      = NULL;
            pIn         = NULL;
        }

        /**
         * Remember the error code and switch the current token to JT_ERROR.
         *
         * @param code status code to store in nError
         * @return JT_ERROR
         */
        token_t Tokenizer::set_error(status_t code)
        {
            nError          = code;
            enToken         = JT_ERROR;
            return enToken;
        }

        /**
         * Advance the lookahead character past any whitespace.
         *
         * @return the first non-whitespace character, or a negative value
         *         (negated status code returned by get_char()) on read failure
         */
        lsp_swchar_t Tokenizer::skip_whitespace()
        {
            // Fetch the lookahead character if there is none
            if (cCurrent < 0)
                cCurrent = get_char();

            // A negative value is an error code, not a character: it must never be
            // passed to the isw*() classifiers, whose argument has to be a valid
            // wide character (or WEOF)
            while ((cCurrent >= 0) && (::iswspace(cCurrent) || ::iswblank(cCurrent)))
                cCurrent = get_char();

            return cCurrent;
        }

        /**
         * Refill the read-ahead buffer from the input sequence.
         *
         * Unread characters are moved to the beginning of the buffer, then the
         * free space is filled by reading from pIn until the buffer is full or
         * the read reports a negative result.
         *
         * @return STATUS_OK if the buffer holds at least one unread character or
         *         is full, STATUS_NO_MEM if the buffer could not be allocated,
         *         otherwise the negated result of the failed read
         */
        status_t Tokenizer::fill_buffer()
        {
            // Compact the buffer: move the unread part to the very beginning
            if (nIoOffset > 0)
            {
                const size_t unread = nIoSize - nIoOffset;
                if (unread > 0)
                    ::memmove(pIoBuf, &pIoBuf[nIoOffset], unread * sizeof(lsp_wchar_t));
                nIoOffset   = 0;
                nIoSize     = unread;
            }

            // Nothing to do when the buffer is already full
            ssize_t space = IO_BUF_SIZE - nIoSize;
            if (space <= 0)
                return STATUS_OK;

            // The buffer is allocated on first use
            if (pIoBuf == NULL)
            {
                void *mem   = ::malloc(IO_BUF_SIZE * sizeof(lsp_wchar_t));
                if (mem == NULL)
                    return STATUS_NO_MEM;
                pIoBuf      = static_cast<lsp_wchar_t *>(mem);
            }

            // Keep reading until the free space is exhausted
            for ( ; space > 0; )
            {
                ssize_t count   = pIn->read(&pIoBuf[nIoSize], space);
                if (count < 0)
                {
                    // A failed read is not an error while unread data remains
                    return (nIoSize > nIoOffset) ? STATUS_OK : -count;
                }

                space          -= count;
                nIoSize        += count;
            }

            return STATUS_OK;
        }

        /**
         * Read the next character from the read-ahead buffer.
         *
         * When bEscaping is set, characters are returned as is. Otherwise the
         * sequences "\xHH" and "\uHHHH" (the letter may be upper case) are decoded
         * into a single character; two consecutive "\uHHHH" sequences that form a
         * UTF-16 surrogate pair are combined into one UTF-32 code point.
         * A backslash that does not start a complete, valid sequence is returned
         * as a regular character. An unpaired or malformed surrogate yields 0xfffd.
         *
         * @return the character (non-negative), or a negated status code if
         *         fill_buffer() failed
         */
        lsp_swchar_t Tokenizer::get_char()
        {
            status_t res;
            int digit       = 0;
            size_t tail     = nIoSize - nIoOffset;

            // Escaping mode: return raw characters without decoding
            if (bEscaping)
            {
                if (tail > 0)
                    return pIoBuf[nIoOffset++];
                if ((res = fill_buffer()) != STATUS_OK)
                    return -res;
                return pIoBuf[nIoOffset++];
            }

            // There should be at least 2 characters to look up the '\u' and '\x' sequences
            if (tail < 2)
            {
                if ((res = fill_buffer()) != STATUS_OK)
                    return -res;
                tail    = nIoSize - nIoOffset;
                if (tail < 2)
                    return pIoBuf[nIoOffset++];
            }

            // Anything that does not start with a backslash is returned as is
            if (pIoBuf[nIoOffset] != '\\')
                return pIoBuf[nIoOffset++];

            // Got "\", look at the next character
            lsp_swchar_t c  = pIoBuf[nIoOffset + 1];
            size_t off;

            if ((c == 'x') || (c == 'X'))
            {
                // Got "\x" sequence, should be at least 4 chars for '\x12' sequence
                if (tail < 4)
                {
                    if ((res = fill_buffer()) != STATUS_OK)
                        return -res;
                    tail    = nIoSize - nIoOffset;
                    if (tail < 4)
                        return pIoBuf[nIoOffset++];
                }

                // fill_buffer() may have moved the data, so the position of the
                // first digit is always computed relative to the current offset
                off             = nIoOffset + 2;
                c               = 0;
                for (size_t i=0; i<2; ++i)
                {
                    if (!parse_digit(&digit, pIoBuf[off++], 16))
                        return pIoBuf[nIoOffset++];
                    c   = (c << 4) + digit;
                }

                // Commit offset
                nIoOffset       = off;
                return c;
            }
            else if ((c == 'u') || (c == 'U'))
            {
                // Got "\u" sequence, should be at least 6 chars for '\u1234' sequence
                if (tail < 6)
                {
                    if ((res = fill_buffer()) != STATUS_OK)
                        return -res;
                    tail = nIoSize - nIoOffset;
                    if (tail < 6)
                        return pIoBuf[nIoOffset++];
                }

                // Read UTF-16 code unit; skip the "\u" prefix relative to the
                // current offset since fill_buffer() may have moved the data
                off             = nIoOffset + 2;
                c               = 0;
                for (size_t i=0; i<4; ++i)
                {
                    if (!parse_digit(&digit, pIoBuf[off++], 16))
                        return pIoBuf[nIoOffset++];
                    c   = (c << 4) + digit;
                }

                // All is OK, commit position
                nIoOffset       = off;
            }
            else
                return pIoBuf[nIoOffset++];

            // Not a surrogate (surrogates occupy 0xd800..0xdfff)? Return as is
            if ((c < 0xd800) || (c >= 0xe000))
                return c;

            // The character pretends to be a part of a surrogate pair,
            // look up the second "\uHHHH" sequence
            tail    = nIoSize - nIoOffset;
            if (tail < 6)  // 6 characters for "\u1234"
            {
                if ((res = fill_buffer()) != STATUS_OK)
                    return -res;
                tail = nIoSize - nIoOffset;
                if (tail < 6)
                    return 0xfffd;          // Invalid character in incomplete surrogate pair
            }
            off     = nIoOffset;

            // Check that the second sequence is well-formed
            lsp_swchar_t c2 = pIoBuf[off++];
            if (c2 != '\\')
                return 0xfffd;
            c2 = pIoBuf[off++];
            if ((c2 != 'u') && (c2 != 'U'))
                return 0xfffd;
            c2 = 0;
            for (size_t i=0; i<4; ++i)
            {
                if (!parse_digit(&digit, pIoBuf[off++], 16))
                    return 0xfffd;
                c2  = (c2 << 4) + digit;    // accumulate into the second code unit
            }

            // Finally, convert the surrogate pair into a UTF-32 code point
            if ((c & 0xfc00) == 0xd800) // surrogate high
            {
                if ((c2 & 0xfc00) != 0xdc00) // not surrogate low?
                    return 0xfffd;
                c   = 0x10000 + (((c & 0x3ff) << 10) | (c2 & 0x3ff));
            }
            else // Surrogate low
            {
                if ((c2 & 0xfc00) != 0xd800) // not surrogate high?
                    return 0xfffd;
                c   = 0x10000 + (((c2 & 0x3ff) << 10) | (c & 0x3ff));
            }

            // Commit position of the second sequence
            nIoOffset       = off;

            // Return UTF-32 character
            return c;
        }

        /**
         * Peek the lookahead character, reading it with get_char() if there is none.
         *
         * @return the lookahead character or a negative value returned by get_char()
         */
        lsp_swchar_t Tokenizer::lookup()
        {
            if (cCurrent >= 0)
                return cCurrent;

            cCurrent = get_char();
            return cCurrent;
        }

        /**
         * Append the lookahead character to the token text and consume it.
         *
         * @param token token type to set as current on success
         * @return the passed token, or JT_ERROR (STATUS_BAD_STATE when there is no
         *         lookahead character, STATUS_NO_MEM when the append fails)
         */
        token_t Tokenizer::commit(token_t token)
        {
            if (cCurrent >= 0)
            {
                if (sValue.append(cCurrent))
                {
                    // The character has been stored: drop the lookahead
                    cCurrent    = -1;
                    enToken     = token;
                    return token;
                }

                return set_error(STATUS_NO_MEM);
            }

            return set_error(STATUS_BAD_STATE);
        }

        /**
         * Commit the lookahead character and peek the next one.
         *
         * @param token token type passed to commit()
         * @return the next lookahead character (result of lookup()), or -1 if
         *         commit() failed
         */
        lsp_swchar_t Tokenizer::commit_lookup(token_t token)
        {
            if (commit(token) == JT_ERROR)
                return -1;

            return lookup();
        }

        /**
         * Check whether the character may start an identifier: an upper- or
         * lower-case letter, '_', '$' or a backslash.
         *
         * @param ch character to test
         * @return true if the character is accepted
         */
        bool Tokenizer::is_identifier_start(lsp_wchar_t ch)
        {
            return ::iswupper(ch)
                || ::iswlower(ch)
                || (ch == '_')
                || (ch == '$')
                || (ch == '\\');
        }

        /**
         * Check whether the character may appear inside an identifier: an upper-
         * or lower-case letter, a digit, '_' or '$'.
         *
         * @param ch character to test
         * @return true if the character is accepted
         */
        bool Tokenizer::is_identifier(lsp_wchar_t ch)
        {
            return ::iswupper(ch)
                || ::iswlower(ch)
                || ::iswdigit(ch)
                || (ch == '_')
                || (ch == '$');
        }

        /**
         * Check whether the text is one of the words listed in ecma_reserved.
         *
         * Performs a binary search, which requires the ecma_reserved table to be
         * ordered consistently with LSPString::compare_to_ascii().
         *
         * @param text text to look up
         * @return true if the text matches a table entry
         */
        bool Tokenizer::is_reserved_word(const LSPString *text)
        {
            ssize_t lo = 0;
            ssize_t hi = sizeof(ecma_reserved) / sizeof (const char *) - 1;

            while (lo <= hi)
            {
                const ssize_t mid   = lo + ((hi - lo) >> 1);
                const int diff      = text->compare_to_ascii(ecma_reserved[mid]);

                if (diff == 0)
                    return true;

                // Narrow the range to the half that may contain the text
                if (diff < 0)
                    hi  = mid - 1;
                else
                    lo  = mid + 1;
            }

            return false;
        }

        /**
         * Check whether the text can be used as an identifier: it is non-empty,
         * starts with an identifier start character, continues with identifier
         * characters only and is not a reserved word.
         *
         * @param text text to validate
         * @return true if the text is a valid identifier
         */
        bool Tokenizer::is_valid_identifier(const LSPString *text)
        {
            const size_t count = text->length();
            if (count == 0)
                return false;

            // The first character has its own set of allowed values
            if (!is_identifier_start(text->char_at(0)))
                return false;

            size_t pos = 1;
            while (pos < count)
            {
                if (!is_identifier(text->char_at(pos)))
                    return false;
                ++pos;
            }

            return !is_reserved_word(text);
        }

        /**
         * Convert a character to the digit value in the given radix.
         * Recognizes '0'..'9', 'a'..'f' and 'A'..'F'.
         *
         * @param digit pointer to store the digit value, written only on success
         * @param ch character to convert
         * @param radix radix; values not less than it are rejected
         * @return true if the character is a digit of the radix
         */
        bool Tokenizer::parse_digit(int *digit, lsp_wchar_t ch, int radix)
        {
            int value = -1; // negative means "not a digit character"

            if ((ch >= '0') && (ch <= '9'))
                value = ch - '0';
            else if ((ch >= 'a') && (ch <= 'f'))
                value = ch - 'a' + 10;
            else if ((ch >= 'A') && (ch <= 'F'))
                value = ch - 'A' + 10;

            if ((value < 0) || (value >= radix))
                return false;

            *digit  = value;
            return true;
        }

        /**
         * Parse a quoted string literal into sValue.
         *
         * Called by get_token() when the lookahead character is a quote.
         * The closing quote is the double quote for JT_DQ_STRING and the single
         * quote for JT_SQ_STRING; the other kind of quote is stored as a regular
         * character.
         *
         * NOTE(review): skip() is declared outside of this file; it is presumably
         * discarding the lookahead character and setting the token type - confirm.
         *
         * @param type string token type: JT_SQ_STRING or JT_DQ_STRING
         * @return the result of skip(type) when the closing quote is met, or
         *         JT_ERROR: the negated read result on a read failure (any negative
         *         lookahead, end of input included), STATUS_BAD_TOKEN on an
         *         unescaped '\n', STATUS_NO_MEM on an append failure
         */
        token_t Tokenizer::parse_string(token_t type)
        {
            // Drop the opening quote
            skip(type);

            // bEscaping is also read by get_char(): while it is set, get_char()
            // returns raw characters without decoding "\x" and "\u" sequences
            bEscaping   = false;

            // Parse string
            while (true)
            {
                // Read character
                lsp_swchar_t c = lookup();
                if (c < 0)
                {
                    // Read failure inside of the string literal
                    bEscaping   = false;
                    return set_error(-c);
                }

                if (bEscaping)
                {
                    // The previous character was a backslash
                    switch (c)
                    {
                        // Escaped characters: ' " \ b f n r t v
                        case 'b': c = '\b'; break;
                        case 'f': c = '\f'; break;
                        case 'n': c = '\n'; break;
                        case 'r': c = '\r'; break;
                        case 't': c = '\t'; break;
                        case 'v': c = '\v'; break;
                        case '0': c = '\0'; break;
                        case '\n':
                            // Backslash before the line feed: nothing is stored,
                            // a carriage return right after it is dropped too
                            skip(type); // Skip end of line
                            c = lookup();
                            if (c < 0)
                            {
                                // End of input is not reported here: the next
                                // iteration of the loop will meet it again
                                if (c != -STATUS_EOF)
                                {
                                    bEscaping   = false;
                                    return set_error(-c);
                                }
                            }
                            else if (c == '\r')
                                skip(type);
                            c = -1; // Nothing to append
                            break;

                        case '\r':
                        case 0x2028:
                        case 0x2029:
                            // Other escaped line terminators are dropped
                            skip(type);
                            c = -1; // Line terminator
                            break;

                        default: // Any other characters just omit the protector character in ECMA script
                            break;
                    }

                    // Reset escaping flag
                    bEscaping   = false;

                    // Append character (if it is present)
                    if (c >= 0)
                    {
                        if (!sValue.append(c))
                        {
                            bEscaping = false;
                            return set_error(STATUS_NO_MEM);
                        }
                        // The (possibly translated) character has been stored
                        // directly, so the lookahead is dropped instead of committed
                        skip(type);
                    }
                }
                else if (c == '\\')
                {
                    // Start of an escape sequence: drop the backslash and read
                    // the following character without decoding
                    skip(type);
                    bEscaping   = true;
                }
                else
                {
                    switch (c)
                    {
                        case '\n':
                            // Unescaped line feed is not allowed inside of a string
                            bEscaping = false;
                            return set_error(STATUS_BAD_TOKEN);
                        case '\"':
                            bEscaping = false;
                            if (type == JT_DQ_STRING)
                                return skip(type); // Closing quote
                            break;
                        case '\'':
                            bEscaping = false;
                            if (type == JT_SQ_STRING)
                                return skip(type); // Closing quote
                            break;
                        default:
                            break;
                    }

                    // Regular character: store it into the string value
                    if ((type = commit(type)) == JT_ERROR)
                    {
                        bEscaping = false;
                        return type;
                    }
                }
            }

            // Not reachable: the loop above is left by 'return' statements only
            bEscaping = false;
            return enToken = type;
        }

        /**
         * Parse an identifier-like token and classify it.
         *
         * Characters accepted by is_identifier() are appended to sValue, then the
         * collected text is compared with the known words.
         *
         * @return JT_TRUE, JT_FALSE, JT_NULL, JT_DOUBLE (for "NaN" and "Infinity",
         *         fValue is set), JT_RESERVED, JT_IDENTIFIER, or JT_ERROR on a
         *         read failure other than end of input or on a commit failure
         */
        token_t Tokenizer::parse_identifier()
        {
            // Collect all identifier characters
            for (;;)
            {
                const lsp_swchar_t ch = lookup();
                if (ch < 0)
                {
                    // End of input just terminates the identifier
                    if (ch == -STATUS_EOF)
                        break;
                    return set_error(-ch);
                }

                if (!is_identifier(ch))
                    break;
                if (commit(JT_IDENTIFIER) == JT_ERROR)
                    return JT_ERROR;
            }

            // Literals
            if (sValue.equals_ascii("true"))
                return enToken = JT_TRUE;
            if (sValue.equals_ascii("false"))
                return enToken = JT_FALSE;
            if (sValue.equals_ascii("null"))
                return enToken = JT_NULL;

            // Special floating-point values
            if (sValue.equals_ascii("NaN"))
            {
                fValue  = NAN;
                enToken = JT_DOUBLE;
                return enToken;
            }
            if (sValue.equals_ascii("Infinity"))
            {
                fValue  = INFINITY;
                enToken = JT_DOUBLE;
                return enToken;
            }

            // Reserved word or a regular identifier
            enToken = (is_reserved_word(&sValue)) ? JT_RESERVED : JT_IDENTIFIER;
            return enToken;
        }

        /**
         * Parse a single-line comment: the text up to the '\n' character or the
         * end of input is collected into sValue.
         *
         * @return JT_SL_COMMENT at the end of input, the result of
         *         skip(JT_SL_COMMENT) at the line feed, or JT_ERROR on a read or
         *         commit failure
         */
        token_t Tokenizer::parse_single_line_comment()
        {
            // Forget the characters committed so far and drop the lookahead
            sValue.set_length(0);
            skip(JT_SL_COMMENT);

            for (;;)
            {
                const lsp_swchar_t ch = lookup();
                if (ch < 0)
                {
                    // End of input terminates the comment, other failures are errors
                    if (ch == -STATUS_EOF)
                        return JT_SL_COMMENT;
                    return set_error(-ch);
                }

                // Line feed terminates the comment
                if (ch == '\n')
                    return skip(JT_SL_COMMENT);

                // Any other character belongs to the comment
                if (commit(JT_SL_COMMENT) == JT_ERROR)
                    return JT_ERROR;
            }
        }

        /**
         * Parse a multi-line comment: the text up to the closing "*" "/" pair is
         * collected into sValue. Backslash characters are not stored, and carriage
         * returns that follow a line feed are dropped.
         *
         * @return the result of skip(JT_ML_COMMENT) when the comment is closed, or
         *         JT_ERROR on any read failure (end of input included) or on a
         *         commit failure
         */
        token_t Tokenizer::parse_multiline_comment()
        {
            // Forget the characters committed so far and drop the lookahead.
            // The token type is JT_ML_COMMENT, consistent with all other calls
            // in this function
            sValue.set_length(0);
            skip(JT_ML_COMMENT);

            lsp_swchar_t last = -1;
            while (true)
            {
                // Read character
                lsp_swchar_t c = lookup();
                if (c < 0)
                    return set_error(-c);

                // Check state
                if (last == '*')
                {
                    if (c == '/')
                    {
                        // Remove last stored character (asterisk)
                        sValue.remove_last();
                        return skip(JT_ML_COMMENT);
                    }
                }
                else if (last == '\n')
                {
                    if (c == '\r')
                    {
                        // Drop the carriage return, 'last' is intentionally kept
                        skip(JT_ML_COMMENT);
                        continue;
                    }
                    last = -1;
                }

                // Append current character to the comment, except the backslash
                if (c == '\\')
                    skip(JT_ML_COMMENT);
                else if (commit(JT_ML_COMMENT) == JT_ERROR)
                    return JT_ERROR;

                // Remember last character
                last = c;
            }
        }

        /**
         * Parse a numeric token starting at the lookahead character.
         *
         * Accepted forms: optional sign, then either a "0x"/"0X" prefixed
         * hexadecimal integer, or a decimal number with optional fraction and
         * exponent parts, or the words "Infinity" and "NaN".
         * The consumed characters are appended to sValue.
         *
         * @return JT_DECIMAL or JT_HEXADECIMAL (iValue is set), JT_DOUBLE (fValue
         *         is set), JT_UNKNOWN if the text is not a number, or JT_ERROR on
         *         a failure (STATUS_BAD_FORMAT if the number is immediately
         *         followed by an identifier start character)
         */
        token_t Tokenizer::parse_number()
        {
            enum flags_t
            {
                F_NEGATIVE      = 1 << 0,   // The number has the '-' sign
                F_SIGN          = 1 << 1,   // The number has an explicit sign
                F_INTEGER       = 1 << 2,   // Only the integer part is allowed (hexadecimal)
                F_INT           = 1 << 3,   // At least one digit of the integer part is present
                F_DOT           = 1 << 4,   // The decimal point is present
                F_FRAC          = 1 << 5,   // At least one digit of the fraction part is present
                F_EXP           = 1 << 6,   // At least one digit of the exponent is present
                F_ESIGN         = 1 << 7,   // The exponent has an explicit sign
                F_ENEGATIVE     = 1 << 8    // The exponent has the '-' sign
            };

            lsp_swchar_t c  = lookup();
            size_t flags    = 0;
            int radix       = 10;
            int digit       = 0;
            ssize_t ivalue  = 0;
            double ifrac    = 0;
            double ifpow    = 1.0;
            double rradix   = 0.1;
            ssize_t iexp    = 0;

            // Has sign?
            if (c == '-')
            {
                flags      |= F_NEGATIVE | F_SIGN;
                c           = commit_lookup(JT_UNKNOWN);
            }
            else if (c == '+')
            {
                flags      |= F_SIGN;
                c           = commit_lookup(JT_UNKNOWN);
            }

            // Has prefix ?
            if (c == '0')
            {
                c           = commit_lookup(JT_UNKNOWN);
                switch (c)
                {
                    case 'x': // hexadecimal
                    case 'X':
                        radix       = 16;
                        rradix      = 0.0625;
                        c           = commit_lookup(JT_UNKNOWN);
                        flags      |= F_INTEGER;
                        break;
                    default:
                        // The leading zero is a digit of the integer part
                        flags      |= F_INT;
                        break;
                }
            }
            else if ((c == 'I') || (c == 'N')) // Infinity or NaN?
            {
                // Parse the word into an empty sValue, then put the already
                // committed sign back in front of it
                LSPString tmp;
                tmp.swap(&sValue);
                token_t tok = parse_identifier();
                if (!tmp.append(&sValue))
                    return set_error(STATUS_NO_MEM);
                sValue.swap(&tmp);
                if (tok != JT_DOUBLE)
                    return enToken = JT_UNKNOWN;
                if (flags & F_NEGATIVE)
                    fValue  = -fValue;
                return tok;
            }

            // Read the integer part
            while (parse_digit(&digit, c, radix))
            {
                ivalue      = ivalue*radix + digit;
                flags      |= F_INT;
                c           = commit_lookup(JT_DECIMAL);
            }

            // Is integer only?
            if (flags & F_INTEGER)
            {
                if (!(flags & F_INT)) // There should be at least one integer character
                    return enToken = JT_UNKNOWN;
                iValue      = (flags & F_NEGATIVE) ? -ivalue : ivalue;
                return enToken  = (radix != 16) ? JT_DECIMAL : JT_HEXADECIMAL;
            }

            // Has a fraction part?
            if (c == '.')
            {
                flags      |= F_DOT;
                c           = commit_lookup(JT_DOUBLE);

                while (parse_digit(&digit, c, radix))
                {
                    ifpow      *= rradix;
                    ifrac      += digit * ifpow;
                    flags      |= F_FRAC;
                    c           = commit_lookup(JT_DOUBLE);
                }
            }

            // Is there at least INT or FRAC part defined?
            if ((flags & (F_INT | F_FRAC)) == 0)
                return enToken = JT_UNKNOWN;

            // Has an exponent part?
            if ((c == 'e') || (c == 'E'))
            {
                c           = commit_lookup(JT_DOUBLE);

                // Has sign?
                if (c == '-')
                {
                    flags      |= F_ENEGATIVE | F_ESIGN;
                    c           = commit_lookup(JT_UNKNOWN);
                }
                else if (c == '+')
                {
                    flags      |= F_ESIGN;
                    c           = commit_lookup(JT_UNKNOWN);
                }

                // Parse exponent
                while (parse_digit(&digit, c, radix))
                {
                    iexp        = iexp*radix + digit;
                    flags      |= F_EXP;
                    c           = commit_lookup(JT_DOUBLE);
                }

                // Analyze post-condition: the exponent marker (with or without
                // a sign) must be followed by at least one digit, otherwise
                // text like "1e" would be accepted as a number
                if (!(flags & F_EXP))
                    return enToken  = JT_UNKNOWN;
                if (flags & F_ENEGATIVE)
                    iexp        = -iexp;
            }

            // Ensure that the next character is not an identifier character (ECMA)
            c = lookup();
            if (c < 0)
            {
                if (c != (-STATUS_EOF))
                    return set_error(-c);
            }
            else if (is_identifier_start(c))
                return set_error(STATUS_BAD_FORMAT);

            // Now analyze parsing state: digits only means an integer value
            if ((flags & (F_INT | F_FRAC | F_EXP | F_DOT)) == F_INT)
            {
                iValue      = (flags & F_NEGATIVE) ? -ivalue : ivalue;
                return enToken = JT_DECIMAL;
            }

            // Form the floating-point value
            double fv       = (double(ivalue) + ifrac) * pow(radix, iexp);
            fValue          = (flags & F_NEGATIVE) ? -fv : fv;
            return enToken  = JT_DOUBLE;
        }

        /**
         * Get the current token or read the next one.
         *
         * @param get false to return the current token without reading; true to
         *            read the next token (or to consume one pending "unget")
         * @return the token type; JT_EOF at the end of input, JT_ERROR on a
         *         failure (nError holds the status code), JT_UNKNOWN for
         *         unrecognized input
         */
        token_t Tokenizer::get_token(bool get)
        {
            // No reading has been requested
            if (!get)
                return enToken;

            // A previously returned token is pending
            if (nUnget > 0)
            {
                --nUnget;
                return enToken;
            }

            // Find the first significant character
            const lsp_swchar_t first = skip_whitespace();
            if (first < 0)
            {
                nError  = -first;
                enToken = (first == -STATUS_EOF) ? JT_EOF : JT_ERROR;
                return enToken;
            }

            // Start collecting the text of a new token
            sValue.set_length(0);

            switch (first)
            {
                // Single-character tokens
                case '[': return commit(JT_LQ_BRACE);
                case ']': return commit(JT_RQ_BRACE);
                case '{': return commit(JT_LC_BRACE);
                case '}': return commit(JT_RC_BRACE);
                case ':': return commit(JT_COLON);
                case ',': return commit(JT_COMMA);

                // String literals
                case '\'': return parse_string(JT_SQ_STRING);
                case '\"': return parse_string(JT_DQ_STRING);

                // Comments
                case '/':
                {
                    const lsp_swchar_t next = commit_lookup(JT_UNKNOWN);
                    if (next == '/')
                        return parse_single_line_comment();
                    if (next == '*')
                        return parse_multiline_comment();

                    // Not a comment: commit the following character as well
                    commit(JT_UNKNOWN);
                    return enToken = JT_UNKNOWN;
                }

                default:
                    break;
            }

            // Everything else is either an identifier-like word or a number
            if (is_identifier_start(first))
                return parse_identifier();
            return parse_number();
        }
    
    } /* namespace json */
} /* namespace lsp */