File: lexer.c

package info (click to toggle)
ruby-liquid-c 4.2.0-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 544 kB
  • sloc: ansic: 3,866; ruby: 1,135; makefile: 7
file content (184 lines) | stat: -rw-r--r-- 5,178 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#include "liquid.h"
#include "lexer.h"
#include "usage.h"
#include <stdio.h>

// Human-readable names for each token type, indexed by the TOKEN_* enum
// value (array is sized TOKEN_END, so every enum member up to TOKEN_END
// has a slot). Presumably used for error messages / debugging output --
// the consumers are not visible in this file.
const char *symbol_names[TOKEN_END] = {
    [TOKEN_NONE] = "none",
    [TOKEN_COMPARISON] = "comparison",
    [TOKEN_STRING] = "string",
    [TOKEN_NUMBER] = "number",
    [TOKEN_IDENTIFIER] = "id",
    [TOKEN_DOTDOT] = "dotdot",
    [TOKEN_EOS] = "end_of_string",
    [TOKEN_PIPE] = "pipe",
    [TOKEN_DOT] = "dot",
    [TOKEN_COLON] = "colon",
    [TOKEN_COMMA] = "comma",
    [TOKEN_OPEN_SQUARE] = "open_square",
    [TOKEN_CLOSE_SQUARE] = "close_square",
    [TOKEN_OPEN_ROUND] = "open_round",
    [TOKEN_CLOSE_ROUND] = "close_round",
    [TOKEN_QUESTION] = "question",
    [TOKEN_DASH] = "dash"
};

// Returns nonzero when c may appear inside an identifier:
// alphanumeric characters plus '_' and '-'.
inline static int is_identifier(char c)
{
    if (c == '_' || c == '-')
        return 1;
    return ISALNUM(c);
}

// Returns nonzero when c is one of the single-character tokens
// recognized by the lexer: | . : , [ ] ( ) ? -
inline static int is_special(char c)
{
    // The '\0' guard is required: strchr() would otherwise match the
    // terminator of the literal and report NUL as special.
    return c != '\0' && strchr("|.:,[]()?-", c) != NULL;
}

// Returns a pointer to the character after the end of the match.
inline static const char *prefix_end(const char *cur, const char *end, const char *pattern)
{
    size_t pattern_len = strlen(pattern);

    if (pattern_len > (size_t)(end - cur)) return NULL;
    if (memcmp(cur, pattern, pattern_len) != 0) return NULL;

    return cur + pattern_len;
}

// Searches (cur, end) -- i.e. skipping *cur itself -- for target.
// Returns a pointer just past the first occurrence, or NULL when the
// target does not occur before end.
inline static const char *scan_past(const char *cur, const char *end, char target)
{
    size_t span = (size_t)(end - cur - 1);
    const char *hit = memchr(cur + 1, target, span);

    if (hit == NULL)
        return NULL;
    return hit + 1;
}

// Helper for lex_one() only (undefined again at the bottom of the file).
// Fills the in-scope `token` with type t and the n-character span that
// starts at the in-scope `str`, then returns the end of that span --
// which lex_one's contract says is the start of the next token.
// Relies on `str` and `token` being visible at the expansion site.
#define RETURN_TOKEN(t, n) { \
    const char *tok_end = str + (n); \
    token->type = (t); \
    token->val = str; \
    return (token->val_end = tok_end); \
}

// Reads one token from start, and fills it into the token argument.
// Returns the start of the next token if any, otherwise the end of the string.
// Raises (via the Ruby C API) on characters that cannot begin any token.
const char *lex_one(const char *start, const char *end, lexer_token_t *token)
{
    // str references the start of the token, after whitespace is skipped.
    // cur references the currently processing character during iterative lexing.
    const char *str = start, *cur;

    // Skip leading whitespace.
    while (str < end && ISSPACE(*str)) ++str;

    // Reset the output; note token->type is only assigned when a token is
    // actually produced (inside RETURN_TOKEN).
    token->val = token->val_end = NULL;
    token->flags = 0;

    // Only whitespace remained: report end of input.
    if (str >= end) return str;

    char c = *str;  // First character of the token.
    char cn = '\0'; // Second character if available, for lookahead.
    if (str + 1 < end) cn = str[1];

    // Comparison operators and the ".." range operator.
    switch (c) {
        case '<':
            // "<>", "<=" or a lone "<".
            RETURN_TOKEN(TOKEN_COMPARISON, cn == '>' || cn == '=' ? 2 : 1);
        case '>':
            // ">=" or a lone ">".
            RETURN_TOKEN(TOKEN_COMPARISON, cn == '=' ? 2 : 1);
        case '=':
        case '!':
            // Only "==" and "!=" form tokens; a lone '=' or '!' falls
            // through and is rejected as an unexpected character below.
            if (cn == '=') RETURN_TOKEN(TOKEN_COMPARISON, 2);
            break;
        case '.':
            // ".."; a single '.' is handled by is_special() further down.
            if (cn == '.') RETURN_TOKEN(TOKEN_DOTDOT, 2);
            break;
    }

    // The keyword "contains" lexes as a comparison operator.
    // NOTE(review): this matches the bare prefix, with no check that the
    // next character ends the word -- presumably mirrors the upstream
    // Ruby lexer's behavior; confirm before changing.
    if ((cur = prefix_end(str, end, "contains")))
        RETURN_TOKEN(TOKEN_COMPARISON, cur - str);

    // Single- or double-quoted string, quotes included in the token value.
    if (c == '\'' || c == '"') {
        cur = scan_past(str, end, c);

        if (cur) {
            // Quote was properly terminated.
            RETURN_TOKEN(TOKEN_STRING, cur - str);
        }
        // Unterminated quote: fall through; the quote character is
        // ultimately rejected as unexpected below.
    }

    // Instrument for bug: https://github.com/Shopify/liquid-c/pull/120
    if (c == '-' && str + 1 < end && str[1] == '.') {
        usage_increment("liquid_c_negative_float_without_integer");
    }

    // Number: optional leading '-', digits, at most one embedded '.'.
    if (ISDIGIT(c) || c == '-') {
        int has_dot = 0;
        cur = str;
        // Consume digits and the first '.' encountered; a second '.'
        // (e.g. in "1..2") stops the scan so ".." lexes separately.
        while (++cur < end) {
            if (!has_dot && *cur == '.') {
                has_dot = 1;
            } else if (!ISDIGIT(*cur)) {
                break;
            }
        }
        cur--; // Point to last digit (or dot).

        if (*cur == '.') {
            cur--; // Ignore any trailing dot.
            has_dot = 0;
        }
        // A bare '-' with no digits is not a number; leave it for
        // is_special() below, which lexes it as TOKEN_DASH.
        if (*cur != '-') {
            if (has_dot) token->flags |= TOKEN_FLOAT_NUMBER;
            RETURN_TOKEN(TOKEN_NUMBER, cur + 1 - str);
        }
    }

    // Identifier: leading letter or '_', then letters/digits/'_'/'-',
    // plus one optional trailing '?'.
    if (ISALPHA(c) || c == '_') {
        cur = str;
        while (++cur < end && is_identifier(*cur)) {}
        if (cur < end && *cur == '?') cur++;
        RETURN_TOKEN(TOKEN_IDENTIFIER, cur - str);
    }

    // Single-character tokens: the character itself is used as the
    // token type value.
    if (is_special(c)) RETURN_TOKEN(c, 1);

    // No token matched: report the offending character. For a readable
    // message, decode the UTF-8 sequence length from the leading byte.
    long remaining_str_len = end - str;
    int char_len = 0;

    // read multibyte UTF-8 character
    if ((c & 0x80) == 0) {
        // 1-byte character
        char_len = 1;
    } else if ((c & 0xE0) == 0xC0) {
        // 2-byte character
        if (remaining_str_len >= 2) {
            char_len = 2;
        }
    } else if ((c & 0xF0) == 0xE0) {
        // 3-byte character
        if (remaining_str_len >= 3) {
            char_len = 3;
        }
    } else if ((c & 0xF8) == 0xF0) {
        // 4-byte character
        if (remaining_str_len >= 4) {
            char_len = 4;
        }
    } else {
        // this should never happen
        // (reached only for a byte that is not a valid UTF-8 leading
        // byte, e.g. a stray continuation byte or 0xF8-0xFF)
        rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c", c);
    }

    if (char_len > 0) {
        // Valid, complete UTF-8 sequence that just isn't a token.
        rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %.*s", char_len, str);
    } else {
        // Multibyte leading byte with the sequence truncated by end.
        rb_raise(rb_eArgError, "invalid byte sequence in UTF-8");
    }

    // Unreachable: both branches above raise.
    return NULL;
}

#undef RETURN_TOKEN