1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
|
#include "liquid.h"
#include "lexer.h"
#include "usage.h"
#include <stdio.h>
// Human-readable name for each token type, indexed by its TOKEN_* constant.
// Slots that are not listed here are left as NULL by the designated
// initializers.
const char *symbol_names[TOKEN_END] = {
    [TOKEN_NONE]         = "none",
    [TOKEN_COMPARISON]   = "comparison",
    [TOKEN_STRING]       = "string",
    [TOKEN_NUMBER]       = "number",
    [TOKEN_IDENTIFIER]   = "id",
    [TOKEN_DOTDOT]       = "dotdot",
    [TOKEN_EOS]          = "end_of_string",
    [TOKEN_PIPE]         = "pipe",
    [TOKEN_DOT]          = "dot",
    [TOKEN_COLON]        = "colon",
    [TOKEN_COMMA]        = "comma",
    [TOKEN_OPEN_SQUARE]  = "open_square",
    [TOKEN_CLOSE_SQUARE] = "close_square",
    [TOKEN_OPEN_ROUND]   = "open_round",
    [TOKEN_CLOSE_ROUND]  = "close_round",
    [TOKEN_QUESTION]     = "question",
    [TOKEN_DASH]         = "dash",
};
// Returns 1 if c is accepted as a continuation character of an identifier
// (alphanumeric, underscore or dash), 0 otherwise.
static inline int is_identifier(char c)
{
    if (ISALNUM(c)) {
        return 1;
    }
    return c == '_' || c == '-';
}
// Returns 1 if c is one of the single-character punctuation tokens
// | . : , [ ] ( ) ? -   and 0 for every other character.
static inline int is_special(char c)
{
    return c == '|' || c == '.' || c == ':' || c == ','
        || c == '[' || c == ']' || c == '(' || c == ')'
        || c == '?' || c == '-';
}
// Checks whether the bytes in [cur, end) begin with the NUL-terminated
// string `pattern`.
// Returns a pointer to the byte just past the matched prefix, or NULL when
// fewer than strlen(pattern) bytes are available or the bytes differ.
static inline const char *prefix_end(const char *cur, const char *end, const char *pattern)
{
    const size_t needed = strlen(pattern);
    const size_t available = (size_t)(end - cur);

    if (needed <= available && memcmp(cur, pattern, needed) == 0) {
        return cur + needed;
    }
    return NULL;
}
// Looks for `target` in the bytes that follow `cur` (the byte at `cur` itself
// is skipped) up to, but not including, `end`.
// Returns a pointer just past the first occurrence, or NULL if there is none.
// Callers must pass cur < end: the search length is end - cur - 1.
static inline const char *scan_past(const char *cur, const char *end, char target)
{
    const char *from = cur + 1;
    const char *hit = memchr(from, target, (size_t)(end - from));

    if (hit == NULL) {
        return NULL;
    }
    return hit + 1;
}
// Fills in `token` and returns from the enclosing function in one step.
// Expects `str` (start of the token) and `token` (output struct) to be in
// scope at the expansion site: the token covers the `n` bytes starting at
// `str` and receives type `t`; the value returned is one past its last byte.
// Wrapped in do { } while (0) so that `RETURN_TOKEN(...);` forms exactly one
// statement and stays safe inside an unbraced if/else.
#define RETURN_TOKEN(t, n) do { \
        const char *tok_end = str + (n); \
        token->type = (t); \
        token->val = str; \
        return (token->val_end = tok_end); \
    } while (0)
// Reads a single token from the byte range [start, end) into `token`.
//
// Leading whitespace is skipped first. token->val, token->val_end and
// token->flags are always reset; token->type is only written when a token is
// recognised. On success the return value is one past the last byte of the
// token (the same pointer stored in token->val_end). If nothing but
// whitespace remains, the position reached after skipping it is returned and
// token->val stays NULL.
//
// Token classes are tried in this order: comparison operators and "..",
// the word "contains", quoted strings, numbers, identifiers, and finally
// single-character punctuation.
//
// Anything else is reported through rb_enc_raise (cLiquidSyntaxError,
// "Unexpected character ...") or rb_raise (rb_eArgError) when a multi-byte
// UTF-8 sequence is cut off by `end`. Those calls are expected not to return;
// the trailing `return NULL` is only a fallback.
const char *lex_one(const char *start, const char *end, lexer_token_t *token)
{
    // `str` marks where the token begins once whitespace has been consumed;
    // `cur` is a scratch cursor for the scanning loops below.
    const char *str = start;
    const char *cur;

    while (str < end && ISSPACE(*str)) {
        str++;
    }

    token->flags = 0;
    token->val = NULL;
    token->val_end = NULL;

    if (str >= end) {
        return str;
    }

    const char c = *str;                              // First character of the token.
    const char cn = (str + 1 < end) ? str[1] : '\0';  // Lookahead, '\0' if none.

    // One- and two-character operators.
    if (c == '<') {
        RETURN_TOKEN(TOKEN_COMPARISON, (cn == '>' || cn == '=') ? 2 : 1);
    }
    if (c == '>') {
        RETURN_TOKEN(TOKEN_COMPARISON, (cn == '=') ? 2 : 1);
    }
    if ((c == '=' || c == '!') && cn == '=') {
        RETURN_TOKEN(TOKEN_COMPARISON, 2);
    }
    if (c == '.' && cn == '.') {
        RETURN_TOKEN(TOKEN_DOTDOT, 2);
    }

    // The word operator is matched purely as a prefix.
    cur = prefix_end(str, end, "contains");
    if (cur != NULL) {
        RETURN_TOKEN(TOKEN_COMPARISON, cur - str);
    }

    // Quoted string: only a token when the closing quote exists; otherwise
    // the quote character falls through to the checks below.
    if (c == '"' || c == '\'') {
        cur = scan_past(str, end, c);
        if (cur != NULL) {
            RETURN_TOKEN(TOKEN_STRING, cur - str);
        }
    }

    // Instrument for bug: https://github.com/Shopify/liquid-c/pull/120
    if (c == '-' && cn == '.') {
        usage_increment("liquid_c_negative_float_without_integer");
    }

    // Number: optional leading '-', digits, and at most one '.'.
    if (c == '-' || ISDIGIT(c)) {
        int seen_dot = 0;

        for (cur = str + 1; cur < end; cur++) {
            if (*cur == '.' && !seen_dot) {
                seen_dot = 1;
            } else if (!ISDIGIT(*cur)) {
                break;
            }
        }

        // Last byte consumed by the scan: a digit, a dot, or the lone '-'.
        const char *last = cur - 1;
        if (*last == '.') {
            // A trailing dot is not part of the number.
            last--;
            seen_dot = 0;
        }

        // `last` can only be '-' when no digit followed the sign; in that
        // case this is not a number and '-' is handled as punctuation below.
        if (*last != '-') {
            if (seen_dot) {
                token->flags |= TOKEN_FLOAT_NUMBER;
            }
            RETURN_TOKEN(TOKEN_NUMBER, last + 1 - str);
        }
    }

    // Identifier: letter or underscore, then identifier characters, with an
    // optional single trailing '?'.
    if (c == '_' || ISALPHA(c)) {
        for (cur = str + 1; cur < end && is_identifier(*cur); cur++) {
        }
        if (cur < end && *cur == '?') {
            cur++;
        }
        RETURN_TOKEN(TOKEN_IDENTIFIER, cur - str);
    }

    // Punctuation: the character value itself is passed as the token type.
    if (is_special(c)) {
        RETURN_TOKEN(c, 1);
    }

    // Nothing matched. Work out how many bytes the offending UTF-8 character
    // occupies so the whole character can be shown in the error message.
    const long remaining = end - str;
    int expected_len = 0;

    if ((c & 0x80) == 0) {
        expected_len = 1;  // 1-byte character
    } else if ((c & 0xE0) == 0xC0) {
        expected_len = 2;  // 2-byte character
    } else if ((c & 0xF0) == 0xE0) {
        expected_len = 3;  // 3-byte character
    } else if ((c & 0xF8) == 0xF0) {
        expected_len = 4;  // 4-byte character
    } else {
        // Not a valid UTF-8 lead byte.
        rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %c", c);
    }

    if (expected_len > 0 && remaining >= expected_len) {
        rb_enc_raise(utf8_encoding, cLiquidSyntaxError, "Unexpected character %.*s", expected_len, str);
    } else {
        // The sequence is truncated by the end of the input.
        rb_raise(rb_eArgError, "invalid byte sequence in UTF-8");
    }

    return NULL;
}
#undef RETURN_TOKEN
|