1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289
|
/*--------------------------------------------------------------------
* Symbols referenced in this file:
* - raw_parser
* - base_yylex
* - raw_parser
*--------------------------------------------------------------------
*/
/*-------------------------------------------------------------------------
*
* parser.c
* Main entry point/driver for PostgreSQL grammar
*
* Note that the grammar is not allowed to perform any table access
* (since we need to be able to do basic parsing even while inside an
* aborted transaction). Therefore, the data structures returned by
* the grammar are "raw" parsetrees that still need to be analyzed by
* analyze.c and related files.
*
*
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development PGGroup
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/parser/parser.c
*
*-------------------------------------------------------------------------
*/
#include "pg_functions.hpp"
#include "parser/gramparse.hpp"
#include "parser/parser.hpp"
#include "parser/kwlist.hpp"
namespace duckdb_libpgquery {
/*
* raw_parser
* Given a query in string form, do lexical and grammatical analysis.
*
* Returns a list of raw (un-analyzed) parse trees. The immediate elements
* of the list are always PGRawStmt nodes.
*/
PGList *raw_parser(const char *str) {
core_yyscan_t yyscanner;
base_yy_extra_type yyextra;
int yyresult;
/* initialize the flex scanner */
yyscanner = scanner_init(str, &yyextra.core_yy_extra, ScanKeywords, NumScanKeywords);
/* base_yylex() only needs this much initialization */
yyextra.have_lookahead = false;
/* initialize the bison parser */
parser_init(&yyextra);
/* Parse! */
yyresult = base_yyparse(yyscanner);
/* Clean up (release memory) */
scanner_finish(yyscanner);
if (yyresult) /* error */
return NIL;
return yyextra.parsetree;
}
PGKeywordCategory is_keyword(const char *text) {
auto keyword = ScanKeywordLookup(text, ScanKeywords, NumScanKeywords);
if (keyword) {
return static_cast<PGKeywordCategory>(keyword->category);
}
return PGKeywordCategory::PG_KEYWORD_NONE;
}
std::vector<PGKeyword> keyword_list() {
std::vector<PGKeyword> result;
for(size_t i = 0; i < NumScanKeywords; i++) {
PGKeyword keyword;
keyword.text = ScanKeywords[i].name;
switch(ScanKeywords[i].category) {
case UNRESERVED_KEYWORD:
keyword.category = PGKeywordCategory::PG_KEYWORD_UNRESERVED;
break;
case RESERVED_KEYWORD:
keyword.category = PGKeywordCategory::PG_KEYWORD_RESERVED;
break;
case TYPE_FUNC_NAME_KEYWORD:
keyword.category = PGKeywordCategory::PG_KEYWORD_TYPE_FUNC;
break;
case COL_NAME_KEYWORD:
keyword.category = PGKeywordCategory::PG_KEYWORD_COL_NAME;
break;
}
result.push_back(keyword);
}
return result;
}
std::vector<PGSimplifiedToken> tokenize(const char *str) {
core_yyscan_t yyscanner;
base_yy_extra_type yyextra;
std::vector<PGSimplifiedToken> result;
yyscanner = scanner_init(str, &yyextra.core_yy_extra, ScanKeywords, NumScanKeywords);
yyextra.have_lookahead = false;
while(true) {
YYSTYPE type;
YYLTYPE loc;
int token;
try {
token = base_yylex(&type, &loc, yyscanner);
} catch(...) {
token = 0;
}
if (token == 0) {
break;
}
PGSimplifiedToken current_token;
switch(token) {
case IDENT:
current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_IDENTIFIER;
break;
case ICONST:
case FCONST:
current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_NUMERIC_CONSTANT;
break;
case SCONST:
case BCONST:
case XCONST:
current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_STRING_CONSTANT;
break;
case Op:
case PARAM:
case COLON_EQUALS:
case EQUALS_GREATER:
case LESS_EQUALS:
case GREATER_EQUALS:
case NOT_EQUALS:
current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR;
break;
default:
if (token >= 255) {
// non-ascii value, probably a keyword
current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_KEYWORD;
} else {
// ascii value, probably an operator
current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR;
}
break;
}
current_token.start = loc;
result.push_back(current_token);
}
scanner_finish(yyscanner);
return result;
}
/*
* Intermediate filter between parser and core lexer (core_yylex in scan.l).
*
* This filter is needed because in some cases the standard SQL grammar
* requires more than one token lookahead. We reduce these cases to one-token
* lookahead by replacing tokens here, in order to keep the grammar LALR(1).
*
* Using a filter is simpler than trying to recognize multiword tokens
* directly in scan.l, because we'd have to allow for comments between the
* words. Furthermore it's not clear how to do that without re-introducing
* scanner backtrack, which would cost more performance than this filter
* layer does.
*
* The filter also provides a convenient place to translate between
* the core_YYSTYPE and YYSTYPE representations (which are really the
* same thing anyway, but notationally they're different).
*/
int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) {
base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
int cur_token;
int next_token;
int cur_token_length;
YYLTYPE cur_yylloc;
/* Get next token --- we might already have it */
if (yyextra->have_lookahead) {
cur_token = yyextra->lookahead_token;
lvalp->core_yystype = yyextra->lookahead_yylval;
*llocp = yyextra->lookahead_yylloc;
*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
yyextra->have_lookahead = false;
} else
cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
/*
* If this token isn't one that requires lookahead, just return it. If it
* does, determine the token length. (We could get that via strlen(), but
* since we have such a small set of possibilities, hardwiring seems
* feasible and more efficient.)
*/
switch (cur_token) {
case NOT:
cur_token_length = 3;
break;
case NULLS_P:
cur_token_length = 5;
break;
case WITH:
cur_token_length = 4;
break;
default:
return cur_token;
}
/*
* Identify end+1 of current token. core_yylex() has temporarily stored a
* '\0' here, and will undo that when we call it again. We need to redo
* it to fully revert the lookahead call for error reporting purposes.
*/
yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf + *llocp + cur_token_length;
Assert(*(yyextra->lookahead_end) == '\0');
/*
* Save and restore *llocp around the call. It might look like we could
* avoid this by just passing &lookahead_yylloc to core_yylex(), but that
* does not work because flex actually holds onto the last-passed pointer
* internally, and will use that for error reporting. We need any error
* reports to point to the current token, not the next one.
*/
cur_yylloc = *llocp;
/* Get next token, saving outputs into lookahead variables */
next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
yyextra->lookahead_token = next_token;
yyextra->lookahead_yylloc = *llocp;
*llocp = cur_yylloc;
/* Now revert the un-truncation of the current token */
yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
*(yyextra->lookahead_end) = '\0';
yyextra->have_lookahead = true;
/* Replace cur_token if needed, based on lookahead */
switch (cur_token) {
case NOT:
/* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
switch (next_token) {
case BETWEEN:
case IN_P:
case LIKE:
case ILIKE:
case SIMILAR:
cur_token = NOT_LA;
break;
}
break;
case NULLS_P:
/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
switch (next_token) {
case FIRST_P:
case LAST_P:
cur_token = NULLS_LA;
break;
}
break;
case WITH:
/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
switch (next_token) {
case TIME:
case ORDINALITY:
cur_token = WITH_LA;
break;
}
break;
}
return cur_token;
}
}
|