File: generate_grammar.py

package info (click to toggle)
duckdb 1.5.1-2
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 299,196 kB
  • sloc: cpp: 865,414; ansic: 57,292; python: 18,871; sql: 12,663; lisp: 11,751; yacc: 7,412; lex: 1,682; sh: 747; makefile: 558
file content (310 lines) | stat: -rw-r--r-- 10,222 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# use bison to generate the parser files
# the following version of bison is used:
# bison (GNU Bison) 2.3
import os
import subprocess
import re
import sys
from python_helpers import open_utf8

# Defaults; each may be overridden by a command-line flag parsed below.
bison_location = "bison"
base_dir = 'third_party/libpg_query/grammar'
pg_dir = 'third_party/libpg_query'
namespace = 'duckdb_libpgquery'

counterexamples = False
run_update = False
verbose = False
# Minimal hand-rolled flag parsing; note that prefix matching via startswith
# means e.g. "--counterexamplesXYZ" is also accepted.
for arg in sys.argv[1:]:
    if arg.startswith("--bison="):
        # path to the bison executable to invoke
        bison_location = arg.replace("--bison=", "")
    elif arg.startswith("--counterexamples"):
        # pass -Wcounterexamples to bison (requires bison >= 3.8)
        counterexamples = True
    elif arg.startswith("--update"):
        run_update = True
    # allow a prefix to the source and target directories
    elif arg.startswith("--custom_dir_prefix"):
        # expects the form --custom_dir_prefix=<prefix>; the prefix is
        # prepended verbatim to both base_dir and pg_dir
        base_dir = arg.split("=")[1] + base_dir
        pg_dir = arg.split("=")[1] + pg_dir
    elif arg.startswith("--namespace"):
        # expects the form --namespace=<cpp_namespace>
        namespace = arg.split("=")[1]
    elif arg.startswith("--verbose"):
        verbose = True
    else:
        raise Exception(
            "Unrecognized argument: "
            + arg
            + ", expected --counterexamples, --bison=/loc/to/bison, --custom_dir_prefix, --namespace, --verbose"
        )

# Input files: the grammar template plus the C++ header/source fragments that
# get spliced into it.
template_file = os.path.join(base_dir, 'grammar.y')
target_file = os.path.join(base_dir, 'grammar.y.tmp')
header_file = os.path.join(base_dir, 'grammar.hpp')
source_file = os.path.join(base_dir, 'grammar.cpp')
# Directories of grammar fragments: %type declarations and per-statement rules.
type_dir = os.path.join(base_dir, 'types')
rule_dir = os.path.join(base_dir, 'statements')
# Where bison writes its output, and the final checked-in locations the
# generated files are moved to.
result_source = os.path.join(base_dir, 'grammar_out.cpp')
result_header = os.path.join(base_dir, 'grammar_out.hpp')
target_source_loc = os.path.join(pg_dir, 'src_backend_parser_gram.cpp')
target_header_loc = os.path.join(pg_dir, 'include/parser/gram.hpp')
kwlist_header = os.path.join(pg_dir, 'include/parser/kwlist.hpp')


# parse the keyword lists
def read_list_from_file(fname):
    """Read a newline-separated list file and return its non-empty lines, stripped."""
    with open_utf8(fname, 'r') as handle:
        stripped = (entry.strip() for entry in handle.read().split('\n'))
        return [entry for entry in stripped if entry]


# Directory holding the keyword category lists; each .list file contains one
# keyword token per line (e.g. ABORT_P).
kwdir = os.path.join(base_dir, 'keywords')
unreserved_keywords = read_list_from_file(os.path.join(kwdir, 'unreserved_keywords.list'))
colname_keywords = read_list_from_file(os.path.join(kwdir, 'column_name_keywords.list'))
func_name_keywords = read_list_from_file(os.path.join(kwdir, 'func_name_keywords.list'))
type_name_keywords = read_list_from_file(os.path.join(kwdir, 'type_name_keywords.list'))
reserved_keywords = read_list_from_file(os.path.join(kwdir, 'reserved_keywords.list'))


def strip_p(x):
    """Drop the '_P' suffix some bison keyword tokens carry (e.g. 'ABORT_P' -> 'ABORT')."""
    return x[:-2] if x.endswith("_P") else x


# Sort each keyword list by its name without the '_P' marker so the generated
# lists are stable regardless of the input file ordering.
unreserved_keywords.sort(key=strip_p)
colname_keywords.sort(key=strip_p)
func_name_keywords.sort(key=strip_p)
type_name_keywords.sort(key=strip_p)
reserved_keywords.sort(key=strip_p)

statements = read_list_from_file(os.path.join(base_dir, 'statements.list'))
statements.sort()
# BUG FIX: this guard previously read `len(statements) < 0`, which can never
# be true -- an empty statements.list must abort grammar generation.
if len(statements) == 0:
    print("Need at least one statement")
    exit(1)

# verify there are no duplicate keywords and create big sorted list of keywords
# Build one keyword -> token-category map; assignment order matches the
# original (later categories overwrite earlier ones on overlap).
kwdict = {}
for keyword_list, category in (
    (unreserved_keywords, 'UNRESERVED_KEYWORD'),
    (colname_keywords, 'COL_NAME_KEYWORD'),
    (func_name_keywords, 'TYPE_FUNC_NAME_KEYWORD'),
    (type_name_keywords, 'TYPE_FUNC_NAME_KEYWORD'),
    (reserved_keywords, 'RESERVED_KEYWORD'),
):
    for keyword in keyword_list:
        kwdict[keyword] = category

kwlist = list(kwdict.items())
# sorting uppercase is different from lowercase: A-Z < _ < a-z
kwlist.sort(key=lambda entry: strip_p(entry[0].lower()))

# now generate kwlist.h
# each entry looks like: PG_KEYWORD("abort", ABORT_P, UNRESERVED_KEYWORD)
kw_lines = [
    "",
    "namespace " + namespace + " {",
    "#define PG_KEYWORD(a,b,c) {a,b,c},",
    "",
    "const PGScanKeyword ScanKeywords[] = {",
]
for token, category in kwlist:
    kw_lines.append('PG_KEYWORD("%s", %s, %s)' % (strip_p(token).lower(), token, category))
kw_lines += [
    "",
    "};",
    "",
    "const int NumScanKeywords = lengthof(ScanKeywords);",
    "} // namespace " + namespace,
    "",
]
kwtext = "\n".join(kw_lines)

with open_utf8(kwlist_header, 'w+') as f:
    f.write(kwtext)


# generate the final main.y.tmp file
# first read the template file (grammar.y contains {{{ ... }}} placeholders)
with open_utf8(template_file, 'r') as f:
    text = f.read()

# now perform a series of replacements in the file to construct the final yacc file


def get_file_contents(fpath, add_line_numbers=False):
    """Return the text of fpath.

    When add_line_numbers is True, a '#line 1 "fpath"' directive is prepended
    so downstream compiler/bison errors point back at the original file.
    """
    with open_utf8(fpath, 'r') as f:
        contents = f.read()
    if not add_line_numbers:
        return contents
    return '#line 1 "%s"\n' % (fpath,) + contents


# Splice the C++ header/source fragments, the keyword token list and the
# top-level statement rule into the grammar template.
keyword_tokens = "%token <keyword> " + " ".join(entry[0] for entry in kwlist)
statement_rules = "stmt: " + "\n\t| ".join(statements) + "\n\t| /*EMPTY*/\n\t{ $$ = NULL; }\n"
for placeholder, replacement in (
    ("{{{ GRAMMAR_HEADER }}}", get_file_contents(header_file, True)),
    ("{{{ GRAMMAR_SOURCE }}}", get_file_contents(source_file, True)),
    ("{{{ KEYWORDS }}}", keyword_tokens),
    ("{{{ STATEMENTS }}}", statement_rules),
):
    text = text.replace(placeholder, replacement)

# keywords
# keywords can EITHER be reserved, unreserved, or some combination of (col_name, type_name, func_name)
# that means duplicates are ONLY allowed between (col_name, type_name and func_name)
# having a keyword be both reserved and unreserved is an error
# as is having a keyword both reserved and col_name, for example
# verify that this is the case
reserved_dict = {}
unreserved_dict = {}
other_dict = {}
for keyword in reserved_keywords:
    if keyword in reserved_dict:
        print("Duplicate keyword " + keyword + " in reserved keywords")
        exit(1)
    reserved_dict[keyword] = True

for keyword in unreserved_keywords:
    if keyword in unreserved_dict:
        print("Duplicate keyword " + keyword + " in unreserved keywords")
        exit(1)
    if keyword in reserved_dict:
        print("Keyword " + keyword + " is marked as both unreserved and reserved")
        exit(1)
    unreserved_dict[keyword] = True


def add_to_other_keywords(kw, list_name):
    """Register kw in other_dict, aborting the script if it already appears
    in the unreserved or reserved keyword sets (such overlaps are disallowed).

    Note: the original `global` declarations were unnecessary -- the module
    dicts are only looked up and mutated, never rebound.
    """
    for conflict_dict, conflict_name in ((unreserved_dict, "unreserved"), (reserved_dict, "reserved")):
        if kw in conflict_dict:
            print("Keyword " + kw + " is marked as both " + conflict_name + " and " + list_name)
            exit(1)
    other_dict[kw] = True


# Validate the col_name/type_name/func_name categories against the reserved
# and unreserved sets, collecting the combined type_func_name set as we go.
for keyword in colname_keywords:
    add_to_other_keywords(keyword, "colname")

type_func_name_dict = {}
for keyword in type_name_keywords:
    add_to_other_keywords(keyword, "typename")
    type_func_name_dict[keyword] = True
for keyword in func_name_keywords:
    add_to_other_keywords(keyword, "funcname")
    type_func_name_dict[keyword] = True

type_func_name_keywords = sorted(type_func_name_dict)
all_keywords = sorted(list(reserved_dict) + list(unreserved_dict) + list(other_dict))
other_keyword = sorted(other_dict)

# Emit the keyword grammar productions, one per category.
definition_lines = [
    "unreserved_keyword: " + " | ".join(unreserved_keywords),
    "col_name_keyword: " + " | ".join(colname_keywords),
    "func_name_keyword: " + " | ".join(func_name_keywords),
    "type_name_keyword: " + " | ".join(type_name_keywords),
    "other_keyword: " + " | ".join(other_keyword),
    "type_func_name_keyword: " + " | ".join(type_func_name_keywords),
    "reserved_keyword: " + " | ".join(reserved_keywords),
]
kw_definitions = "\n".join(definition_lines) + "\n"
text = text.replace("{{{ KEYWORD_DEFINITIONS }}}", kw_definitions)


# types
def concat_dir(dname, extension, add_line_numbers=False):
    """Concatenate the contents of every file under dname (recursively) whose
    name ends with extension.

    add_line_numbers: when True, each file's contents are prefixed with a
    '#line' directive so errors point back at the original fragment file.

    NOTE(review): os.listdir order is platform-dependent, so the concatenation
    order of fragments is not guaranteed to be stable across machines.
    """
    result = ""
    for fname in os.listdir(dname):
        fpath = os.path.join(dname, fname)
        if os.path.isdir(fpath):
            # BUG FIX: the recursive call previously dropped add_line_numbers,
            # so files inside subdirectories never received #line directives.
            result += concat_dir(fpath, extension, add_line_numbers)
        else:
            if not fname.endswith(extension):
                continue
            result += get_file_contents(fpath, add_line_numbers)
    return result


# Collect the %type declarations from the types/ directory (.yh fragments).
type_definitions = concat_dir(type_dir, ".yh")
# add statement types as well
for stmt in statements:
    type_definitions += "%type <node> " + stmt + "\n"

text = text.replace("{{{ TYPES }}}", type_definitions)

# grammar rules: concatenate all .y fragments under statements/, with #line
# directives so bison errors reference the fragment files
grammar_rules = concat_dir(rule_dir, ".y", True)

text = text.replace("{{{ GRAMMAR RULES }}}", grammar_rules)

# finally write the yacc file into the target file
with open_utf8(target_file, 'w+') as f:
    f.write(text)

# generate the bison
cmd = [bison_location]
if counterexamples:
    print("Attempting to print counterexamples (-Wcounterexamples)")
    cmd += ["-Wcounterexamples"]
if run_update:
    cmd += ["--update"]
if verbose:
    cmd += ["--verbose"]
cmd += ["-o", result_source, "-d", target_file]
print(' '.join(cmd))
proc = subprocess.Popen(cmd, stderr=subprocess.PIPE)
try:
    # BUG FIX: the original used proc.wait(timeout=10), which can deadlock when
    # bison writes more to the stderr PIPE than the OS pipe buffer holds --
    # communicate() drains stderr while waiting. The timeout ensures CI does
    # not hang, as was seen when running with a Bison 3.x release.
    _, stderr_output = proc.communicate(timeout=10)
except subprocess.TimeoutExpired:
    proc.kill()
    raise
res = proc.returncode

if res != 0:
    text = stderr_output.decode('utf8')
    print(text)
    if 'shift/reduce' in text and not counterexamples:
        print("---------------------------------------------------------------------")
        print("In case of shift/reduce conflicts, try re-running with --counterexamples")
        print("Note: this requires a more recent version of Bison (e.g. version 3.8)")
        print("On a Macbook you can obtain this using \"brew install bison\"")
    if counterexamples and 'time limit exceeded' in text:
        print("---------------------------------------------------------------------")
        print(
            "The counterexamples time limit was exceeded. This likely means that no useful counterexample was generated."
        )
        print("")
        print("The counterexamples time limit can be increased by setting the TIME_LIMIT environment variable, e.g.:")
        print("export TIME_LIMIT=100")
    exit(1)


# Move bison's outputs into their final checked-in locations.
os.rename(result_source, target_source_loc)
os.rename(result_header, target_header_loc)

with open_utf8(target_source_loc, 'r') as f:
    text = f.read()

# Point the generated source at the header's final include path, and append
# a (void) cast after the yynerrs assignment -- presumably to silence
# unused-variable compiler warnings (NOTE(review): confirm intent).
text = text.replace('#include "grammar_out.hpp"', '#include "include/parser/gram.hpp"')
text = text.replace('yynerrs = 0;', 'yynerrs = 0; (void)yynerrs;')

with open_utf8(target_source_loc, 'w+') as f:
    f.write(text)