File: grammar.bs

package info (click to toggle)
storm-lang 0.7.5-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 52,028 kB
  • sloc: ansic: 261,471; cpp: 140,432; sh: 14,891; perl: 9,846; python: 2,525; lisp: 2,504; asm: 860; makefile: 678; pascal: 70; java: 52; xml: 37; awk: 12
file content (446 lines) | stat: -rw-r--r-- 10,635 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
use core:lang;
use lang:bnf;
use lang:bs:macro;

/**
 * Basic information on grammar.
 *
 * Note: This class does not attempt to expand the repetition possibly present in the grammar. This
 * means that parsers that need to expand these rules might introduce left recursion etc. when
 * expanding the grammar.
 */
class Grammar on Compiler {
	// The 'regexEpsilon' parameter indicates if regexes that may match the empty string are treated
	// as epsilon productions. This has implications for first and follow sets etc.
	init(Rule start, NameSet[] include, Bool regexEpsilon) {
		init() {
			start = start;
			regexEpsilon = regexEpsilon;
		}

		for (x in include)
			add(x);

		sort();

		// Prune unreachable rules.
		prune();

		// Compute first-sets first.
		computeFirst();

		// Compute follow-sets.
		computeFollow();
	}

	// Start rule.
	Rule start;

	// Map of rules -> productions that are visible.
	Rule->Array<Production> rules;

	// True if the rule might match epsilon.
	Rule->Bool epsilon;

	// First-sets for all rules.
	Rule->Set<Regex> first;

	// Follow-sets for all rules.
	Rule->Set<Regex> follow;

	// Are we treating regexes that may match the empty string as epsilon productions?
	Bool regexEpsilon;

	// Output a representation of what we have arrived at.
	void toS(StrBuf to) : override {
		Rule[] order;
		for (k, v in rules)
			order << k;
		// Note: This is not very efficient, as the identifiers are computed on each comparison. It
		// is fine, as it is for debugging, though.
		order.sort((a, b) => a.identifier < b.identifier);

		to << "Start: " << start.identifier << "\n";
		to << "Treating empty regexes as epsilon: " << regexEpsilon << "\n";

		for (i, rule in order) {
			if (i > 0)
				to << "\n";

			to << rule.identifier << ":\n";

			Indent z(to);

			to << "matches epsilon: " << epsilon[rule] << "\n";
			to << "first-set: " << first[rule] << "\n";
			to << "follow-set: " << follow[rule] << "\n";

			to << "---\n";

			for (i in rules[rule])
				to << i << "\n";
		}
	}

	// Compute the first-set for a production.
	Set<Regex> first(Production p) {
		Set<Regex> result;
		updateFirst(result, p.firstA, false);
		updateFirst(result, p.firstB, true);
		result;
	}

	// Helper for 'first'.
	private void updateFirst(Set<Regex> out, ProductionIter at, Bool repeated) {
		if (!at.valid)
			return;
		if (at.end)
			return;

		if (t = at.token as RuleToken) {
			for (x in first[t.rule])
				out.put(x);

			if (!epsilon[t.rule])
				return;
		} else if (t = at.token as RegexToken) {
			out.put(t.regex);

			if (!regexEpsilon)
				return;
			if (!t.regex.matchesEmpty())
				return;
		}

		// Epsilon found, keep going.
		updateFirst(out, at.nextA, repeated);
		if (!repeated)
			updateFirst(out, at.nextB, true);
	}

	// Check if the grammar has left-recursive productions. Returns a list of strings that describe
	// paths with left recursion. The list is empty if none exist.
	// The implementation tries to avoid duplicates in the reporting. Otherwise, two rules that were
	// mutually left-recursive would always appear twice.
	Str[] leftRecursive() {
		Str[] result;
		Rule->Bool visited;
		for (k, v in rules) {
			for (r in v) {
				if (x = leftRecursive(visited, k, r)) {
					result << (k.identifier + " -> " + x);
				}
			}
		}
		result;
	}

	// Check if a particular rule is left recursive. Returns an example of where the grammar is left
	// recursive. Does not attempt to find *all* instances of left recursion.
	Str? leftRecursive(Rule rule) {
		Rule->Bool visited;
		for (x in rules[rule]) {
			if (r = leftRecursive(visited, rule, x)) {
				return rule.identifier + " -> " + r;
			}
		}

		return null;
	}

	// Helper for the function above.
	private Str? leftRecursive(Rule->Bool visited, Rule original, Production prod) {
		if (x = leftRecursive(visited, original, prod.firstA, false))
			x;
		else
			leftRecursive(visited, original, prod.firstB, true);
	}

	private Str? leftRecursive(Rule->Bool visited, Rule original, ProductionIter iter, Bool repeated) {
		if (!iter.valid)
			return null;

		if (iter.end)
			return null;

		if (x = iter.token as RegexToken) {
			if (!x.regex.matchesEmpty)
				return null;
			if (!regexEpsilon)
				return null;
		} else if (x = iter.token as RuleToken) {
			if (r = leftRecursive(visited, original, x.rule)) {
				return r;
			}
			if (!epsilon[x.rule])
				return null;
		} else {
			return null;
		}

		// Continue.
		if (r = leftRecursive(visited, original, iter.nextA, repeated))
			return r;

		if (!repeated)
			return leftRecursive(visited, original, iter.nextB, true);
		else
			return null;
	}

	private Str? leftRecursive(Rule->Bool visited, Rule original, Rule current) {
		if (visited[current])
			return null;

		visited[current] = true;

		if (original is current)
			return original.identifier;

		for (x in rules[current]) {
			if (r = leftRecursive(visited, original, x)) {
				return current.identifier + " -> " + r;
			}
		}

		return null;
	}

	private void add(NameSet from) {
		for (x in from) {
			if (x as Rule) {
				add(x);
			} else if (x as ProductionType) {
				add(x.production);
			}
		}
	}

	private void add(Rule rule) {
		if (!rules.has(rule))
			rules.put(rule, []);
		epsilon[rule] = false;
	}

	private void add(Production prod) {
		if (rule = prod.rule) {
			Production[] p = rules[rule];
			p << prod;
		}
	}

	// Sort rules according to priority.
	private void sort() {
		for (k, v in rules) {
			v.sort((Production a, Production b) => a.priority > b.priority);
		}
	}

	// Check if a rule is a special rule (in the parser:special package and has zero productions).
	private Bool specialRule(Rule r) : static {
		r.parent is named{parser:special};
	}

	// Return the regex that is used as a first-set for special rules. We need something sensible so
	// that parsers behave correctly. A match anything should be fine.
	private Regex specialRegex() : static {
		Regex(".");
	}

	// Compute first-sets. This is done by a fix-point iteration. Also updates 'epsilon'
	private void computeFirst() {
		do {
			Bool changes = false;
			for (k, v in rules)
				changes |= updateFirst(k, v);
		} while (changes);
	}

	private Bool updateFirst(Rule rule, Production[] prods) {
		Set<Regex> s = first[rule];
		Bool changes = false;
		Nat oldCount = s.count;

		if (specialRule(rule)) {
			s.put(specialRegex());
		} else {
			for (x in prods) {
				changes |= putToken(rule, s, x.firstA, false);
				changes |= putToken(rule, s, x.firstB, true);
			}
		}

		changes | oldCount != s.count;
	}

	private Bool putToken(Rule rule, Set<Regex> into, ProductionIter iter, Bool repeated) {
		if (!iter.valid)
			return false;

		if (iter.end) {
			if (!epsilon[rule]) {
				epsilon[rule] = true;
				return true;
			} else {
				return false;
			}
		}

		Bool maybeEpsilon = false;

		if (regex = iter.token as RegexToken) {
			into.put(regex.regex);

			// If we need to check epsilon regexes:
			if (regexEpsilon)
				maybeEpsilon = regex.regex.matchesEmpty();

		} else if (rule = iter.token as RuleToken) {
			// Merge first sets, but don't try to examine the set we're inserting into.
			var firstSet = first[rule.rule];
			if (firstSet !is into) {
				for (x in first[rule.rule]) {
					into.put(x);
				}
			}

			// If the rule can match an epsilon production, we need to look further.
			maybeEpsilon = epsilon[rule.rule];
		}

		if (maybeEpsilon) {
			// Look further.
			Bool changes = putToken(rule, into, iter.nextA, repeated);

			// Don't loop forever.
			if (!repeated)
				changes |= putToken(rule, into, iter.nextB, true);

			return changes;
		} else {
			return false;
		}
	}

	// Compute follow-sets. This is done by a fix-point iteration.
	private void computeFollow() {
		do {
			Bool changes = false;
			for (k, v in rules) {
				// Touch the follow set so that we always have all items.
				follow[k];

				for (p in v) {
					changes |= updateFollow(k, p);
				}
			}
		} while (changes);
	}

	private Bool updateFollow(Rule rule, Production p) {
		Bool updated = false;

		// Check all locations in this production.
		for (Nat i = 0; i < p.tokens.count; i++) {
			unless (r = p.tokens[i] as RuleToken)
				continue;

			var iter = p.posIter(i);
			updated |= updateFollow(rule, r.rule, iter.nextA, false);
			updated |= updateFollow(rule, r.rule, iter.nextB, true);
		}

		updated;
	}

	private Bool updateFollow(Rule inRule, Rule previous, ProductionIter current, Bool repeated) {
		unless (current.valid)
			return false;

		Set<Regex> addTo = follow[previous];
		Nat oldCount = addTo.count;

		if (current.end) {
			// If at the end, we need to add the follow set of ourselves to the follow set of the symbol.
			for (x in follow[inRule])
				addTo.put(x);
		} else if (rt = current.token as RuleToken) {
			// The follow set is the first set of the new rule.
			for (x in first[rt.rule])
				addTo.put(x);

			// Keep going until we find a rule that does not match epsilon, or a regex.
			if (epsilon[rt.rule]) {
				// Note: since these update 'addTo', we don't need to check the return value from these.
				updateFollow(inRule, previous, current.nextA, repeated);
				if (!repeated)
					updateFollow(inRule, previous, current.nextB, true);
			}
		} else if (rt = current.token as RegexToken) {
			addTo.put(rt.regex);

			// Check for empty regexes.
			if (regexEpsilon & rt.regex.matchesEmpty()) {
				// Note: since these update 'addTo', we don't need to check the return value from these.
				updateFollow(inRule, previous, current.nextA, repeated);

				// Don't loop forever.
				if (!repeated)
					updateFollow(inRule, previous, current.nextB, true);
			}
		}

		addTo.count != oldCount;
	}

	// Prune rules that are not reachable from the start production.
	private void prune() {
		Rule->Bool reachable;
		// Populate 'reachable' so that we can iterate through it easily later on.
		for (k, v in rules)
			reachable.put(k, false);

		prune(reachable, start);

		// Remove the ones that were not reachable.
		for (k, v in reachable) {
			if (!v)
				rules.remove(k);
		}
	}

	private void prune(Rule->Bool reachable, Rule current) {
		// Do not re-visit rules.
		if (reachable[current])
			return;

		reachable[current] = true;

		// Traverse all productions.
		for (p in rules[current]) {
			for (token in p.tokens) {
				if (token as RuleToken) {
					prune(reachable, token.rule);
				}
			}
		}
	}
}


/**
 * Generic grammar exception.
 */
class GrammarError extends CodeError {
	init(SrcPos where, Str message) {
		init(where) {
			message = message;
		}
	}

	Str message;

	void messageText(StrBuf to) : override {
		to << "Grammar error: " << message;
	}
}