File: grammar.bs

package info (click to toggle)
storm-lang 0.7.5-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 52,028 kB
sloc: ansic: 261,471; cpp: 140,432; sh: 14,891; perl: 9,846; python: 2,525; lisp: 2,504; asm: 860; makefile: 678; pascal: 70; java: 52; xml: 37; awk: 12
file content (446 lines) | stat: -rw-r--r-- 10,635 bytes
parent folder | download | duplicates (3)
use core:lang;
use lang:bnf;
use lang:bs:macro;

/**
 * Basic information on grammar.
 *
 * Note: This class does not attempt to expand the repetition possibly present in the grammar. This
 * means that parsers that need to expand these rules might introduce left recursion etc. when
 * expanding the grammar.
 */
class Grammar on Compiler {
	// Create a description of the grammar that starts in 'start', using the rules and productions
	// found in the name sets in 'include'.
	//
	// 'regexEpsilon' selects whether regexes that may match the empty string are treated as epsilon
	// productions. This has implications for first and follow sets etc.
	init(Rule start, NameSet[] include, Bool regexEpsilon) {
		init() {
			start = start;
			regexEpsilon = regexEpsilon;
		}

		// Collect rules and productions from everything that was included.
		for (names in include) {
			add(names);
		}

		// Order the productions of each rule according to their priority.
		sort();

		// Drop rules that can not be reached from the start rule.
		prune();

		// The follow-sets are built from the first-sets, so the first-sets are computed first.
		computeFirst();
		computeFollow();
	}

	// Start rule. Supplied to the constructor, and used as the root when pruning unreachable rules.
	Rule start;

	// Map of rules -> productions that are visible. The constructor sorts each array using the
	// predicate 'a.priority > b.priority', and removes rules that are not reachable from 'start'.
	Rule->Array<Production> rules;

	// True if the rule might match epsilon. Set to false when a rule is added, and updated while
	// the first-sets are computed.
	Rule->Bool epsilon;

	// First-sets for all rules.
	Rule->Set<Regex> first;

	// Follow-sets for all rules.
	Rule->Set<Regex> follow;

	// Are we treating regexes that may match the empty string as epsilon productions?
	Bool regexEpsilon;

	// Output a representation of what we have arrived at: the start rule, the 'regexEpsilon'
	// setting, and then for each rule whether it matches epsilon, its first-set, its follow-set and
	// its productions.
	void toS(StrBuf to) : override {
		// Collect all rules so that they can be printed in order of their identifiers.
		Rule[] sorted;
		for (rule, prods in rules) {
			sorted << rule;
		}

		// Note: This is not very efficient, as the identifiers are computed on each comparison. It
		// is fine, as it is for debugging, though.
		sorted.sort((lhs, rhs) => lhs.identifier < rhs.identifier);

		to << "Start: " << start.identifier << "\n";
		to << "Treating empty regexes as epsilon: " << regexEpsilon << "\n";

		for (index, rule in sorted) {
			// Separate rules with a blank line.
			if (index > 0) {
				to << "\n";
			}

			to << rule.identifier << ":\n";

			// Indent everything that belongs to this rule.
			Indent indented(to);

			to << "matches epsilon: " << epsilon[rule] << "\n";
			to << "first-set: " << first[rule] << "\n";
			to << "follow-set: " << follow[rule] << "\n";

			to << "---\n";

			for (prod in rules[rule]) {
				to << prod << "\n";
			}
		}
	}

	// Compute the first-set for a single production, based on the first-sets and the epsilon
	// information stored for the rules in this grammar.
	Set<Regex> first(Production p) {
		Set<Regex> collected;

		// Examine both start positions of the production. The second one is visited with
		// 'repeated' set to true.
		updateFirst(collected, p.firstA, false);
		updateFirst(collected, p.firstB, true);

		return collected;
	}

	// Helper for 'first'. Adds the regexes that may appear first when matching from 'at' to 'out'.
	// When 'repeated' is true, the 'nextB' successor is not followed.
	private void updateFirst(Set<Regex> out, ProductionIter at, Bool repeated) {
		// Nothing to add for invalid iterators, or at the end of the production.
		if (!at.valid)
			return;
		if (at.end)
			return;

		if (ruleTok = at.token as RuleToken) {
			// Anything that may start the rule may also appear here.
			for (regex in first[ruleTok.rule]) {
				out.put(regex);
			}

			// Only look past the rule if it may match epsilon.
			if (!epsilon[ruleTok.rule])
				return;
		} else if (regexTok = at.token as RegexToken) {
			out.put(regexTok.regex);

			// Only look past the regex if empty regexes are treated as epsilon, and this regex
			// may actually match the empty string.
			if (!regexEpsilon)
				return;
			if (!regexTok.regex.matchesEmpty())
				return;
		}

		// Epsilon found, so whatever comes next may also appear first.
		updateFirst(out, at.nextA, repeated);
		if (!repeated)
			updateFirst(out, at.nextB, true);
	}

	// Check if the grammar has left-recursive productions. Returns a list of strings that describe
	// paths with left recursion. The list is empty if none exist.
	//
	// Each rule is examined separately through 'leftRecursive(Rule)', which starts with a fresh
	// set of visited rules, and each left-recursive rule contributes one example path.
	//
	// Sharing the set of visited rules between starting rules would remove some duplicates, but it
	// also hides left recursion: a rule that was explored while another rule was examined is
	// skipped when it is examined itself. For example, with "B -> A x" and "A -> A y", examining B
	// first marks A as visited, and the left recursion in A is then never reported. The result
	// would thus depend on the iteration order of 'rules'. The cost of being complete is that
	// mutually left-recursive rules are reported once for each rule in the cycle.
	Str[] leftRecursive() {
		Str[] result;
		for (rule, prods in rules) {
			// The returned path already starts with the identifier of 'rule'.
			if (path = leftRecursive(rule)) {
				result << path;
			}
		}
		result;
	}

	// Check if a particular rule is left recursive. Returns a description of one path that leads
	// back to the rule, or null if no such path was found. Does not attempt to find *all*
	// instances of left recursion.
	Str? leftRecursive(Rule rule) {
		// Rules that have been examined during this query.
		Rule->Bool seen;

		for (prod in rules[rule]) {
			if (path = leftRecursive(seen, rule, prod)) {
				return rule.identifier + " -> " + path;
			}
		}

		return null;
	}

	// Helper for the functions above. Examines a single production by trying both of its start
	// positions, and returns the first path that was found (if any).
	private Str? leftRecursive(Rule->Bool visited, Rule original, Production prod) {
		if (path = leftRecursive(visited, original, prod.firstA, false))
			return path;

		return leftRecursive(visited, original, prod.firstB, true);
	}

	// Examine the token at 'iter', and continue to the following tokens for as long as the current
	// token may match epsilon. Returns a path back to 'original' if one is found, otherwise null.
	// When 'repeated' is true, the 'nextB' successor is not followed.
	private Str? leftRecursive(Rule->Bool visited, Rule original, ProductionIter iter, Bool repeated) {
		if (!iter.valid)
			return null;
		if (iter.end)
			return null;

		if (regexTok = iter.token as RegexToken) {
			// A regex stops the search, unless it may be empty and empty regexes count as epsilon.
			if (!regexTok.regex.matchesEmpty())
				return null;
			if (!regexEpsilon)
				return null;
		} else if (ruleTok = iter.token as RuleToken) {
			// See if the referred rule leads back to 'original'.
			if (path = leftRecursive(visited, original, ruleTok.rule))
				return path;

			// Otherwise, only look past the rule if it may match epsilon.
			if (!epsilon[ruleTok.rule])
				return null;
		} else {
			return null;
		}

		// The current token may be skipped. Continue with what follows.
		if (path = leftRecursive(visited, original, iter.nextA, repeated))
			return path;

		if (repeated)
			return null;

		return leftRecursive(visited, original, iter.nextB, true);
	}

	// Examine the rule 'current'. Returns the identifier of 'original' if 'current' is 'original',
	// and otherwise a path through one of the productions of 'current' if such a path exists. Rules
	// that are already marked in 'visited' are not examined again.
	private Str? leftRecursive(Rule->Bool visited, Rule original, Rule current) {
		// Examine each rule at most once.
		if (visited[current])
			return null;
		visited[current] = true;

		// Did we arrive at the rule we started from?
		if (original is current)
			return original.identifier;

		for (prod in rules[current]) {
			if (path = leftRecursive(visited, original, prod)) {
				return current.identifier + " -> " + path;
			}
		}

		return null;
	}

	// Add all rules and productions that are found in 'from'. Other entities are ignored.
	private void add(NameSet from) {
		for (entity in from) {
			if (rule = entity as Rule) {
				add(rule);
			} else if (prodType = entity as ProductionType) {
				add(prodType.production);
			}
		}
	}

	// Add a rule to the grammar, and mark it as not matching epsilon.
	private void add(Rule rule) {
		// Keep any productions that are already stored for the rule.
		if (!rules.has(rule)) {
			rules.put(rule, []);
		}

		epsilon[rule] = false;
	}

	// Add a production to the rule it belongs to. Productions without a rule are ignored.
	private void add(Production prod) {
		if (owner = prod.rule) {
			var prods = rules[owner];
			prods << prod;
		}
	}

	// Sort the productions of each rule according to priority, using 'a.priority > b.priority' as
	// the predicate.
	private void sort() {
		for (rule, prods in rules) {
			prods.sort((Production lhs, Production rhs) => lhs.priority > rhs.priority);
		}
	}

	// Check if a rule is a special rule, i.e. if its parent is the parser:special package.
	// Note: This function only looks at the parent. The number of productions is not examined.
	private Bool specialRule(Rule r) : static {
		return r.parent is named{parser:special};
	}

	// Create the regex that is used as the first-set for special rules. Parsers need something
	// sensible there in order to behave correctly, and a regex that matches anything should be fine.
	private Regex specialRegex() : static {
		return Regex(".");
	}

	// Compute first-sets for all rules, and update 'epsilon' in the process. This is a fix-point
	// iteration: all rules are processed over and over until a full pass reports no changes.
	private void computeFirst() {
		Bool changed = true;
		while (changed) {
			changed = false;
			for (rule, prods in rules) {
				changed |= updateFirst(rule, prods);
			}
		}
	}

	// Perform one step of the first-set computation for 'rule'. Returns true if either the
	// first-set of the rule grew, or if 'epsilon' was updated.
	private Bool updateFirst(Rule rule, Production[] prods) {
		Set<Regex> target = first[rule];
		Nat before = target.count;
		Bool epsilonChanged = false;

		if (specialRule(rule)) {
			// Special rules get a fixed first-set.
			target.put(specialRegex());
		} else {
			// Examine both start positions of each production.
			for (prod in prods) {
				epsilonChanged |= putToken(rule, target, prod.firstA, false);
				epsilonChanged |= putToken(rule, target, prod.firstB, true);
			}
		}

		return epsilonChanged | (before != target.count);
	}

	// Add what may appear first when matching from 'iter' to 'into', which is the first-set of
	// 'rule'. If the end of the production is reached, 'rule' is marked as matching epsilon.
	//
	// Returns true if 'epsilon' was updated. Additions to 'into' are not reported here, the caller
	// detects them by comparing the size of the set. When 'repeated' is true, the 'nextB' successor
	// is not followed.
	private Bool putToken(Rule rule, Set<Regex> into, ProductionIter iter, Bool repeated) {
		if (!iter.valid)
			return false;

		if (iter.end) {
			// Everything before this point could be skipped, so the rule may match epsilon. Only
			// report a change the first time this is discovered.
			if (epsilon[rule])
				return false;

			epsilon[rule] = true;
			return true;
		}

		// Can the current token be skipped?
		Bool skippable = false;

		if (regexTok = iter.token as RegexToken) {
			into.put(regexTok.regex);

			// Empty matches only matter if regexes are treated as epsilon.
			if (regexEpsilon)
				skippable = regexTok.regex.matchesEmpty();

		} else if (ruleTok = iter.token as RuleToken) {
			// Merge first sets, but don't try to examine the set we're inserting into.
			var other = first[ruleTok.rule];
			if (other !is into) {
				for (regex in other) {
					into.put(regex);
				}
			}

			// If the rule can match an epsilon production, we need to look further.
			skippable = epsilon[ruleTok.rule];
		}

		if (!skippable)
			return false;

		// Look further.
		Bool changes = putToken(rule, into, iter.nextA, repeated);

		// Don't loop forever.
		if (!repeated)
			changes |= putToken(rule, into, iter.nextB, true);

		return changes;
	}

	// Compute follow-sets for all rules. This is a fix-point iteration: all productions are
	// processed over and over until a full pass reports no changes.
	private void computeFollow() {
		Bool changed = true;
		while (changed) {
			changed = false;
			for (rule, prods in rules) {
				// Touch the follow set so that we always have all items.
				follow[rule];

				for (prod in prods) {
					changed |= updateFollow(rule, prod);
				}
			}
		}
	}

	// Update the follow-sets of all rules that are referred to from the production 'p', which
	// belongs to 'rule'. Returns true if any follow-set was reported as changed.
	private Bool updateFollow(Rule rule, Production p) {
		Bool changed = false;

		// Check all locations in this production. Only rule tokens have follow-sets.
		for (Nat pos = 0; pos < p.tokens.count; pos++) {
			if (ruleTok = p.tokens[pos] as RuleToken) {
				// Look at both successors of the token.
				var here = p.posIter(pos);
				changed |= updateFollow(rule, ruleTok.rule, here.nextA, false);
				changed |= updateFollow(rule, ruleTok.rule, here.nextB, true);
			}
		}

		return changed;
	}

	// Add what may appear at 'current' to the follow-set of 'previous'. 'current' refers to a
	// position inside a production of 'inRule'. Returns true if the follow-set of 'previous' grew.
	// When 'repeated' is true, the 'nextB' successor is not followed.
	private Bool updateFollow(Rule inRule, Rule previous, ProductionIter current, Bool repeated) {
		if (!current.valid)
			return false;

		Set<Regex> target = follow[previous];
		Nat before = target.count;

		if (current.end) {
			// At the end of the production: whatever follows 'inRule' also follows 'previous'.
			for (regex in follow[inRule])
				target.put(regex);
		} else if (ruleTok = current.token as RuleToken) {
			// The follow set is the first set of the new rule.
			for (regex in first[ruleTok.rule])
				target.put(regex);

			// Keep going until we find a rule that does not match epsilon, or a regex.
			if (epsilon[ruleTok.rule]) {
				// Note: The return values can be ignored, as the calls update 'target' directly.
				updateFollow(inRule, previous, current.nextA, repeated);
				if (!repeated)
					updateFollow(inRule, previous, current.nextB, true);
			}
		} else if (regexTok = current.token as RegexToken) {
			target.put(regexTok.regex);

			// Check for empty regexes.
			if (regexEpsilon & regexTok.regex.matchesEmpty()) {
				// Note: The return values can be ignored, as the calls update 'target' directly.
				updateFollow(inRule, previous, current.nextA, repeated);

				// Don't loop forever.
				if (!repeated)
					updateFollow(inRule, previous, current.nextB, true);
			}
		}

		return target.count != before;
	}

	// Remove all rules that can not be reached from the start rule.
	private void prune() {
		// Start by marking all known rules as unreachable. This also makes it easy to iterate
		// through all rules afterwards.
		Rule->Bool reachable;
		for (rule, prods in rules) {
			reachable.put(rule, false);
		}

		// Mark everything that is reachable from the start rule.
		prune(reachable, start);

		// Remove the ones that were not reachable.
		for (rule, found in reachable) {
			if (!found) {
				rules.remove(rule);
			}
		}
	}

	// Mark 'current' as reachable, and continue with all rules that are referred to from the
	// productions of 'current'.
	private void prune(Rule->Bool reachable, Rule current) {
		// Do not re-visit rules.
		if (reachable[current])
			return;
		reachable[current] = true;

		// Follow all rule tokens in all productions.
		for (prod in rules[current]) {
			for (token in prod.tokens) {
				if (ruleTok = token as RuleToken) {
					prune(reachable, ruleTok.rule);
				}
			}
		}
	}
}


/**
 * Generic grammar exception.
 *
 * Stores a message, and reports it with the prefix "Grammar error: ".
 */
class GrammarError extends CodeError {
	// Create the error. 'where' is passed on to the parent class, 'message' is stored as is.
	init(SrcPos where, Str message) {
		init(where) {
			message = message;
		}
	}

	// The message, without the "Grammar error: " prefix.
	Str message;

	// Output the message text to 'to'.
	void messageText(StrBuf to) : override {
		to << "Grammar error: ";
		to << message;
	}
}