File: parse.bs

package info (click to toggle)
storm-lang 0.7.0-1
links: PTS, VCS
area: main
in suites:
size: 51,832 kB
sloc: ansic: 261,420; cpp: 138,870; sh: 14,877; perl: 9,846; python: 2,525; lisp: 2,504; asm: 860; makefile: 678; pascal: 70; java: 52; xml: 37; awk: 12
file content (164 lines) | stat: -rw-r--r-- 6,880 bytes
parent folder | download | duplicates (3)
use parser;

/**
 * Exception raised when the Markdown text handed to the parser is not syntactically valid.
 *
 * Carries the position associated with the failure and a description of it. Both are included in
 * the text produced by `message`.
 */
class MarkdownError extends Exception {
	// Position in the text that the error refers to.
	private Str:Iter pos;

	// Description of what went wrong.
	private Str msg;

	// Create an error that refers to `pos` and is described by `msg`.
	init(Str:Iter pos, Str msg) {
		init {
			pos = pos;
			msg = msg;
		}
	}

	// Write the complete error text to `to`: a fixed header, the description, and finally the
	// stored position (the text states that the position is marked with `|>`).
	void message(StrBuf to) : override {
		to << "Failed to parse markdown text:\n" << msg << "\n";
		to << "The error is near the |> in the text below:\n";
		to << pos;
	}
}

// Parse a string containing Markdown text. Returns a symbolic representation of the document.
// Throws a `MarkdownError` if the parser reports an error, or if it stops before the end of the
// input. Throws an `InternalError` if the parser reports neither an error nor a value.
Document parse(Str str) {
	// We require that the string ends in a newline. Extra newlines do not matter, so just add one
	// always. The padded string is kept in a variable since it is the string that is actually
	// parsed, and thereby the one the end-of-input check below needs to refer to.
	Str input = str + "\n";
	var result = parser(input);
	if (error = result.error)
		throw MarkdownError(error.pos, error.message);

	unless (v = result.value)
		throw InternalError("Should have got an error!");

	// Compare against the end of the parsed string rather than the end of `str`, so that the check
	// does not depend on how iterators into two different strings compare.
	if (result.end != input.end)
		throw MarkdownError(result.end, "Not all data could be parsed.");
	return v;
}

// The parser itself: a backtracking recursive descent parser whose start rule, SDocument, produces
// a Document. Most rules take an `indent` parameter, which is the indentation level that the
// surrounding context expects, and pass it on to the rules they use.
// NOTE(review): the bracketed numbers on productions look like priorities that decide which
// alternative is tried first -- confirm the exact ordering against the parser documentation.
private parser : parser(backtracking recursive descent) {
	start = SDocument;

	// A document is a sequence of elements at indentation level zero.
	Document SDocument();
	SDocument => Document(content) : SElementSeq(0) content;

	// Zero or more elements, all matched with the same expected indentation. Each SElement receives
	// the sequence being built (`me`) as its first parameter.
	ElementSeq SElementSeq(Nat indent);
	SElementSeq => ElementSeq() : (SElement(me, indent))*;

	// Match an entire element.
	// The first part matches the indentation and delegates to SElementBaseline.
	// The second alternative matches a line with only spaces and tabs without producing anything.
	void SElement(ElementSeq to, Nat indent);
	SElement[20] => to : parser.special.ExactIndent(indent) - SElementBaseline(indent) -> add;
	SElement[10] : "[ \t]*\n"; // Empty lines may have too little indentation.

	// Match the part after the expected indentation level.
	// Any additional indentation is matched by SExtraIndent, and the resulting level is passed on.
	Element SElementBaseline(Nat indent);
	SElementBaseline[10] => x : SExtraIndent(indent) newIndent - SElementNoIndent(newIndent) x;
	// This is where we would detect things like code blocks, etc.

	// Match the part after all (possibly additional) indentation has been consumed.
	// Alternatives: hash heading, unordered list, ordered list, fenced code block, and text.
	Element SElementNoIndent(Nat indent);
	SElementNoIndent[90] => x : SHashHeading(indent) x;
	SElementNoIndent[80] => x : SUList(indent) x;
	SElementNoIndent[70] => x : SOList(indent) x;
	SElementNoIndent[60] => x : SCode(indent) x;
	SElementNoIndent[10] => x : SText(indent) x;

	// Match a code block.
	// The opening line is three backticks followed by the language (anything except space and
	// newline, possibly empty), optional spaces, and a newline. The rest is matched by SCodeContent.
	CodeBlock SCode(Nat indent);
	SCode => CodeBlock(language) : "```" - "[^\n ]*" language - " *\n" - SCodeContent(indent, me);

	// Match the lines of a code block, including the closing line. Three alternatives:
	// [90] the closing three backticks at the expected indentation, which ends the recursion,
	// [50] a line at the expected indentation, whose text is given to `addLine`,
	// [10] a line with only spaces and tabs, for which an empty string is given to `addLine`.
	void SCodeContent(Nat indent, CodeBlock to);
	SCodeContent[90] : parser.special.ExactIndent(indent) - "``` *\n";
	SCodeContent[50] => to : parser.special.ExactIndent(indent) - "[^\n]*" -> addLine - "\n" - SCodeContent(indent, to);
	SCodeContent[10] => to : "[ \t]*\n" - "" -> addLine - SCodeContent(indent, to);

	// Match a list. Note: We don't have to care about empty lines with "too little" indentation
	// here since the body matches SElement, which consumes them for us.
	// The first element is matched without indentation (this rule is used from SElementNoIndent),
	// while each of the following elements is preceded by ExactIndent.
	List SUList(Nat indent);
	SUList => List(false) : SUListElem(indent, me) - (parser.special.ExactIndent(indent) - SUListElem(indent, me))*;

	// One element in an unordered list: the marker, followed by a body that is matched using the
	// indentation level computed from the marker. The body is added to `list`.
	void SUListElem(Nat indent, List list);
	SUListElem => list : SUListMarker(indent) newIndent - SListElemBody(newIndent) -> add;

	// Ordered list. Same structure as SUList, but creates List(true) and uses SOListElem.
	List SOList(Nat indent);
	SOList => List(true) : SOListElem(indent, me) - (parser.special.ExactIndent(indent) - SOListElem(indent, me))*;

	// One element in an ordered list. Same structure as SUListElem, but uses SOListMarker.
	void SOListElem(Nat indent, List list);
	SOListElem => list : SOListMarker(indent) newIndent - SListElemBody(newIndent) -> add;

	// Body of a list element. The first element follows the marker directly, so no indentation is
	// matched for it. Any further elements are matched by SElement with the new indentation.
	ElementSeq SListElemBody(Nat indent);
	SListElemBody => ElementSeq() : SElementBaseline(indent) -> add - (SElement(me, indent))*;

	// Match a list marker. Returns extra indentation level.
	// The matched text (the marker and the spaces after it) is passed to `addIndentation` together
	// with the current level. Unordered markers are one of `+`, `-` and `*`. Ordered markers are
	// digits followed by a period. Both require at least one space afterwards.
	Nat SUListMarker(Nat indent);
	SUListMarker => addIndentation(indent, extra) : "[+\-*] +" extra;
	Nat SOListMarker(Nat indent);
	SOListMarker => addIndentation(indent, extra) : "[0-9]+\. +" extra;

	// Match a part of text, after indentation. Then figure out what it is.
	// [90] is a line of text followed by a line matched by SUnderline, which produces a heading.
	// [80] is a paragraph.
	Element SText(Nat indent);
	SText[90] => heading : SFmtText(indent) text - "\n" - parser.special.MinIndent(indent) - SUnderline(text) heading;
	SText[80] => Paragraph(text) : SParText(indent) text;

	// Match a paragraph.
	// The first line is matched by SFmtText (the indentation before it is already consumed). Any
	// following lines are matched by SFmtTextLine and given to `addLine`.
	FormattedText SParText(Nat indent);
	SParText => FormattedText() : SFmtText(indent) -> add - "\n" - (SFmtTextLine(indent) -> addLine)*;

	// Match the underlined part of a heading.
	// A line of `=` creates Heading(1, text), and a line of `-` creates Heading(2, text).
	Heading SUnderline(FormattedText text);
	SUnderline => Heading(1, text) : "=+\n";
	SUnderline => Heading(2, text) : "-+\n";

	// Match a hash heading (# heading)
	// Consists of the hashes (matched by SHashDepth), optional spaces, the text, and a newline.
	Element SHashHeading(Nat indent);
	SHashHeading => Heading(depth, text) : SHashDepth depth - " *" - SFmtText(indent) text - "\n";

	// Match one or more `#` and pass the matched text to `count` to get the heading depth.
	Nat SHashDepth();
	SHashDepth => count(text) : "#+" text;

	// Match text, ending in a newline. Assumes that this is an entire line, and does not match new
	// lists etc.
	// The line starts with MinIndent, and its first span is matched by SFmtSpanFirst.
	FormattedText SFmtTextLine(Nat indent);
	SFmtTextLine => FormattedText() : parser.special.MinIndent(indent) - SFmtSpanFirst(indent) -> add - (SFmtSpan(indent) -> add)* - "\n";

	// Match one or more spans of text. Assumes this is in the middle of the line somewhere, and
	// does not account for special formatting etc. The newline that ends the line is not matched
	// here: the rules that use SFmtText match it themselves.
	FormattedText SFmtText(Nat indent);
	SFmtText => FormattedText() : (SFmtSpan(indent) -> add)+;

	// The first span on a line matched by SFmtTextLine. Like SFmtSpan, but plain text may not
	// start with `-` or `+` either.
	TextSpan SFmtSpanFirst(Nat indent);
	SFmtSpanFirst[10] => x : SFmtSpanCommon(indent) x;
	SFmtSpanFirst[20] => PlainTextSpan(text) : "[^*`\n\[\]\-+\\][^\n*\[\]`\\]*" text; // May not start with list characters.

	// A span of text: either one of the formatted spans in SFmtSpanCommon, or plain text. Plain
	// text is at least one character, and may not contain newline, `*`, `[`, `]`, backtick, or
	// backslash.
	TextSpan SFmtSpan(Nat indent);
	SFmtSpan[10] => x : SFmtSpanCommon(indent) x;
	SFmtSpan[20] => PlainTextSpan(text) : "[^\n*\[\]`\\]+" text;

	// Spans that are available both first on a line and elsewhere:
	// [90] bold text: **text**
	// [80] italic text: *text*
	// [70] inline code: `code`
	// [60] link: [text] followed by optional spaces and (target)
	// [50] custom text: [text]
	// [40] escaped character: a backslash followed by one character.
	TextSpan SFmtSpanCommon(Nat indent);
	SFmtSpanCommon[90] => BoldText(text) : "\*\*" - SNestedText(indent) text - "\*\*";
	SFmtSpanCommon[80] => ItalicText(text) : "\*" - SNestedText(indent) text - "\*";
	SFmtSpanCommon[70] => InlineCode(text) : "`" - SInlineCode(indent) text - "`";
	SFmtSpanCommon[60] => Link(text, target) : "\[" - SNestedText(indent) text - "\] *(" - "[^\n)]*" target - ")";
	SFmtSpanCommon[50] => CustomText(text) : "\[" - "[^\n\]]+" text - "\]";
	// Escape special characters:
	SFmtSpanCommon[40] => PlainTextSpan(text) : "\\" - "." text;

	// Nested text that may contain a newline.
	// [10] is text on a single line. [50] is text followed by a newline, MinIndent, and more nested
	// text (recursively), where the remainder is given to `addLine`.
	FormattedText SNestedText(Nat indent);
	SNestedText[10] => FormattedText() : SFmtText(indent) -> add;
	SNestedText[50] => FormattedText() : SFmtText(indent) -> add - "\n" - parser.special.MinIndent(indent) - SNestedText(indent) -> addLine;

	// Nested inline code that may contain a newline, but no other spans. Also required to contain
	// at least one character, otherwise it will be mis-interpreted as a code block.
	// Newlines are added to the result, but the text matched by MinIndent after them is not.
	StrBuf SInlineCode(Nat indent);
	SInlineCode => StrBuf() : "[^`\n]+" -> add - ("\n" -> add - parser.special.MinIndent(indent) - "[^`\n]*" -> add)*;

	// Extra indentation.
	// Adds whatever parser.special.Indent() produces to `original`.
	// NOTE(review): presumably Indent() yields the number of indentation steps it matched -- confirm.
	Nat SExtraIndent(Nat original);
	SExtraIndent => +(original, extra) : parser.special.Indent() extra;
}

// Compute a new indentation level: `original` increased by `extra.count()`. Used by the list
// marker rules in the parser, which pass the matched marker text as `extra`.
package Nat addIndentation(Nat original, Str extra) {
	var width = extra.count();
	return original + width;
}