File: parse.bs

package info (click to toggle)
storm-lang 0.7.0-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 51,832 kB
  • sloc: ansic: 261,420; cpp: 138,870; sh: 14,877; perl: 9,846; python: 2,525; lisp: 2,504; asm: 860; makefile: 678; pascal: 70; java: 52; xml: 37; awk: 12
file content (164 lines) | stat: -rw-r--r-- 6,880 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
use parser;

/**
 * Exception that reports a syntax error found while parsing Markdown text.
 *
 * Stores the position at which the problem was detected together with a description of it.
 */
class MarkdownError extends Exception {
	// Position in the parsed text where the error was detected.
	private Str:Iter pos;

	// Description of what went wrong.
	private Str msg;

	// Create the error from a position in the text and a description of the problem.
	init(Str:Iter pos, Str msg) {
		init {
			pos = pos;
			msg = msg;
		}
	}

	// Write the complete error message to `to`: a fixed header, the description, and finally
	// the stored position.
	void message(StrBuf to) : override {
		to << "Failed to parse markdown text:\n" << msg << "\n";
		to << "The error is near the |> in the text below:\n";
		to << pos;
	}
}

// Parse a string containing Markdown text. Returns a symbolic representation of the document.
// Throws a `MarkdownError` if the parser reports an error, or if it stops before having consumed
// all of the input.
Document parse(Str str) {
	// We require that the string ends in a newline. Extra newlines do not matter, so just add one
	// always. Keep the padded string in a variable: it is the string that is actually parsed, so
	// it is the one whose end we need to compare against below.
	Str input = str + "\n";
	var result = parser(input);
	if (error = result.error)
		throw MarkdownError(error.pos, error.message);

	// No error and no value should not happen.
	unless (v = result.value)
		throw InternalError("Should have got an error!");

	// Compare against the end of the parsed string rather than the end of `str`, so that we
	// never compare iterators that refer to two different strings.
	if (result.end != input.end)
		throw MarkdownError(result.end, "Not all data could be parsed.");
	return v;
}

// The parser itself: a backtracking recursive descent parser created by the `parser` library.
// Parsing starts in `SDocument`. Most rules receive the current indentation level as a
// parameter, so that nested constructs (e.g. list bodies) can require their content to be
// indented accordingly.
private parser : parser(backtracking recursive descent) {
	start = SDocument;

	// The entire document: a sequence of elements at indentation level zero.
	Document SDocument();
	SDocument => Document(content) : SElementSeq(0) content;

	// Zero or more elements at the specified indentation. Each element is added to the sequence
	// being created (`me`) by SElement.
	ElementSeq SElementSeq(Nat indent);
	SElementSeq => ElementSeq() : (SElement(me, indent))*;

	// Match an entire element.
	// The first part matches the indentation and delegates to SElementBaseline.
	void SElement(ElementSeq to, Nat indent);
	SElement[20] => to : parser.special.ExactIndent(indent) - SElementBaseline(indent) -> add;
	SElement[10] : "[ \t]*\n"; // Empty lines may have too little indentation.

	// Match the part after the expected indentation level.
	// Any additional indentation is matched by SExtraIndent, and the resulting indentation level
	// is passed on to SElementNoIndent.
	Element SElementBaseline(Nat indent);
	SElementBaseline[10] => x : SExtraIndent(indent) newIndent - SElementNoIndent(newIndent) x;
	// This is where we would detect things like code blocks, etc.

	// Match the part after all (possibly additional) indentation has been consumed.
	// Alternatives: hash headings, unordered lists, ordered lists, code blocks and plain text.
	Element SElementNoIndent(Nat indent);
	SElementNoIndent[90] => x : SHashHeading(indent) x;
	SElementNoIndent[80] => x : SUList(indent) x;
	SElementNoIndent[70] => x : SOList(indent) x;
	SElementNoIndent[60] => x : SCode(indent) x;
	SElementNoIndent[10] => x : SText(indent) x;

	// Match a code block.
	// The opening fence is "```" followed by an optional language name (anything except spaces
	// and newlines). The content, including the closing fence, is matched by SCodeContent.
	CodeBlock SCode(Nat indent);
	SCode => CodeBlock(language) : "```" - "[^\n ]*" language - " *\n" - SCodeContent(indent, me);

	// Match the lines of a code block and add them to `to`. Alternatives:
	// - the closing fence at the expected indentation, which ends the block,
	// - a line of content at the expected indentation, added using `addLine`,
	// - a line containing only whitespace (regardless of indentation), added as an empty line.
	void SCodeContent(Nat indent, CodeBlock to);
	SCodeContent[90] : parser.special.ExactIndent(indent) - "``` *\n";
	SCodeContent[50] => to : parser.special.ExactIndent(indent) - "[^\n]*" -> addLine - "\n" - SCodeContent(indent, to);
	SCodeContent[10] => to : "[ \t]*\n" - "" -> addLine - SCodeContent(indent, to);

	// Match a list. Note: We don't have to care about empty lines with "too little" indentation
	// here since the body matches SElement, which consumes them for us.
	// The first element starts where the list starts. Each following element must be preceded
	// by the indentation of the list. `List(false)` is used for unordered lists.
	List SUList(Nat indent);
	SUList => List(false) : SUListElem(indent, me) - (parser.special.ExactIndent(indent) - SUListElem(indent, me))*;

	// A single element in an unordered list: a marker followed by a body that is indented to
	// the level computed from the marker.
	void SUListElem(Nat indent, List list);
	SUListElem => list : SUListMarker(indent) newIndent - SListElemBody(newIndent) -> add;

	// Ordered list. Same structure as SUList, but created using `List(true)`.
	List SOList(Nat indent);
	SOList => List(true) : SOListElem(indent, me) - (parser.special.ExactIndent(indent) - SOListElem(indent, me))*;

	// A single element in an ordered list. Same structure as SUListElem.
	void SOListElem(Nat indent, List list);
	SOListElem => list : SOListMarker(indent) newIndent - SListElemBody(newIndent) -> add;

	// Body of a list element. The first element follows the marker directly, so its indentation
	// has already been consumed. Remaining elements are matched with their indentation.
	ElementSeq SListElemBody(Nat indent);
	SListElemBody => ElementSeq() : SElementBaseline(indent) -> add - (SElement(me, indent))*;

	// Match a list marker. Returns extra indentation level.
	// The result is computed by `addIndentation` from the current indentation and the text of
	// the marker (including the spaces after it).
	// Unordered lists: one of "+", "-" or "*", followed by at least one space.
	Nat SUListMarker(Nat indent);
	SUListMarker => addIndentation(indent, extra) : "[+\-*] +" extra;
	// Ordered lists: one or more digits followed by a period and at least one space.
	Nat SOListMarker(Nat indent);
	SOListMarker => addIndentation(indent, extra) : "[0-9]+\. +" extra;

	// Match a part of text, after indentation. Then figure out what it is.
	// A single line of text followed by an underline is a heading. Otherwise it is a paragraph.
	Element SText(Nat indent);
	SText[90] => heading : SFmtText(indent) text - "\n" - parser.special.MinIndent(indent) - SUnderline(text) heading;
	SText[80] => Paragraph(text) : SParText(indent) text;

	// Match a paragraph.
	// The first line starts after the indentation. The remaining lines are matched by
	// SFmtTextLine, which handles indentation itself.
	FormattedText SParText(Nat indent);
	SParText => FormattedText() : SFmtText(indent) -> add - "\n" - (SFmtTextLine(indent) -> addLine)*;

	// Match the underlined part of a heading.
	// A line of "=" creates a heading at level 1, a line of "-" creates a heading at level 2.
	Heading SUnderline(FormattedText text);
	SUnderline => Heading(1, text) : "=+\n";
	SUnderline => Heading(2, text) : "-+\n";

	// Match a hash heading (# heading)
	Element SHashHeading(Nat indent);
	SHashHeading => Heading(depth, text) : SHashDepth depth - " *" - SFmtText(indent) text - "\n";

	// Match the run of "#" that starts a hash heading. The depth is computed by calling `count`
	// on the matched text.
	Nat SHashDepth();
	SHashDepth => count(text) : "#+" text;

	// Match text, ending in a newline. Assumes that this is an entire line, and does not match new
	// lists etc.
	FormattedText SFmtTextLine(Nat indent);
	SFmtTextLine => FormattedText() : parser.special.MinIndent(indent) - SFmtSpanFirst(indent) -> add - (SFmtSpan(indent) -> add)* - "\n";

	// Match text, not including the newline at the end of the line. Assumes this is in the middle
	// of the line somewhere, and does not account for special formatting etc.
	FormattedText SFmtText(Nat indent);
	SFmtText => FormattedText() : (SFmtSpan(indent) -> add)+;

	// First span on a line that continues a paragraph. Like SFmtSpan, except that plain text is
	// not allowed to start with "-" or "+".
	TextSpan SFmtSpanFirst(Nat indent);
	SFmtSpanFirst[10] => x : SFmtSpanCommon(indent) x;
	SFmtSpanFirst[20] => PlainTextSpan(text) : "[^*`\n\[\]\-+\\][^\n*\[\]`\\]*" text; // May not start with list characters.

	// A span of text: either a formatted span, or plain text up to the next character that has a
	// special meaning (newline, "*", "[", "]", "`" or "\").
	TextSpan SFmtSpan(Nat indent);
	SFmtSpan[10] => x : SFmtSpanCommon(indent) x;
	SFmtSpan[20] => PlainTextSpan(text) : "[^\n*\[\]`\\]+" text;

	// Formatted spans: bold (**text**), italic (*text*), inline code (`code`), links
	// ([text](target)), and custom text ([text]).
	TextSpan SFmtSpanCommon(Nat indent);
	SFmtSpanCommon[90] => BoldText(text) : "\*\*" - SNestedText(indent) text - "\*\*";
	SFmtSpanCommon[80] => ItalicText(text) : "\*" - SNestedText(indent) text - "\*";
	SFmtSpanCommon[70] => InlineCode(text) : "`" - SInlineCode(indent) text - "`";
	SFmtSpanCommon[60] => Link(text, target) : "\[" - SNestedText(indent) text - "\] *(" - "[^\n)]*" target - ")";
	SFmtSpanCommon[50] => CustomText(text) : "\[" - "[^\n\]]+" text - "\]";
	// Escape special characters:
	SFmtSpanCommon[40] => PlainTextSpan(text) : "\\" - "." text;

	// Nested text that may contain a newline.
	// Lines after the first one are required to be indented at least to `indent`.
	FormattedText SNestedText(Nat indent);
	SNestedText[10] => FormattedText() : SFmtText(indent) -> add;
	SNestedText[50] => FormattedText() : SFmtText(indent) -> add - "\n" - parser.special.MinIndent(indent) - SNestedText(indent) -> addLine;

	// Nested inline code that may contain a newline, but no other spans. Also required to contain
	// at least one character, otherwise it will be mis-interpreted as a code block.
	StrBuf SInlineCode(Nat indent);
	SInlineCode => StrBuf() : "[^`\n]+" -> add - ("\n" -> add - parser.special.MinIndent(indent) - "[^`\n]*" -> add)*;

	// Extra indentation.
	// Returns the sum of `original` and the value produced by `parser.special.Indent()`.
	Nat SExtraIndent(Nat original);
	SExtraIndent => +(original, extra) : parser.special.Indent() extra;
}

// Compute the indentation level that results from adding the text in `extra` (e.g. a list marker)
// to the indentation level `original`. The width of `extra` is what `extra.count()` reports.
package Nat addIndentation(Nat original, Str extra) {
	var markerWidth = extra.count();
	return original + markerWidth;
}