File: Tokenizer.h

package info (click to toggle)
storm-lang 0.7.4-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 52,004 kB
  • sloc: ansic: 261,462; cpp: 140,405; sh: 14,891; perl: 9,846; python: 2,525; lisp: 2,504; asm: 860; makefile: 678; pascal: 70; java: 52; xml: 37; awk: 12
file content (125 lines) | stat: -rw-r--r-- 2,564 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#pragma once
#include "Core/Str.h"
#include "Core/SrcPos.h"

namespace storm {
	namespace syntax {

		/**
		 * Tokenizer for parsing the syntax language. This is not designed for being accessed from
		 * Storm.
		 */

		/**
		 * A token from the parser. Do not store these anywhere except on the stack.
		 */
		class Token {
		public:
			// Create a token covering 'len' characters of 'str'. The start offset into
			// 'str' is carried by 'pos' (see the note on the private members below).
			Token(Str *str, Nat len, const SrcPos &pos);

			// Position of this token in the source. Also stores the start offset of the
			// token's text inside 'str'.
			SrcPos pos;

			// Empty token?
			bool empty() const;

			// Is this token equal to 'str'?
			bool operator ==(const wchar *str) const;
			bool operator !=(const wchar *str) const { return !(*this == str); }

			// Is this a string literal?
			bool isStrLiteral() const;

			// Get the string literal without quotes.
			Str *strLiteral() const;

			// Return a Str representing this token.
			Str *toS() const;

		private:
			// Contents of this token: 'len' characters of 'str'. The start offset
			// ('start') is stored inside 'pos', not duplicated here.
			Str *str;
			Nat len;

			// Get the start pointer into the underlying string data.
			const wchar *start() const;
		};

		// Output a token to a wide output stream.
		wostream &operator <<(wostream &to, const Token &t);


		/**
		 * Tokenizer. Handles strings and basic operators (including ()[] and {}). Also handles
		 * comments of the form // ... \n
		 */
		class Tokenizer : NoCopy {
		public:
			// Tokenize data in 'src' from character offset 'start', assuming the
			// content comes from the file 'path' (used for source positions).
			Tokenizer(Url *path, Str *src, Nat start);

			// Get the next token in the stream and advance. Throws an exception if the
			// end of stream has been reached.
			Token next();

			// Peek the current token without consuming it.
			Token peek() const;

			// Skip one token.
			void skip();

			// Skip a token if it is 'token'. Presumably returns whether a token was
			// skipped — confirm against the implementation.
			bool skipIf(const wchar *token);

			// Expect a token: consume it if it matches 'token', otherwise signal an
			// error (NOTE(review): error behavior not visible in this header — verify).
			void expect(const wchar *token);

			// More tokens to get?
			bool more() const;

			// Current position in the source.
			SrcPos position() const;

			// Position of the last comment block. Returns SrcPos() if none.
			SrcPos comment() const;

			// Clear the current comment, so that 'comment' reports none.
			void clearComment();

		private:
			// Source string being tokenized.
			Str *src;

			// Source file, used when constructing SrcPos values.
			Url *file;

			// Current position (character offset into 'src').
			Nat pos;

			// Start of the last comment block. Set to Nat(-1) if none.
			Nat commentStart;

			// Sentinel meaning "no comment" (the Nat(-1) used by 'commentStart').
			static Nat invalid;

			// Different states of the tokenizer's scanning state machine.
			enum State {
				sStart,    // before a token has begun
				sText,     // inside an identifier/word token
				sOperator, // inside an operator token
				sString,   // inside a string literal
				sComment,  // inside a // comment
				sDone,     // token finished
			};

			// The next token (one-token lookahead buffer used by 'peek').
			Token lookahead;

			// Find the next token by running the state machine over 'src'.
			Token findNext();

			// Do one step in the state machine. Updates the token start offset,
			// current state, and whether we are in the first comment of a block.
			void processChar(Nat &start, State &state, bool &firstComment);
		};

	}
}