File: Lexer.h

package info
  devtodo 0.1.20+git20200830.0ad52b0-4
  • area: main
  • in suites: experimental
  • size: 968 kB
  • sloc: ansic: 5,307; cpp: 3,908; perl: 112; sh: 106; makefile: 45; csh: 2
file content (125 lines, 3,009 bytes)
#ifndef CRASH_LEXER
#define CRASH_LEXER

#include <cassert>
#include <stdexcept>
#include <string>
#include <map>
#include <iterator>
#include <ostream>
#include "Strings.h"
#include "Regex.h"

using namespace std;

/**
	Lexer is a regex-driven lexical analyser: token patterns are registered
	with addPattern()/ignorePattern() and matched tokens are read back through
	a forward iterator (a usage sketch follows the header).

	02/01/01	Fixed the bug where ignored tokens caused it to die horribly.
	20/09/00	Restarted after I couldn't find an annoying bug. The interface
				needed to be cleaned up anyway.
	24/08/00	Fixed a bug where the list of tokens wasn't being cleared 
				in between calls to the scanner.
	13/08/00	Created.
*/

class Lexer {
	public :
		class exception : public runtime_error {
			public :
				exception(string const &str, unsigned line) : runtime_error(str), _line(line) {}
				unsigned line() const { return _line; }
			private :
				unsigned _line;
		};

		class iterator {
			public :
				iterator() : tokeniser(0), in(0), _line(1) {}

				operator string const & () const { return _value; }
				iterator operator ++ (int) {
					iterator i = *this;
					get();
					return i;
				}

				iterator &operator ++ () {
					get();
					return *this;
				}

				bool operator != (iterator const &other) const {
					return in != other.in || tokeniser != other.tokeniser;
				}
				bool operator == (iterator const &other) const {
					return in == other.in && tokeniser == other.tokeniser;
				}

				int operator [] (unsigned index) const { return _value[index]; }
				unsigned type() const { return _type; }
				unsigned line() const { return _line; }
				unsigned size() const { return _value.size(); }
				string const &value() const { return _value; }
				char const *source() const { return in; }
				void skip(unsigned skip) {
					for (unsigned i = 0; i < skip; i++) {
						if (*in) {
							if (*in == '\n') _line++;
							in++;
						} else
							break;
					}
				}
			private :
				void get() {
					assert(in && tokeniser);
					if (!tokeniser->get(*this, in, _line))
						in = 0;
				}
				iterator(Lexer *t, char const *in) : tokeniser(t), in(in), _line(1) {
					if (in) get();
				}
				friend class Lexer;

				Lexer *tokeniser;
				string _value;
				char const *in;
				unsigned _type;
				unsigned _line;
				struct {
					int start, end;
				} match[50];
				int matches;
		};
		friend class iterator;

		Lexer();
		virtual ~Lexer();

		iterator begin(char const *in) { return iterator(this, in); }
		iterator end() { return iterator(this, 0); }

		Lexer &ignorePattern(unsigned index, char const *pattern) { return addPattern(index, pattern, true); }
		Lexer &addPattern(unsigned index, char const *pattern, bool ignore = false);

		void ignore(unsigned index, bool state = true);
		void enable(unsigned index, bool state = true);
	protected :
		enum { Character = 1000000 };

		struct Pattern {
			Regex rx;
			bool ignore, enabled;
		};
		map<int, Pattern> pattern;
		bool initialised;

		virtual bool get(iterator &it, char const *&in, unsigned &line);
};

inline ostream &operator << (ostream &out, Lexer::iterator const &it) {
	out << it.value();
	return out;
}

#endif
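
For orientation, here is a minimal usage sketch of the interface above. It assumes the implementation in Lexer.cc matches the input against the registered Regex patterns and yields one token per match, as the iterator interface suggests; the token indices, the example patterns, and the POSIX-style character-class syntax are illustrative assumptions, not taken from devtodo.

#include <iostream>
#include "Lexer.h"

// Hypothetical token indices chosen by the caller.
enum { Word = 1, Number = 2, Whitespace = 3 };

int main() {
	Lexer lexer;
	// addPattern() returns the Lexer itself, so registrations can be chained.
	lexer.addPattern(Word, "[a-zA-Z]+")
	     .addPattern(Number, "[0-9]+")
	     .ignorePattern(Whitespace, "[ \t\n]+");   // matched, presumably not yielded

	char const *input = "add 2 items";
	for (Lexer::iterator i = lexer.begin(input); i != lexer.end(); ++i)
		std::cout << i.line() << ": token " << i.type() << " = " << i << '\n';
	return 0;
}

The streaming at the end of the loop relies on the inline operator<< defined above, which prints the token's value; line() and type() come straight from the iterator interface.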