File: Tokenizer.h

package info (click to toggle)
storm-lang 0.7.5-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 52,028 kB
sloc: ansic: 261,471; cpp: 140,432; sh: 14,891; perl: 9,846; python: 2,525; lisp: 2,504; asm: 860; makefile: 678; pascal: 70; java: 52; xml: 37; awk: 12
file content (180 lines) | stat: -rw-r--r-- 3,706 bytes
parent folder | download | duplicates (3)
#pragma once
#include "SrcPos.h"

/**
 * A single token produced by the tokenizer.
 *
 * Pairs the text of the token with the position in the source where the token starts. All
 * comparison operators only look at the text; the position is ignored.
 */
class Token {
public:
	// Create a token with the text 'token', starting at 'pos'.
	Token(const String &token, const SrcPos &pos)
		: token(token),
		  pos(pos) {}

	// The text of this token.
	String token;

	// Position in the source where this token starts.
	SrcPos pos;

	// Compare the text of two tokens. Positions are not compared.
	bool operator ==(const Token &o) const {
		return token == o.token;
	}
	bool operator !=(const Token &o) const {
		return token != o.token;
	}

	// Compare the text of this token against a plain string.
	bool operator ==(const String &o) const {
		return token == o;
	}
	bool operator !=(const String &o) const {
		return token != o;
	}

	// Is the text of this token empty?
	bool empty() const {
		return token.size() == 0;
	}

	// Is this token a string?
	bool isStr() const;

	// Extract the string from the token. Only call this when 'isStr()' holds.
	String strVal() const;
};

// Output a token to the stream 'to'. The output format is decided by the implementation, which is
// not part of this header.
wostream &operator <<(wostream &to, const Token &t);


/**
 * Comment generated by the tokenizer.
 *
 * A comment is described by a source string together with a 'begin' and an 'end' position. The
 * source string is not copied: only a pointer to it is stored, so the string passed to the
 * constructor has to outlive the Comment object.
 */
class Comment {
public:
	// Create an empty comment that does not refer to any source string.
	Comment() : src(null), fileId(0), begin(0), end(0) {}

	// Create a comment referring to the positions 'begin' to 'end' in 'src', which belongs to the
	// file 'fileId'. Only the address of 'src' is stored, so 'src' must remain alive for as long as
	// this object is used.
	Comment(const String &src, nat fileId, nat begin, nat end) : src(&src), fileId(fileId), begin(begin), end(end) {}

	// Create a string from this comment. This strips all formatting that is applied to the comment,
	// such as leading asterisks.
	String str() const;

	// No comment at all? True when 'begin' and 'end' coincide.
	inline bool empty() const { return begin == end; }

	// Any comment at all? Defined as the exact opposite of 'empty()', so that the two can never
	// disagree with each other.
	inline bool any() const { return !empty(); }

private:
	// Source string. Not owned by this object, null for a default-constructed comment.
	const String *src;

	// Source file.
	nat fileId;

	// Start and end position.
	nat begin;
	nat end;

	// State used when parsing.
	enum State {
		start,
		start2,
		done,

		// Single line comments.
		singleStart,
		singleInside,
		singleNewline,
		singleHalf,
		singleBefore,

		// Multi line comments.
		multiStart,
		multiInside,
		multiNewline,
		multiBefore,
	};

	// Parameters for the parsing.
	struct Params {
		// Number of spaces after the comment 'start'.
		nat space;

		// Current number of spaces.
		nat curr;

		// Number of empty lines encountered.
		nat empty;
	};

	// Helper for parsing comments. Called with the output stream 'to', the current 'state' and
	// parameters 'par' (both passed by reference, so they may be updated) and a character 'ch'.
	static const wchar_t *parse(std::wostringstream &to, State &state, Params &par, wchar_t ch);
};

// Output a comment to the stream 'to'. The output format is decided by the implementation, which
// is not part of this header.
wostream &operator <<(wostream &to, const Comment &c);


/**
 * Tokenizer for the contents of .bnf files.
 *
 * Splits the source into tokens, handling strings and basic operators (including ()[] and {}).
 * Comments are handled as well. Comments have the form // ... \n
 *
 * Instances are not copyable (see NoCopy).
 */
class Tokenizer : private NoCopy {
public:
	// Tokenize the data in 'SrcPos::files[pathId]'.
	Tokenizer(nat pathId);

	// Get the next token in the stream.
	// Throws an exception if the end of the stream has been reached.
	Token next();

	// Skip the current token.
	void skip();

	// Peek at the current token.
	Token peek();

	// Are there more tokens to get?
	bool more() const;

	// Get a token and check that it is the expected one, 's'.
	void expect(const String &s);

	// Skip the current token if it is 's'.
	bool skipIf(const String &s);

	// Get the last comment relevant to the current token.
	Comment comment() const;

	// Clear the current comment.
	void clearComment();

private:
	// The states of the tokenizer's state machine.
	enum State {
		sStart,
		sText,
		sOperator,
		sString,
		sComment,
		sMlComment,
		sPreproc,
		sPreprocExtend,
		sDone,
	};

	// The source string being tokenized.
	const String src;

	// Id of the source.
	nat pathId;

	// Current position.
	nat pos;

	// SrcPos corresponding to the current position.
	SrcPos srcPos;

	// Where the last comment starts and ends.
	nat commentBegin;
	nat commentEnd;

	// The next token that was found.
	Token nextToken;

	// Find the next token.
	Token findNext();

	// Perform one step of the state machine.
	void processChar(nat &start, State &state, bool &firstComment);

	// Advance a SrcPos.
	void advance(SrcPos &pos, nat from, nat to) const;
};