1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
|
////////////////////////////////////////////////////////////////////////////////////////////////////
// //
// tokenizer.h +----------------------+ //
// =========== | generic TOKENIZER | //
// +----------------------+ //
// Code: Benjamin Jurke, http://benjaminjurke.net //
// //
////////////////////////////////////////////////////////////////////////////////////////////////////
// //
// File history: //
// - 10.06.2009 File created as tokenizer.h //
// //
////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef INC_TOKENIZER_H
#define INC_TOKENIZER_H
#include <cstddef>
#include <stdint.h>
#include <vector>
#include <string>
////////////////////////////////////////////////////////////////////////////////////////////////////
// The basic goal of a tokenizer is to break some input string (i.e. a simple series of characters)
// down into individual tokens (like strings, numbers, special symbols), which can then easily be
// used for further processing.
//
// The CToken class represents one of the individual "tokens" into which the tokenizer breaks the
// input string; MyTokenType enumerates the kinds of token a CToken can hold.
// Tag describing what a CToken currently holds. The numeric values are fixed explicitly, so the
// order of the enumerators below carries no meaning.
enum MyTokenType
{
    // Control tokens (carry no data payload)
    TOKEN_ERR     = -1,   // type of a freshly constructed CToken
    TOKEN_END     = 0,    // type set by CToken::SetEndToken()

    // Data content
    TOKEN_SYMBOL  = 1,    // payload is a single char
    TOKEN_WORD    = 2,    // payload is a std::string, stored via StoreWord()
    TOKEN_INTEGER = 3,    // payload is an int64_t
    TOKEN_STRING  = 4     // payload is a std::string, stored via StoreString()
};
class CToken
{
friend class CTokenizer;
private:
MyTokenType tkType;
ptrdiff_t iInputOffset;
// Data variables
char symbol;
int64_t integer;
std::string str;
private:
inline CToken()
{ tkType = TOKEN_ERR; iInputOffset = 0; symbol = 0; integer = 0; };
// Data storage/insertion
inline void StoreSymbol(char cSymbol, ptrdiff_t offset)
{ symbol = cSymbol; tkType = TOKEN_SYMBOL; iInputOffset = offset; };
inline void StoreWord(const char *strWord, ptrdiff_t offset)
{ str = strWord; tkType = TOKEN_WORD; iInputOffset = offset; };
inline void StoreInteger(int64_t iInteger, ptrdiff_t offset)
{ integer = iInteger; tkType = TOKEN_INTEGER; iInputOffset = offset; };
inline void StoreString(const char *strString, ptrdiff_t offset)
{ str = strString; tkType = TOKEN_STRING; iInputOffset = offset; };
inline void SetEndToken(ptrdiff_t offset)
{ tkType = TOKEN_END; iInputOffset = offset; };
public:
// Comparision operator
friend bool operator!=(const CToken &lhs, const CToken &rhs);
friend inline bool operator==(const CToken &lhs, const CToken &rhs) { return (!(lhs != rhs)); };
// Type and control structures retrival
inline MyTokenType WhatType() const
{ return tkType; };
inline bool IsEndToken() const
{ return (WhatType() == TOKEN_END); }
inline ptrdiff_t GetInputOffset() const
{ return iInputOffset; }
// Data retrival
inline bool GetSymbol(char &cSymbol) const
{ if (tkType == TOKEN_SYMBOL) { cSymbol = symbol; return true; } return false; };
inline bool GetWord(std::string &strWord) const
{ if (tkType == TOKEN_WORD) { strWord = str; return true; } return false; };
inline bool GetInteger(int64_t &iInteger) const
{ if (tkType == TOKEN_INTEGER) { iInteger = integer; return true; } return false; };
inline bool GetString(std::string &strString) const
{ if (tkType == TOKEN_STRING) { strString = str; return true; } return false; };
bool GetBool(bool &bBool) const;
// For debugging purposes
std::string GetTokenString() const;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Holds a list of CToken objects together with a cursor (iCurToken) into that list. The list is
// presumably filled by TokenizeInputString() -- the implementation is not part of this header.
class CTokenizer
{
private:
    // Data variables
    // Raw, non-owning character pointers.
    // NOTE(review): if these point into the std::string passed to TokenizeInputString(), they
    // must not be used after that call returns -- confirm in the implementation file.
    const char *pInputLine;
    const char *pCurChar;
    std::vector<CToken> vTokens;   // the token list
    size_t iCurToken;              // index of the current token in vTokens

private:
    // Internal functions for reading tokens (defined outside this header; the bool results
    // presumably signal success -- confirm)
    bool ReadWord();
    bool ReadInteger();
    bool ReadString();
    bool ReadSymbol();
    bool ReadNextToken();
    void SkipWhitespaces();

public:
    CTokenizer();
    void Clear();

    // Data retrieval
    size_t GetNumberOfTokens() const { return vTokens.size(); }
    size_t GetCurTokenIndex() const { return iCurToken; }

    // Bounds-checked access: throws std::out_of_range (from std::vector::at) for a bad index.
    const CToken &GetToken(size_t index) const { return vTokens.at(index); }
    const CToken &GetCurToken() const { return GetToken(iCurToken); }

    // Returns the current token, then advances the cursor (POST-increment semantics).
    // The cursor only moves once the access has succeeded, so a std::out_of_range thrown at the
    // end of the list leaves the tokenizer state unchanged.
    const CToken &GetNextToken()
    {
        const CToken &tok = GetToken(iCurToken);
        ++iCurToken;
        return tok;
    }

    // Steps the cursor back by one and returns the token it then refers to (PRE-decrement
    // semantics). With the cursor at 0, 'iCurToken - 1' wraps to the largest size_t value, which
    // at() rejects with std::out_of_range; the cursor is left at 0 instead of staying wrapped.
    const CToken &GetPrevToken()
    {
        const CToken &tok = GetToken(iCurToken - 1);
        --iCurToken;
        return tok;
    }

    // Defined outside this header.
    // NOTE(review): presumably reads a delimited, separator-split list of integers starting at
    // the current token (default form "(a,b,c)") into out_list -- confirm.
    bool GetIntegerList(std::vector<int64_t> &out_list, char cBeginDelim = '(', char cSeperator = ',', char cEndDelim = ')');

    // Output functions
    void OutputTokenList() const;

    // Main function to initialize the tokenizer
    bool TokenizeInputString(const std::string &input);
};
#endif
|