File: tokenizer.h

package info (click to toggle)
cohomcalg 0.32%2Bds-6
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 2,008 kB
  • sloc: cpp: 3,291; makefile: 46; ansic: 17
file content (150 lines) | stat: -rw-r--r-- 5,783 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
////////////////////////////////////////////////////////////////////////////////////////////////////
//                                                                                                //
//  tokenizer.h                                                        +----------------------+   //
//  ===========                                                        |  generic TOKENIZER   |   //
//                                                                     +----------------------+   //
//  Code: Benjamin Jurke, http://benjaminjurke.net                                                //
//                                                                                                //
////////////////////////////////////////////////////////////////////////////////////////////////////
//                                                                                                //
//  File history:                                                                                 //
//        - 10.06.2009  File created as tokenizer.h                                               //
//                                                                                                //
////////////////////////////////////////////////////////////////////////////////////////////////////


#ifndef INC_TOKENIZER_H
#define INC_TOKENIZER_H


#include <cstddef>
#include <stdint.h>
#include <vector>
#include <string>


////////////////////////////////////////////////////////////////////////////////////////////////////

// The basic goal of a tokenizer is to break some input string (i.e. a simple series of characters)
// down into individual tokens (like strings, numbers, special symbols), which can then be easily used
// for further processing.
//
// The CToken class corresponds to the individual "tokens" to which the tokenizer breaks down the input
// string. 


// Kinds of token a CToken can hold. Every enumerator has an explicitly fixed numeric value;
// they are listed here in ascending numeric order.
enum MyTokenType
{
    // Control tokens
    TOKEN_ERR     = -1,   // type a default-constructed CToken starts out with
    TOKEN_END     = 0,    // type set by CToken::SetEndToken / tested by CToken::IsEndToken

    // Data content
    TOKEN_SYMBOL  = 1,    // payload is a single char
    TOKEN_WORD    = 2,    // payload is a std::string
    TOKEN_INTEGER = 3,    // payload is an int64_t
    TOKEN_STRING  = 4     // payload is a std::string
};

class CToken
{
  friend class CTokenizer;

  private:
    MyTokenType tkType;
    ptrdiff_t iInputOffset;

    // Data variables
    char symbol;
    int64_t integer;
    std::string str;

  private:
    inline CToken()
        { tkType = TOKEN_ERR; iInputOffset = 0; symbol = 0; integer = 0; };

    // Data storage/insertion
    inline void StoreSymbol(char cSymbol, ptrdiff_t offset)
         { symbol = cSymbol;   tkType = TOKEN_SYMBOL;  iInputOffset = offset; };
    inline void StoreWord(const char *strWord, ptrdiff_t offset)
         { str = strWord;      tkType = TOKEN_WORD;    iInputOffset = offset; };
    inline void StoreInteger(int64_t iInteger, ptrdiff_t offset)
         { integer = iInteger; tkType = TOKEN_INTEGER; iInputOffset = offset; };
    inline void StoreString(const char *strString, ptrdiff_t offset)
         { str = strString;    tkType = TOKEN_STRING;  iInputOffset = offset; };
    inline void SetEndToken(ptrdiff_t offset)
        { tkType = TOKEN_END; iInputOffset = offset; };

  public:
    // Comparision operator
    friend        bool operator!=(const CToken &lhs, const CToken &rhs);
    friend inline bool operator==(const CToken &lhs, const CToken &rhs) { return (!(lhs != rhs)); };

    // Type and control structures retrival
    inline MyTokenType WhatType() const  
        { return tkType; };
    inline bool      IsEndToken() const     
        { return (WhatType() == TOKEN_END); }
    inline ptrdiff_t GetInputOffset() const 
        { return iInputOffset; }

    // Data retrival
    inline bool GetSymbol(char &cSymbol) const
         { if (tkType == TOKEN_SYMBOL)  { cSymbol = symbol;   return true; } return false; };
    inline bool GetWord(std::string &strWord) const
         { if (tkType == TOKEN_WORD)    { strWord = str;      return true; } return false; };
    inline bool GetInteger(int64_t &iInteger) const
         { if (tkType == TOKEN_INTEGER) { iInteger = integer; return true; } return false; };
    inline bool GetString(std::string &strString) const
         { if (tkType == TOKEN_STRING)  { strString = str;    return true; } return false; };
    bool        GetBool(bool &bBool) const;

    // For debugging purposes
    std::string GetTokenString() const;
};


////////////////////////////////////////////////////////////////////////////////////////////////////


// Holds a list of CToken objects together with a cursor (iCurToken) into that list.
// The list is filled via TokenizeInputString(); the Read* helpers, the constructor and Clear()
// are defined outside this header.
class CTokenizer
{
  private:
    // Data variables
    // NOTE(review): presumably the input being tokenized and the current read position in it -
    // confirm against the implementation file.
    const char *pInputLine;
    const char *pCurChar;
    std::vector<CToken> vTokens;    // the stored tokens
    size_t iCurToken;               // cursor used by the Get{Cur,Next,Prev}Token accessors

    // Internal functions for reading tokens
    bool ReadWord();
    bool ReadInteger();
    bool ReadString();
    bool ReadSymbol();
    bool ReadNextToken();
    void SkipWhitespaces();

  public:
    CTokenizer();
    void Clear();

    // Data retrieval
    size_t GetNumberOfTokens() const
    {
        return vTokens.size();
    }

    size_t GetCurTokenIndex() const
    {
        return iCurToken;
    }

    // Bounds-checked access: std::vector::at throws std::out_of_range for an invalid index.
    const CToken &GetToken(size_t index) const
    {
        return vTokens.at(index);
    }

    // Token under the cursor; the cursor is not moved.
    const CToken &GetCurToken() const
    {
        return vTokens.at(iCurToken);
    }

    // Returns the token under the cursor and advances the cursor afterwards (POST-increment).
    // The cursor is advanced before the bounds check, so it stays advanced even if at() throws.
    const CToken &GetNextToken()
    {
        const size_t iReturned = iCurToken;
        ++iCurToken;
        return vTokens.at(iReturned);
    }

    // Moves the cursor back by one first and returns the token it then points to (PRE-decrement).
    // With the cursor at 0 the unsigned decrement wraps around and at() throws; the cursor keeps
    // the wrapped value in that case.
    const CToken &GetPrevToken()
    {
        --iCurToken;
        return vTokens.at(iCurToken);
    }

    // Defined outside this header; the delimiters default to '(' ',' ')'.
    bool GetIntegerList(std::vector<int64_t> &out_list, char cBeginDelim = '(', char cSeperator = ',', char cEndDelim = ')');

    // Output functions
    void OutputTokenList() const;

    // Main function to initialize the tokenizer
    bool TokenizeInputString(const std::string &input);
};


#endif