File: tokenizer.cpp

package info (click to toggle)
cohomcalg 0.32%2Bds-6
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, sid, trixie
  • size: 2,008 kB
  • sloc: cpp: 3,291; makefile: 46; ansic: 17
file content (379 lines) | stat: -rw-r--r-- 13,260 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
////////////////////////////////////////////////////////////////////////////////////////////////////
//                                                                                                //
//  tokenizer.cpp                                                      +----------------------+   //
//  =============                                                      |  generic TOKENIZER   |   //
//                                                                     +----------------------+   //
//  Code: Benjamin Jurke, http://benjaminjurke.net                                                //
//                                                                                                //
////////////////////////////////////////////////////////////////////////////////////////////////////
//                                                                                                //
//  File history:                                                                                 //
//        - 04.06.2009  File created as tokenizer.cpp                                             //
//                      Contains a simple generic tokenizer class to read in a string and break   //
//                      it down to its tokens for further parsing.                                //
//        - 17.04.2010  Changed WORD tokens from alpha to alphanums                               //
//                                                                                                //
////////////////////////////////////////////////////////////////////////////////////////////////////


#include <cstdlib>
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <stdlib.h>
#include <errno.h>
#include "platform.h"
#include "tokenizer.h"
#include "main.h"

using namespace std;


////////////////////////////////////////////////////////////////////////////////////////////////////


// This macro (yeah, I know... spare me the lecture...) determines which characters we interpret as
// whitespaces, i.e. spaces, tabs, newline and carriage returns. The argument is fully parenthesized
// so that compound expressions (e.g. a ternary) expand correctly. Note that the argument is still
// evaluated up to four times and therefore must be free of side effects.
#define IS_WHITESPACE(ch_)  (((ch_) == ' ') || ((ch_) == '\t') || ((ch_) == '\n') || ((ch_) == '\r'))


////////////////////////////////////////////////////////////////////////////////////////////////////


bool operator!=(const CToken &lhs, const CToken &rhs)
{
    /* Inequality test for two tokens. Only the token type and the data belonging to that
       type are taken into account; the InputOffset is deliberately NOT compared. */

    // Tokens of different type can never be equal
    if (lhs.tkType != rhs.tkType)
        return true;

    // Both tokens share the same type, so the result depends on the data of that type
    bool bDiffer = false;
    switch (lhs.tkType)
    {
        case TOKEN_SYMBOL:
            bDiffer = (lhs.symbol != rhs.symbol);
            break;

        case TOKEN_INTEGER:
            bDiffer = (lhs.integer != rhs.integer);
            break;

        case TOKEN_WORD:
        case TOKEN_STRING:
            bDiffer = (lhs.str.compare(rhs.str) != 0);
            break;

        case TOKEN_ERR:
        case TOKEN_END:
            // No data is compared for these types, i.e. equal type means equal token
            break;
    }

    return bDiffer;
}

bool CToken::GetBool(bool &bBool) const
{
    /* As there is no fundamental boolean token, we try to interpret the current token as
       boolean. Therefore, we consider the following values:
        true  -->  WORD:    true (case-inv.)    false  -->  WORD:    false (case-inv.)
                   STRING:  true (case-inv.)           -->  STRING:  false (case-inv.)
                   INTEGER: non-zero value             -->  INTEGER: zero value
       Tokens of any other type cannot be converted to a boolean value and therefore
       return false. */

    switch (tkType)
    {
        case TOKEN_WORD:
        case TOKEN_STRING:
            // We try to recognize either 'true' or 'false' from the string-like tokens
            {
                if (str.length() > 5) return false;
                string strTemp = str;
                transform(strTemp.begin(), strTemp.end(), strTemp.begin(), (int (*) (int)) tolower);
                if ((strTemp.compare("false") == 0) || (strTemp.compare("0") == 0))
                {
                    bBool = false;
                    return true;
                }
                else if ((strTemp.compare("true") == 0) || (strTemp.compare("1") == 0))
                {
                    bBool = true;
                    return true;
                }
                return false;
            }

        case TOKEN_INTEGER:
            // From numerical tokens, we recognize the non-zero values as true
            bBool = (integer != 0);
            return true;

        case TOKEN_SYMBOL:
        case TOKEN_END:
        case TOKEN_ERR:
            // All other tokens cannot be converted to boolean value
            return false;
    }
    return false;
}

string CToken::GetTokenString() const
{
    /* This function converts a token into human readable form, i.e. it returns a string
       consisting of the token type followed by the token value (if the type has one).
       Unknown token types yield the text "--INVALID TOKEN--". */

    // Only the fixed-size descriptions go through this buffer; it starts out as an empty
    // string so that a valid result is returned even if the formatting should fail
    char buf[128] = "";
    switch (tkType)
    {
        // The symbol is converted to unsigned char before printing its code, otherwise a
        // negative char value would be sign-extended (e.g. 0xffffffe9 instead of 0xe9)
        case TOKEN_SYMBOL:  safe_sprintf(buf, sizeof(buf), "SYMBOL:  %c  (0x%x)", symbol, (unsigned int) (unsigned char) symbol); break;

        // The text of WORD and STRING tokens has arbitrary length, so it is appended as a
        // string instead of being squeezed (and cut off) in the fixed-size buffer
        case TOKEN_WORD:    return string("WORD:    ") + str.c_str();
        case TOKEN_STRING:  return string("STRING:  ") + str.c_str();

        case TOKEN_INTEGER: safe_sprintf(buf, sizeof(buf), "INTEGER: %ld", (long int) integer); break;
        case TOKEN_END:     safe_sprintf(buf, sizeof(buf), "END"); break;
        case TOKEN_ERR:     safe_sprintf(buf, sizeof(buf), "ERR"); break;
        default:            safe_sprintf(buf, sizeof(buf), "--INVALID TOKEN--"); break;
    }
    return buf;
}


////////////////////////////////////////////////////////////////////////////////////////////////////


CTokenizer::CTokenizer()
{
    // A new tokenizer starts out in the same state that Clear() establishes
    this->Clear();
}

void CTokenizer::Clear()
{
    /* Resets the tokenizer: the token list is emptied, the current token index is set
       back to the start and the pointers into the input data are set to NULL. */

    vTokens.clear();
    iCurToken = 0;
    pInputLine = pCurChar = NULL;
}


bool CTokenizer::ReadWord()
{
    /* This internal function tries to read a WORD type token at the current position of the
       input data. A WORD token is specified to be any non-separated alphanumeric sequence of
       characters not starting with a number. On success the token is appended to the token
       list, the current position is moved behind the word and true is returned. If the
       current character is not alphabetic, nothing is changed and false is returned. */

    // Determine the length of the alphanumeric sequence. The characters are passed to the
    // <cctype> functions as unsigned char, because their behaviour is undefined for
    // negative values (i.e. for non-ASCII bytes where char is signed).
    if (!isalpha(static_cast<unsigned char>(pCurChar[0])))
        return false;
    ptrdiff_t len = 1;
    while (isalnum(static_cast<unsigned char>(pCurChar[len])))
        len++;

    // Create a duplicate of the WORD token string
    string strTmp;
    strTmp.assign(pCurChar, len);

    // Create a new token and add to token list
    CToken TmpToken;
    TmpToken.StoreWord(strTmp.c_str(), pCurChar - pInputLine);
    vTokens.push_back(TmpToken);

    pCurChar += len;

    return true;
}

bool CTokenizer::ReadInteger()
{
    /* This internal function tries to read a non-negative INTEGER type token at the current
       position of the input data. Note that the calling function has to take care of a
       potential minus sign in front of this number. On success the token is appended to the
       token list, the current position is moved behind the digits and true is returned.
       If there is no digit at the current position, or if the conversion reports a range
       error (the number does not fit into a signed 64-bit variable), the return value is
       false and neither the token list nor the current position is changed. */

    // We determine the length of the numeric sequence. The characters are passed to
    // isdigit() as unsigned char, because its behaviour is undefined for negative values.
    if (!isdigit(static_cast<unsigned char>(pCurChar[0])))
        return false;
    ptrdiff_t len = 1;
    while (isdigit(static_cast<unsigned char>(pCurChar[len])))
        len++;

    // Convert the number. errno has to be reset first: it is only ever set on failure, so
    // a stale ERANGE left over from an earlier call would otherwise reject a valid number.
    errno = 0;
    const int64_t iTmp = string_to_int64(pCurChar);
    if (errno == ERANGE)
    {
        // In case the number is larger than 64 bit
        return false;
    }

    // Create a new token and add to the token list
    CToken TmpToken;
    TmpToken.StoreInteger(iTmp, pCurChar - pInputLine);
    vTokens.push_back(TmpToken);

    pCurChar += len;

    return true;
}

bool CTokenizer::ReadString()
{
    /* This internal function tries to read a STRING type token, which is any sequence
       starting and ending with a '"'. Therefore it is currently not possible to have a
       '"' character in the string. On success the token (without the enclosing quotes) is
       appended to the token list, the current position is moved behind the closing quote
       and true is returned. If there is no opening quote at the current position or the
       closing quote is missing, nothing is changed and false is returned. */

    // Determine the position of the closing quote
    if (pCurChar[0] != '"')
        return false;
    size_t i = 1;
    while ((pCurChar[i] != 0) && (pCurChar[i] != '"'))
        i++;
    if (pCurChar[i] != '"')
        return false;

    // Create a duplicate of the string between the quotes. The pointer/length form of
    // assign() is used on purpose: passing (pCurChar, 1, i-1) would select the overload
    // taking a string and first copy the entire remaining input into a temporary.
    string strTmp;
    strTmp.assign(pCurChar + 1, i - 1);

    // Create a new token and add to the token list
    CToken TmpToken;
    TmpToken.StoreString(strTmp.c_str(), pCurChar - pInputLine);
    vTokens.push_back(TmpToken);

    pCurChar += i + 1;

    return true;
}

bool CTokenizer::ReadSymbol()
{
    /* This function tries to read in a symbol character. If a '-' character is found it tries
       to read in a subsequent integer and applies the sign. Otherwise the character is simply
       stored as a symbol. Note that this function does NOT check if the character is of
       alphanumeric type, which may allow for a different interpretation. Therefore, this
       function should be the fallback option when determining the token type.
       The function consumes the current character (plus the digits of a negative integer)
       and always returns true. */

    const char c = pCurChar[0];

    // Remember where the token starts before moving on. For a negative integer this is the
    // position of the minus sign; computing it after ReadInteger() has advanced the current
    // position would yield the offset of the last digit instead.
    const ptrdiff_t offset = pCurChar - pInputLine;
    pCurChar++;

    // If we have a minus sign, look for an integer
    if ((c == '-') && ReadInteger())
    {
        // There is indeed an integer, which is now stored at the last position of the
        // token list: flip its sign and let the token start at the minus sign
        int64_t num = 0;
        vTokens.back().GetInteger(num);
        vTokens.back().StoreInteger(-num, offset);
        return true;
    }

    // If we have no minus sign or cannot find an integer, simply store the character
    // as a SYMBOL token
    CToken TmpToken;
    TmpToken.StoreSymbol(c, offset);
    vTokens.push_back(TmpToken);

    return true;
}

bool CTokenizer::ReadNextToken()
{
    /* This function advances the current character position to the next value which is not
       considered to be a whitespace and then calls the individual Read*** functions in order
       to properly recognize the token. It returns true if the end of the input was reached
       or a token was read, and false if the selected Read*** function failed. */

    // First skip all whitespace
    SkipWhitespaces();

    // End of string?
    if (pCurChar[0] == 0)
        return true;

    // Then we read the next token. The character is handed to the <cctype> functions as
    // unsigned char, because their behaviour is undefined for negative values (i.e. for
    // non-ASCII bytes where char is signed).
    const unsigned char ucCur = static_cast<unsigned char>(pCurChar[0]);
    if (pCurChar[0] == '"')
        return ReadString();
    if (isdigit(ucCur))
        return ReadInteger();
    if (isalpha(ucCur))
        return ReadWord();

    // Note that the ReadSymbol must come last, because everything can be interpreted as an
    // ordinary SYMBOL, which is simply a character.
    return ReadSymbol();
}

void CTokenizer::SkipWhitespaces()
{
    /* Moves the current character position past any run of whitespace characters (as
       defined by the IS_WHITESPACE macro at the top of the file). The position stops at
       the first non-whitespace character or at the terminating zero of the input. */

    for (;;)
    {
        const char cCur = *pCurChar;
        if ((cCur == 0) || !IS_WHITESPACE(cCur))
            break;
        ++pCurChar;
    }
}

bool CTokenizer::TokenizeInputString(const string &input)
{
    /* This function clears the tokenizer of all prior data and starts the process of
       breaking up the input string into individual tokens. Note that the input string
       is not changed, nor is copy of the original string kept. */

    // Note that the usage of the STL class string ensures that we truly have terminating
    // zero character, i.e. this class is reasonably safe
    pInputLine = input.c_str();
    pCurChar = pInputLine;

    // Read tokens until we hit the end of the string
    while (pCurChar[0] != 0)
    {
        if (!ReadNextToken())
        return false;
    }

    // Add and END token
    CToken EndToken;
    EndToken.SetEndToken(pCurChar - pInputLine);
    vTokens.push_back(EndToken);

    // Set the current token to start
    iCurToken = 0;

    // Clear the "critical" variables for safety
    pInputLine = NULL;
    pCurChar = NULL;

    // OutputTokenList();  // Just for debugging

    return true;
}

void CTokenizer::OutputTokenList() const  // Just for debugging
{
    /* This output function prints a full list of all tokens, which might be useful for
       debugging or writing parsing functions. */

    const size_t numTokens = GetNumberOfTokens();
    MSG_OUT("There are " << numTokens << " tokens in the input string:");
    for (size_t i=0; i<numTokens; i++)
        MSG_OUT("Token " << i << ", Offset " << vTokens[i].GetInputOffset() << ", Type " << vTokens[i].GetTokenString());
}

bool CTokenizer::GetIntegerList(vector<int64_t> &out_list, char cBeginDelim, char cSeperator, char cEndDelim)
{
    /* This function is a semi-parser function, which simplifies the recurring task of reading
       in symbol-separated integer lists like e.g. comma-separated bracket-delimited vectors (2,3,1).
       The integers are appended to out_list in the order they are read; out_list is not
       cleared beforehand and keeps the values read so far if the function fails.
       The return value is true only if the list is well-formed, i.e. the begin delimiter is
       followed by at least one integer, the integers are separated by cSeperator and the
       last integer is followed by cEndDelim. */

    char c = 0;
    int64_t integer = 0;

    // First we expect a SYMBOL containing the cBeginDelim character
    if (!GetNextToken().GetSymbol(c)) return false;
    if (c != cBeginDelim) return false;

    // Then we loop as long as we find an INTEGER followed by the cSeperator character SYMBOL
    while (GetNextToken().GetInteger(integer))
    {
        out_list.push_back(integer);
        if (!GetNextToken().GetSymbol(c)) return false;

        // The list is complete once the cEndDelim character SYMBOL follows an integer
        if (c == cEndDelim) return true;
        if (c != cSeperator) return false;
    }

    // An INTEGER was expected but not found, so the end delimiter was never read. Returning
    // false here (instead of inspecting the last symbol again) also rejects this case when
    // cBeginDelim and cEndDelim are the same character.
    return false;
}