File: lexicon.c

package info (click to toggle)
rel 1.3-3
  • links: PTS
  • area: non-free
  • in suites: hamm, potato, slink
  • size: 496 kB
  • ctags: 216
  • sloc: ansic: 1,868; sh: 254; makefile: 142
file content (433 lines) | stat: -rw-r--r-- 14,595 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
/*

------------------------------------------------------------------------------

A license is hereby granted to reproduce this software source code and
to create executable versions from this source code for personal,
non-commercial use.  The copyright notice included with the software
must be maintained in all copies produced.

THIS PROGRAM IS PROVIDED "AS IS". THE AUTHOR PROVIDES NO WARRANTIES
WHATSOEVER, EXPRESSED OR IMPLIED, INCLUDING WARRANTIES OF
MERCHANTABILITY, TITLE, OR FITNESS FOR ANY PARTICULAR PURPOSE.  THE
AUTHOR DOES NOT WARRANT THAT USE OF THIS PROGRAM DOES NOT INFRINGE THE
INTELLECTUAL PROPERTY RIGHTS OF ANY THIRD PARTY IN ANY COUNTRY.

Copyright (c) 1995, John Conover, All Rights Reserved.

Comments and/or bug reports should be addressed to:

    john@johncon.com (John Conover)

------------------------------------------------------------------------------

lexicon.c, lexical analysis for infix/postfix notation

enum token_type lexicon (char *buffer, char *token);

    get the next token from buffer, placing it in token and return the
    lexicon value of the token-the lexicon value is one of enum
    token_type; token must be adequately large to hold the parsed
    token (eg., it should be at least twice as large as buffer)

The algorithm is as follows:

    while the lexical state of the token is not finished

        get a character from buffer

        if the character is a grouping operator, or operator, and this
        is the first character of the token, save the character in
        token, and set the lexical state of the token to finished-else
        backup over the character in buffer since the current token is
        finished, and this is the beginning of the next token

        if the character is whitespace, and this is the beginning of a
        token, ignore it, else set the lexical state of the token to
        finished

        if the character is a part of an identifier, if this is the
        first character of the token, set the lexical state of the
        token to started, but not finished

        if the character is the "pseudo-operator," the '\\', which is
        used to "escape" operator symbols when they appear in an
        identifier, skip the character, and assume that the next
        character is a character of an identifier, set the lexical
        state of the token to started, but not finished

Note: this algorithm is essentially a very simple finite state machine
with the following transition table:

    state | white   '('    ')'    '&'    '|'   '!'   '\0'  '\\'  letter
    ------+------------------------------------------------------------
      0   |   0      2      2      2      2     2     2      1     1
      1   |   2      2      2      2      2     2     2      1     1

where state 0 is the initial state, state 1 is a "token in process,"
(eg., an identifier,) and state 2 is the final state

The transition diagram looks like:

                                               +--------+
                                               |+------+|
            +--------------------------------->||letter||-+
            |                                  |+------+| |
            |                                  +--------+ |
            |                                     ^       |
            |                                     |       |
            |                                     +-------+
            |                                     ^
            |                                     |
            |                                  +------+
            |                                  |+----+|
            +--------------------------------->||'\\'||
            |                                  |+----+|
            |                                  +------+
            |
            |                                  +-----+
            |                                  |+---+|
            +--------------------------------->||'('||
            |                                  |+---+|
            |                                  +-----+
            |
            |                                  +-----+
            |                                  |+---+|
            +--------------------------------->||')'||
            |                                  |+---+|
            |                                  +-----+
            |
            |                                  +-----+
    +-----+ |                                  |+---+|
    |white|-+--------------------------------->||'&'||
    +-----+ |                                  |+---+|
       ^    |                                  +-----+
       |    |
       +----+                                  +-----+
            |                                  |+---+|
            +--------------------------------->||'|'||
            |                                  |+---+|
            |                                  +-----+
            |
            |                                  +-----+
            |                                  |+---+|
            +--------------------------------->||'!'||
            |                                  |+---+|
            |                                  +-----+
            |
            |                                  +------+
            |                                  |+----+|
            +--------------------------------->||'\0'||
                                               |+----+|
                                               +------+

Usage is a call with null arguments to reset the internal static
buffer reference, and then repeated calls with the same buffer until
the buffer's tokens are exhausted; this is signified by a return of
enum token_type NONE, for example, in a construct:

    char buffer[SIZE],
         token[2 * SIZE];

    enum token_type lex_val;

    load_buffer (buffer);

    (void) lexicon ((char *) 0, (char *) 0);

    while ((lex_val = lexicon (buffer, token)) != NONE)
    {
        process_token (token,lex_val);
    }

When finished, buffer is not disturbed, and token contains the
contents of buffer, with the tokens separated by exactly one '\0'
character, and no whitespace, ie., if the contents of buffer were:

    +------------------
    |sym1 sym2 sym3 ...
    +------------------

then the contents of token would be:

    +----------------------
    |sym1\0sym2\0sym3\0 ...
    +----------------------

For a detailed description of lexical analysis using state machines,
see "Information Retrieval: Data Structures & Algorithms," William
B. Frakes, Ricardo Baeza-Yates, Editors, Prentice Hall, Englewood
Cliffs, New Jersey, 1992, ISBN 0-13-463837-9, pp 102.

The argument, buffer, references the tokens to be parsed, and the
argument token, references space where the tokens will be parsed, and
must be at least twice as large as buffer

There are no errors, and the lexicon type of the token is returned-a
lexicon type of NONE signifies that there are no more tokens

enum token_type is defined in lexicon.h

To test this module, compile the module source with -DTEST_LEXICON

$Revision: 1.0 $
$Date: 1995/04/22 05:13:18 $
$Id: lexicon.c,v 1.0 1995/04/22 05:13:18 john Exp $
$Log: lexicon.c,v $
 * Revision 1.0  1995/04/22  05:13:18  john
 * Initial revision
 *

*/

#include "rel.h"

#ifndef LINT /* include rcsid only if not running lint */

static char rcsid[] = "$Id: lexicon.c,v 1.0 1995/04/22 05:13:18 john Exp $"; /* module version */
static char rcsid_h[] = LEXICON_H_ID; /* module include version */

#endif

#ifdef __STDC__

enum token_type lexicon (char *buffer, unsigned char *token)

#else

enum token_type lexicon (buffer, token)
    char *buffer;
    unsigned char *token;

#endif

{
    static char *buffer_ref = (char *) 0; /* references to current character in buffer, null = no buffer referenced */

    unsigned char *token_ref; /* reference to current character in token buffer */

    int lexical_state = 0; /* lexical state, 0 = token not started, 1 = token started but not finished, 2 = token finished */

    enum token_type retval = NONE; /* return value, assume no token */

    token_ref = token; /* reference first charater in token buffer */

    if (buffer != (char *) 0) /* null buffer is signal to reset for new buffer */
    {

        if (buffer_ref == (char *) 0) /* first token for this buffer? */
        {
            buffer_ref = buffer; /* yes, reference the first character in the buffer */
        }

        while (lexical_state != 2) /* while the token is not in final state */
        {

            switch (*buffer_ref) /* translate the buffer character */
            {

                case '(': /* '(' grouping operator */

                case ')': /* ')' grouping operator */

                case '&': /* "and" operator */

                case '|': /* "or" operator */

                case '!': /* "not" operator */

                case '\0': /* end of buffer "operator" */

                    if (lexical_state == 0) /* initial state? */
                    {
                        *token_ref = (unsigned char) *buffer_ref; /* yes, save the character in the token buffer */
                        token_ref++; /* reference next character in token buffer */

                        switch (*buffer_ref)
                        {

                            case '(': /* '(' grouping operator */

                                retval = LEFT; /* '(' grouping operator */
                                break;

                            case ')': /* ')' grouping operator */

                                retval = RIGHT; /* ')' grouping operator */
                                break;

                            case '&': /* "and" operator */

                                retval = AND; /* "and" operator */
                                break;

                            case '|': /* "or" operator */

                                retval = OR; /* "or" operator */
                                break;

                            case '!': /* "not" operator */

                                retval = NOT; /* "not" operator */
                                break;

                            default:

                                retval = NONE; /* end of buffer "operator" */
                                break;
                        }

                    }

                    else
                    {
                        buffer_ref--; /* yes, "push" the character back */
                    }

                    lexical_state = 2; /* change lexical state to token finished */
                    break;

                case ' ': /* white space */

                case '\f':

                case '\r':

                case '\v':

                case '\t':

                case '\n':

                    if (lexical_state != 0) /* initial state? */
                    {
                        lexical_state = 2; /* no, first character of a token's trailing white space, lexical state = final */
                    }

                    break;

                default:

                    if (*buffer_ref == '\\') /* character an escape character? */
                    {
                        buffer_ref++; /* yes, next character */
                    }

                    *token_ref = (unsigned char) *buffer_ref; /* identifier, save the character in the token buffer */
                    token_ref++; /* reference next character in token buffer */
                    retval = IDENTIFIER; /* a word */
                    lexical_state = 1; /* change lexical state to token started but not finished */
                    break;


            }

            buffer_ref++; /* reference next character in buffer */
        }

        *token_ref = '\0'; /* token finished, add EOS character to token buffer */
    }

    else
    {
        buffer_ref = (char *) 0; /* signal to reset buffer_ref, null the buffer reference */
    }

    return (retval); /* return the lexicon type */
}

#ifdef TEST_LEXICON

/*

simple exerciser for testing lexicon (); get a string from stdin,
parse it, and print it to stdout; ignore the:

declared global, could be static
    lexicon             lexicon.c(xxx)

from lint

as a simple test, redirecting the following example into stdin:

THESE | THOSE
THESE & THOSE | THEM | THAT
THESE ! THOSE ! THAT
(THESE & THOSE & THEM ) | (THAT | OTHERS & THINGS)

should produce:

"THESE" (5), "|" (1), "THOSE" (5)
"THESE" (5), "&" (2), "THOSE" (5), "|" (1), "THEM" (5), "|" (1), "THAT" (5)
"THESE" (5), "!" (3), "THOSE" (5), "!" (3), "THAT" (5)
"(" (0), "THESE" (5), "&" (2), "THOSE" (5), "&" (2), "THEM" (5), ")" (4), "|" (1), "(" (0), "THAT" (5), "|" (1), "OTHERS" (5), "&" (2), "THINGS" (5), ")" (4)

on stdout

*/

#ifdef __STDC__

int main (void)

#else

int main ()

#endif

{
    char buffer[BUFSIZ]; /* buffer to be parsed */

    unsigned char tokens[BUFSIZ * 2], /* buffers containing tokens */
                  *token; /* reference to current token in tokens */

    int token_count, /* count of tokens */
        retval = URMEM_ERR; /* return value, assume error allocating memory */

    enum token_type lex_val; /* return value from lexicon () */

    token = tokens; /* reference to the beginning of a token in tokens */

    if (make_uppercase () != (unsigned char *) 0) /* setup the uppercase array */
    {
        retval = NO_ERROR; /* assume no error */

        while (gets (buffer) != 0) /* input the string to be parsed */
        {
            (void) lexicon ((char *) 0, (unsigned char *) 0); /* reset the lexical analyzer */
            token_count = 0; /* no tokens */

            while ((lex_val = lexicon (buffer, token)) != NONE) /* while more tokens in the buffer, get the next token */
            {

                if (token_count == 0) /* first token? */
                {
                    (void) printf ("\"%s\" (%d)", token, (int) lex_val); /* yes, print the token and value, with no comma */
                }

                else
                {
                    (void) printf (", \"%s\" (%d)", token, (int) lex_val); /* no, print the token and value, with leading comma */
                }

                token_count++; /* one more token */
            }

            (void) printf ("\n"); /* print EOL */
        }

    }

    message (retval, (char *) 0); /* print any errors */
    exit (retval); /* return any errors */

#ifdef LINT /* include only if running lint */

    return (0); /* for LINT formality */

#endif

}

#endif