1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433
|
/*
------------------------------------------------------------------------------
A license is hereby granted to reproduce this software source code and
to create executable versions from this source code for personal,
non-commercial use. The copyright notice included with the software
must be maintained in all copies produced.
THIS PROGRAM IS PROVIDED "AS IS". THE AUTHOR PROVIDES NO WARRANTIES
WHATSOEVER, EXPRESSED OR IMPLIED, INCLUDING WARRANTIES OF
MERCHANTABILITY, TITLE, OR FITNESS FOR ANY PARTICULAR PURPOSE. THE
AUTHOR DOES NOT WARRANT THAT USE OF THIS PROGRAM DOES NOT INFRINGE THE
INTELLECTUAL PROPERTY RIGHTS OF ANY THIRD PARTY IN ANY COUNTRY.
Copyright (c) 1995, John Conover, All Rights Reserved.
Comments and/or bug reports should be addressed to:
john@johncon.com (John Conover)
------------------------------------------------------------------------------
lexicon.c, lexical analysis for infix/postfix notation
enum token_type lexicon (char *buffer, char *token);
get the next token from buffer, placing it in token and return the
lexicon value of the token-the lexicon value is one of enum
token_type; token must be adequately large to hold the parsed
token (eg., it should be at least twice as large as buffer)
The algorithm is as follows:
while the lexical state of the token is not finished
get a character from buffer
if the character is a grouping operator, or operator, and this
is the first character of the token, save the character in
token, and set the lexical state of the token to finished-else
backup over the character in buffer since the current token is
finished, and this is the beginning of the next token
if the character is whitespace, and this is the beginning of a
token, ignore it, else set the lexical state of the token to
finished
if the character is a part of an identifier, if this is the
first character of the token, set the lexical state of the
token to started, but not finished
if the character is the "pseudo-operator," the '\\', which is
used to "escape" operator symbols when they appear in an
identifier, skip the character, and assume that the next
character is a character of an identifier, set the lexical
state of the token to started, but not finished
Note: this algorithm is essentially a very simple finite state machine
with the following transition table:
state | white '(' ')' '&' '|' '!' '\0' '\\' letter
------+------------------------------------------------------------
0 | 0 2 2 2 2 2 2 1 1
1 | 2 2 2 2 2 2 2 1 1
where state 0 is the initial state, state 1 is a "token in process,"
(eg., an identifier,) and state 2 is the final state
The transition diagram looks like:
+--------+
|+------+|
+--------------------------------->||letter||-+
| |+------+| |
| +--------+ |
| ^ |
| | |
| +-------+
| ^
| |
| +------+
| |+----+|
+--------------------------------->||'\\'||
| |+----+|
| +------+
|
| +-----+
| |+---+|
+--------------------------------->||'('||
| |+---+|
| +-----+
|
| +-----+
| |+---+|
+--------------------------------->||')'||
| |+---+|
| +-----+
|
| +-----+
+-----+ | |+---+|
|white|-+--------------------------------->||'&'||
+-----+ | |+---+|
^ | +-----+
| |
+----+ +-----+
| |+---+|
+--------------------------------->||'|'||
| |+---+|
| +-----+
|
| +-----+
| |+---+|
+--------------------------------->||'!'||
| |+---+|
| +-----+
|
| +------+
| |+----+|
+--------------------------------->||'\0'||
|+----+|
+------+
Usage is a call with null arguments to reset the internal static
buffer reference, and then repeated calls with the same buffer until
the buffer's tokens are exhausted; this is signified by a return of
enum token_type NONE, for example, in a construct:
char buffer[SIZE],
token[2 * SIZE];
enum token_type lex_val;
load_buffer (buffer);
(void) lexicon ((char *) 0, (char *) 0);
while ((lex_val = lexicon (buffer, token)) != NONE)
{
process_token (token,lex_val);
}
When finished, buffer is not disturbed, and token contains the
contents of buffer, with the tokens separated by exactly one '\0'
character, and no whitespace, ie., if the contents of buffer were:
+------------------
|sym1 sym2 sym3 ...
+------------------
then the contents of token would be:
+----------------------
|sym1\0sym2\0sym3\0 ...
+----------------------
For a detailed description of lexical analysis using state machines,
see "Information Retrieval: Data Structures & Algorithms," William
B. Frakes, Ricardo Baeza-Yates, Editors, Prentice Hall, Englewood
Cliffs, New Jersey, 1992, ISBN 0-13-463837-9, pp 102.
The argument, buffer, references the tokens to be parsed, and the
argument token, references space where the tokens will be parsed, and
must be at least twice as large as buffer
There are no errors, and the lexicon type of the token is returned-a
lexicon type of NONE signifies that there are no more tokens
enum token_type is defined in lexicon.h
To test this module, compile the module source with -DTEST_LEXICON
$Revision: 1.0 $
$Date: 1995/04/22 05:13:18 $
$Id: lexicon.c,v 1.0 1995/04/22 05:13:18 john Exp $
$Log: lexicon.c,v $
* Revision 1.0 1995/04/22 05:13:18 john
* Initial revision
*
*/
#include "rel.h"
#ifndef LINT /* include rcsid only if not running lint */
static char rcsid[] = "$Id: lexicon.c,v 1.0 1995/04/22 05:13:18 john Exp $"; /* module version */
static char rcsid_h[] = LEXICON_H_ID; /* module include version */
#endif
#ifdef __STDC__
enum token_type lexicon (char *buffer, unsigned char *token)
#else
enum token_type lexicon (buffer, token)
char *buffer;
unsigned char *token;
#endif
{
static char *buffer_ref = (char *) 0; /* references to current character in buffer, null = no buffer referenced */
unsigned char *token_ref; /* reference to current character in token buffer */
int lexical_state = 0; /* lexical state, 0 = token not started, 1 = token started but not finished, 2 = token finished */
enum token_type retval = NONE; /* return value, assume no token */
token_ref = token; /* reference first charater in token buffer */
if (buffer != (char *) 0) /* null buffer is signal to reset for new buffer */
{
if (buffer_ref == (char *) 0) /* first token for this buffer? */
{
buffer_ref = buffer; /* yes, reference the first character in the buffer */
}
while (lexical_state != 2) /* while the token is not in final state */
{
switch (*buffer_ref) /* translate the buffer character */
{
case '(': /* '(' grouping operator */
case ')': /* ')' grouping operator */
case '&': /* "and" operator */
case '|': /* "or" operator */
case '!': /* "not" operator */
case '\0': /* end of buffer "operator" */
if (lexical_state == 0) /* initial state? */
{
*token_ref = (unsigned char) *buffer_ref; /* yes, save the character in the token buffer */
token_ref++; /* reference next character in token buffer */
switch (*buffer_ref)
{
case '(': /* '(' grouping operator */
retval = LEFT; /* '(' grouping operator */
break;
case ')': /* ')' grouping operator */
retval = RIGHT; /* ')' grouping operator */
break;
case '&': /* "and" operator */
retval = AND; /* "and" operator */
break;
case '|': /* "or" operator */
retval = OR; /* "or" operator */
break;
case '!': /* "not" operator */
retval = NOT; /* "not" operator */
break;
default:
retval = NONE; /* end of buffer "operator" */
break;
}
}
else
{
buffer_ref--; /* yes, "push" the character back */
}
lexical_state = 2; /* change lexical state to token finished */
break;
case ' ': /* white space */
case '\f':
case '\r':
case '\v':
case '\t':
case '\n':
if (lexical_state != 0) /* initial state? */
{
lexical_state = 2; /* no, first character of a token's trailing white space, lexical state = final */
}
break;
default:
if (*buffer_ref == '\\') /* character an escape character? */
{
buffer_ref++; /* yes, next character */
}
*token_ref = (unsigned char) *buffer_ref; /* identifier, save the character in the token buffer */
token_ref++; /* reference next character in token buffer */
retval = IDENTIFIER; /* a word */
lexical_state = 1; /* change lexical state to token started but not finished */
break;
}
buffer_ref++; /* reference next character in buffer */
}
*token_ref = '\0'; /* token finished, add EOS character to token buffer */
}
else
{
buffer_ref = (char *) 0; /* signal to reset buffer_ref, null the buffer reference */
}
return (retval); /* return the lexicon type */
}
#ifdef TEST_LEXICON
/*
simple exerciser for testing lexicon (); get a string from stdin,
parse it, and print it to stdout; ignore the:
declared global, could be static
lexicon lexicon.c(xxx)
from lint
as a simple test, redirecting the following example into stdin:
THESE | THOSE
THESE & THOSE | THEM | THAT
THESE ! THOSE ! THAT
(THESE & THOSE & THEM ) | (THAT | OTHERS & THINGS)
should produce:
"THESE" (5), "|" (1), "THOSE" (5)
"THESE" (5), "&" (2), "THOSE" (5), "|" (1), "THEM" (5), "|" (1), "THAT" (5)
"THESE" (5), "!" (3), "THOSE" (5), "!" (3), "THAT" (5)
"(" (0), "THESE" (5), "&" (2), "THOSE" (5), "&" (2), "THEM" (5), ")" (4), "|" (1), "(" (0), "THAT" (5), "|" (1), "OTHERS" (5), "&" (2), "THINGS" (5), ")" (4)
on stdout
*/
#ifdef __STDC__
int main (void)
#else
int main ()
#endif
{
char buffer[BUFSIZ]; /* buffer to be parsed */
unsigned char tokens[BUFSIZ * 2], /* buffers containing tokens */
*token; /* reference to current token in tokens */
int token_count, /* count of tokens */
retval = URMEM_ERR; /* return value, assume error allocating memory */
enum token_type lex_val; /* return value from lexicon () */
token = tokens; /* reference to the beginning of a token in tokens */
if (make_uppercase () != (unsigned char *) 0) /* setup the uppercase array */
{
retval = NO_ERROR; /* assume no error */
while (gets (buffer) != 0) /* input the string to be parsed */
{
(void) lexicon ((char *) 0, (unsigned char *) 0); /* reset the lexical analyzer */
token_count = 0; /* no tokens */
while ((lex_val = lexicon (buffer, token)) != NONE) /* while more tokens in the buffer, get the next token */
{
if (token_count == 0) /* first token? */
{
(void) printf ("\"%s\" (%d)", token, (int) lex_val); /* yes, print the token and value, with no comma */
}
else
{
(void) printf (", \"%s\" (%d)", token, (int) lex_val); /* no, print the token and value, with leading comma */
}
token_count++; /* one more token */
}
(void) printf ("\n"); /* print EOL */
}
}
message (retval, (char *) 0); /* print any errors */
exit (retval); /* return any errors */
#ifdef LINT /* include only if running lint */
return (0); /* for LINT formality */
#endif
}
#endif
|