File: lexicon.c

package info (click to toggle)
rel 1.3-3
links: PTS
area: non-free
in suites: hamm, potato, slink
size: 496 kB
ctags: 216
sloc: ansic: 1,868; sh: 254; makefile: 142
file content (433 lines) | stat: -rw-r--r-- 14,595 bytes
/*

------------------------------------------------------------------------------

A license is hereby granted to reproduce this software source code and
to create executable versions from this source code for personal,
non-commercial use.  The copyright notice included with the software
must be maintained in all copies produced.

THIS PROGRAM IS PROVIDED "AS IS". THE AUTHOR PROVIDES NO WARRANTIES
WHATSOEVER, EXPRESSED OR IMPLIED, INCLUDING WARRANTIES OF
MERCHANTABILITY, TITLE, OR FITNESS FOR ANY PARTICULAR PURPOSE.  THE
AUTHOR DOES NOT WARRANT THAT USE OF THIS PROGRAM DOES NOT INFRINGE THE
INTELLECTUAL PROPERTY RIGHTS OF ANY THIRD PARTY IN ANY COUNTRY.

Copyright (c) 1995, John Conover, All Rights Reserved.

Comments and/or bug reports should be addressed to:

    john@johncon.com (John Conover)

------------------------------------------------------------------------------

lexicon.c, lexical analysis for infix/postfix notation

enum token_type lexicon (char *buffer, char *token);

    get the next token from buffer, placing it in token and return the
    lexicon value of the token-the lexicon value is one of enum
    token_type; token must be adequately large to hold the parsed
    token (eg., it should be at least twice as large as buffer)

The algorithm is as follows:

    while the lexical state of the token is not finished

        get a character from buffer

        if the character is a grouping operator, or operator, and this
        is the first character of the token, save the character in
        token, and set the lexical state of the token to finished-else
        backup over the character in buffer since the current token is
        finished, and this is the beginning of the next token

        if the character is whitespace, and this is the beginning of a
        token, ignore it, else set the lexical state of the token to
        finished

        if the character is a part of an identifier, if this is the
        first character of the token, set the lexical state of the
        token to started, but not finished

        if the character is the "pseudo-operator," the '\\', which is
        used to "escape" operator symbols when they appear in an
        identifier, skip the character, and assume that the next
        character is a character of an identifier, set the lexical
        state of the token to started, but not finished

Note: this algorithm is essentially a very simple finite state machine
with the following transition table:

    state | white   '('    ')'    '&'    '|'   '!'   '\0'  '\\'  letter
    ------+------------------------------------------------------------
      0   |   0      2      2      2      2     2     2      1     1
      1   |   2      2      2      2      2     2     2      1     1

where state 0 is the initial state, state 1 is a "token in process,"
(eg., an identifier,) and state 2 is the final state

The transition diagram looks like:

                                               +--------+
                                               |+------+|
            +--------------------------------->||letter||-+
            |                                  |+------+| |
            |                                  +--------+ |
            |                                     ^       |
            |                                     |       |
            |                                     +-------+
            |                                     ^
            |                                     |
            |                                  +------+
            |                                  |+----+|
            +--------------------------------->||'\\'||
            |                                  |+----+|
            |                                  +------+
            |
            |                                  +-----+
            |                                  |+---+|
            +--------------------------------->||'('||
            |                                  |+---+|
            |                                  +-----+
            |
            |                                  +-----+
            |                                  |+---+|
            +--------------------------------->||')'||
            |                                  |+---+|
            |                                  +-----+
            |
            |                                  +-----+
    +-----+ |                                  |+---+|
    |white|-+--------------------------------->||'&'||
    +-----+ |                                  |+---+|
       ^    |                                  +-----+
       |    |
       +----+                                  +-----+
            |                                  |+---+|
            +--------------------------------->||'|'||
            |                                  |+---+|
            |                                  +-----+
            |
            |                                  +-----+
            |                                  |+---+|
            +--------------------------------->||'!'||
            |                                  |+---+|
            |                                  +-----+
            |
            |                                  +------+
            |                                  |+----+|
            +--------------------------------->||'\0'||
                                               |+----+|
                                               +------+

Usage is a call with null arguments to reset the internal static
buffer reference, and then repeated calls with the same buffer until
the buffer's tokens are exhausted; this is signified by a return of
enum token_type NONE, for example, in a construct:

    char buffer[SIZE],
         token[2 * SIZE];

    enum token_type lex_val;

    load_buffer (buffer);

    (void) lexicon ((char *) 0, (char *) 0);

    while ((lex_val = lexicon (buffer, token)) != NONE)
    {
        process_token (token,lex_val);
    }

When finished, buffer is not disturbed, and token contains the
contents of buffer, with the tokens separated by exactly one '\0'
character, and no whitespace, ie., if the contents of buffer were:

    +------------------
    |sym1 sym2 sym3 ...
    +------------------

then the contents of token would be:

    +----------------------
    |sym1\0sym2\0sym3\0 ...
    +----------------------

For a detailed description of lexical analysis using state machines,
see "Information Retrieval: Data Structures & Algorithms," William
B. Frakes, Ricardo Baeza-Yates, Editors, Prentice Hall, Englewood
Cliffs, New Jersey, 1992, ISBN 0-13-463837-9, pp 102.

The argument, buffer, references the tokens to be parsed, and the
argument token, references space where the tokens will be parsed, and
must be at least twice as large as buffer

There are no errors, and the lexicon type of the token is returned-a
lexicon type of NONE signifies that there are no more tokens

enum token_type is defined in lexicon.h

To test this module, compile the module source with -DTEST_LEXICON

$Revision: 1.0 $
$Date: 1995/04/22 05:13:18 $
$Id: lexicon.c,v 1.0 1995/04/22 05:13:18 john Exp $
$Log: lexicon.c,v $
 * Revision 1.0  1995/04/22  05:13:18  john
 * Initial revision
 *

*/

#include "rel.h"

#ifndef LINT /* include rcsid only if not running lint */

static char rcsid[] = "$Id: lexicon.c,v 1.0 1995/04/22 05:13:18 john Exp $"; /* module version */
static char rcsid_h[] = LEXICON_H_ID; /* module include version */

#endif

#ifdef __STDC__

enum token_type lexicon (char *buffer, unsigned char *token)

#else

enum token_type lexicon (buffer, token)
    char *buffer;
    unsigned char *token;

#endif

{
    static char *buffer_ref = (char *) 0; /* references to current character in buffer, null = no buffer referenced */

    unsigned char *token_ref; /* reference to current character in token buffer */

    int lexical_state = 0; /* lexical state, 0 = token not started, 1 = token started but not finished, 2 = token finished */

    enum token_type retval = NONE; /* return value, assume no token */

    token_ref = token; /* reference first charater in token buffer */

    if (buffer != (char *) 0) /* null buffer is signal to reset for new buffer */
    {

        if (buffer_ref == (char *) 0) /* first token for this buffer? */
        {
            buffer_ref = buffer; /* yes, reference the first character in the buffer */
        }

        while (lexical_state != 2) /* while the token is not in final state */
        {

            switch (*buffer_ref) /* translate the buffer character */
            {

                case '(': /* '(' grouping operator */

                case ')': /* ')' grouping operator */

                case '&': /* "and" operator */

                case '|': /* "or" operator */

                case '!': /* "not" operator */

                case '\0': /* end of buffer "operator" */

                    if (lexical_state == 0) /* initial state? */
                    {
                        *token_ref = (unsigned char) *buffer_ref; /* yes, save the character in the token buffer */
                        token_ref++; /* reference next character in token buffer */

                        switch (*buffer_ref)
                        {

                            case '(': /* '(' grouping operator */

                                retval = LEFT; /* '(' grouping operator */
                                break;

                            case ')': /* ')' grouping operator */

                                retval = RIGHT; /* ')' grouping operator */
                                break;

                            case '&': /* "and" operator */

                                retval = AND; /* "and" operator */
                                break;

                            case '|': /* "or" operator */

                                retval = OR; /* "or" operator */
                                break;

                            case '!': /* "not" operator */

                                retval = NOT; /* "not" operator */
                                break;

                            default:

                                retval = NONE; /* end of buffer "operator" */
                                break;
                        }

                    }

                    else
                    {
                        buffer_ref--; /* yes, "push" the character back */
                    }

                    lexical_state = 2; /* change lexical state to token finished */
                    break;

                case ' ': /* white space */

                case '\f':

                case '\r':

                case '\v':

                case '\t':

                case '\n':

                    if (lexical_state != 0) /* initial state? */
                    {
                        lexical_state = 2; /* no, first character of a token's trailing white space, lexical state = final */
                    }

                    break;

                default:

                    if (*buffer_ref == '\\') /* character an escape character? */
                    {
                        buffer_ref++; /* yes, next character */
                    }

                    *token_ref = (unsigned char) *buffer_ref; /* identifier, save the character in the token buffer */
                    token_ref++; /* reference next character in token buffer */
                    retval = IDENTIFIER; /* a word */
                    lexical_state = 1; /* change lexical state to token started but not finished */
                    break;


            }

            buffer_ref++; /* reference next character in buffer */
        }

        *token_ref = '\0'; /* token finished, add EOS character to token buffer */
    }

    else
    {
        buffer_ref = (char *) 0; /* signal to reset buffer_ref, null the buffer reference */
    }

    return (retval); /* return the lexicon type */
}

#ifdef TEST_LEXICON

/*

simple exerciser for testing lexicon (); get a string from stdin,
parse it, and print it to stdout; ignore the:

declared global, could be static
    lexicon             lexicon.c(xxx)

from lint

as a simple test, redirecting the following example into stdin:

THESE | THOSE
THESE & THOSE | THEM | THAT
THESE ! THOSE ! THAT
(THESE & THOSE & THEM ) | (THAT | OTHERS & THINGS)

should produce:

"THESE" (5), "|" (1), "THOSE" (5)
"THESE" (5), "&" (2), "THOSE" (5), "|" (1), "THEM" (5), "|" (1), "THAT" (5)
"THESE" (5), "!" (3), "THOSE" (5), "!" (3), "THAT" (5)
"(" (0), "THESE" (5), "&" (2), "THOSE" (5), "&" (2), "THEM" (5), ")" (4), "|" (1), "(" (0), "THAT" (5), "|" (1), "OTHERS" (5), "&" (2), "THINGS" (5), ")" (4)

on stdout

*/

#ifdef __STDC__

int main (void)

#else

int main ()

#endif

{
    char buffer[BUFSIZ]; /* buffer to be parsed */

    unsigned char tokens[BUFSIZ * 2], /* buffers containing tokens */
                  *token; /* reference to current token in tokens */

    int token_count, /* count of tokens */
        retval = URMEM_ERR; /* return value, assume error allocating memory */

    enum token_type lex_val; /* return value from lexicon () */

    token = tokens; /* reference to the beginning of a token in tokens */

    if (make_uppercase () != (unsigned char *) 0) /* setup the uppercase array */
    {
        retval = NO_ERROR; /* assume no error */

        while (gets (buffer) != 0) /* input the string to be parsed */
        {
            (void) lexicon ((char *) 0, (unsigned char *) 0); /* reset the lexical analyzer */
            token_count = 0; /* no tokens */

            while ((lex_val = lexicon (buffer, token)) != NONE) /* while more tokens in the buffer, get the next token */
            {

                if (token_count == 0) /* first token? */
                {
                    (void) printf ("\"%s\" (%d)", token, (int) lex_val); /* yes, print the token and value, with no comma */
                }

                else
                {
                    (void) printf (", \"%s\" (%d)", token, (int) lex_val); /* no, print the token and value, with leading comma */
                }

                token_count++; /* one more token */
            }

            (void) printf ("\n"); /* print EOL */
        }

    }

    message (retval, (char *) 0); /* print any errors */
    exit (retval); /* return any errors */

#ifdef LINT /* include only if running lint */

    return (0); /* for LINT formality */

#endif

}

#endif