1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
|
/*****************************************************************************
* Written by Chris Dunlap <cdunlap@llnl.gov>.
* Copyright (C) 2007-2022 Lawrence Livermore National Security, LLC.
* Copyright (C) 2001-2007 The Regents of the University of California.
* UCRL-CODE-2002-009.
*
* This file is part of ConMan: The Console Manager.
* For details, see <https://dun.github.io/conman/>.
*
* ConMan is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation, either version 3 of the License, or (at your option)
* any later version.
*
* ConMan is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with ConMan. If not, see <http://www.gnu.org/licenses/>.
*****************************************************************************/
#ifndef _LEX_H
#define _LEX_H
/*****************************************************************************\
* Laws of the Lexer:
*----------------------------------------------------------------------------
* - Whitespace is ignored.
* - Comments are ignored (from the pound char to the newline).
* - Lines may be terminated by either carriage-returns (CR),
* linefeeds (LF), or carriage-return/linefeed (CR/LF) pairs.
* - A newline may be escaped by immediately preceding it with a backslash.
* - Integers may begin with either a plus or minus, and contain only digits.
* - Strings may be single-quoted or double-quoted.
* - Strings cannot contain CRs or LFs.
* - Unquoted strings are sequences of letters, digits, and underscores;
* they may not begin with a digit (just like a C identifier).
* - Tokens are unquoted case-insensitive strings.
\*****************************************************************************/
/***********\
** Notes **
\***********/
/* When a memory allocation request fails, the lexer returns out_of_memory().
* By default, this is a macro definition that returns NULL; this macro may
* be redefined to invoke another routine instead. Furthermore, if WITH_OOMF
* is defined, this macro will not be defined and the lexer will expect an
* external Out-Of-Memory Function to be defined.
*/
/***************\
** Constants **
\***************/
/*
 * Upper bound on the length of a string handled by the lexer.
 * NOTE(review): this header does not say whether the count includes the
 * terminating NUL -- confirm against the implementation before sizing
 * buffers with it.
 */
#define LEX_MAX_STR 1024 /* max length of lexer string */
/*
 * Token codes common to every lexer, independent of the caller's toks[].
 * Single-character tokens are reported as their ASCII code, so the named
 * tokens below start at 256, above any single-byte character value.
 */
enum common_tokens {
    /* lex error token */
    LEX_ERR = -1,
    /* end-of-file/buffer token */
    LEX_EOF = 0,
    /* end-of-line token */
    LEX_EOL = 256,
    /* integer token: ([+-]?[0-9]+) */
    LEX_INT = 257,
    /* string token */
    LEX_STR = 258,
    /* enum value at which the caller-supplied toks[] entries begin */
    LEX_TOK_OFFSET = 259
};
/****************\
** Data Types **
\****************/
typedef struct lexer_state *Lex;
/*
 * Lex opaque data type: a handle to a single lexer instance.
 * struct lexer_state is not defined in this header, so callers cannot
 * inspect or allocate it directly; obtain a handle from lex_create()
 * and release it with lex_destroy().
 */
/************\
** Macros **
\************/
#define LEX_TOK2STR(tokstrs,tok) ((tokstrs)[(tok) - LEX_TOK_OFFSET])
/*
 * Returns the string in the (tokstrs) array corresponding to the token
 * (tok), ie, the element at index (tok) - LEX_TOK_OFFSET, so that
 * (tokstrs)[0] corresponds to the token LEX_TOK_OFFSET.
 * No bounds-checking is performed: only use this when (tok) is known to
 * map to a valid string in (tokstrs).  When that is not known, use
 * lex_tok_to_str() instead, which returns NULL for an out-of-range token.
 */
/**********************\
** Lexing Functions **
\**********************/
Lex lex_create(void *buf, char *toks[]);
/*
 * Creates and returns a new lexer, or out_of_memory() on failure
 * (by default a macro that returns NULL; see the Notes section above).
 * The text to be lexed is specified by the NUL-terminated buffer (buf);
 * this buffer WILL NOT be modified by the lexer.
 * The NULL-terminated array of strings (toks) defines the set of tokens
 * that will be recognized by the lexer; these strings must be listed
 * in a case-insensitive ascending order (ie, according to strcasecmp).
 * NOTE(review): presumably (buf) and (toks) are referenced rather than
 * copied and so must outlive the lexer -- confirm against the
 * implementation.
 * Note: Abandoning a lexer without calling lex_destroy() will result
 * in a memory leak.
 */
void lex_destroy(Lex l);
/*
 * Destroys the lexer (l) previously returned by lex_create(), freeing the
 * memory used for the lexer itself.
 * NOTE(review): "the lexer itself" suggests the (buf) and (toks) given to
 * lex_create() are not freed here and remain the caller's responsibility
 * -- confirm against the implementation.
 */
int lex_next(Lex l);
/*
 * Returns the next token in the buffer given to lex_create()
 * according to the Laws of the Lexer.  The return value is one of:
 *   - a single-character token (eg, punctuation), specified by its
 *     ASCII code;
 *   - a common token from the common_tokens enumeration, including
 *     LEX_EOF at the end of the file/buffer and LEX_ERR on a lex error;
 *   - a token from the (toks) array of strings given to lex_create();
 *     these begin at LEX_TOK_OFFSET, so toks[i] is reported as
 *     LEX_TOK_OFFSET + i.
 */
int lex_prev(Lex l);
/*
 * Returns the last token returned by lex_next() for the lexer (l),
 * using the same token codes as lex_next().
 * NOTE(review): the result when lex_next() has not yet been called is
 * not specified in this header -- confirm before relying on it.
 */
int lex_line(Lex l);
/*
 * Returns the line number, within the buffer given to lex_create(), of
 * the last token returned by lex_next().
 * NOTE(review): whether numbering starts at 0 or 1 is not stated in this
 * header -- confirm against the implementation.
 */
const char * lex_text(Lex l);
/*
 * Returns the string corresponding to the last token returned by
 * lex_next().  The result is const-qualified and must not be modified
 * by the caller.
 * NOTE(review): presumably the string lives in lexer-owned storage that
 * is overwritten by the next lex_next() call and must not be freed --
 * confirm against the implementation.
 */
const char * lex_tok_to_str(Lex l, int tok);
/*
 * Returns the string from the lex_create() toks[] array corresponding to
 * the token (tok), or NULL if (tok) is outside of the toks[] array bounds.
 * This is the bounds-checked counterpart of the LEX_TOK2STR() macro,
 * which performs the same lookup without any checking.
 */
/*************************\
** Auxiliary Functions **
\*************************/
char * lex_encode(char *str);
/*
 * Encodes the string (str) so that it may safely be used by the lexer.
 * This is needed if the string may contain quote characters.
 * The string is modified in place, so (str) must point to writable
 * storage; it cannot be a constant (eg, a string literal).
 * Returns the encoded string; use lex_decode() to reverse the encoding.
 */
char * lex_decode(char *str);
/*
 * Decodes the string (str) that has been encoded with lex_encode().
 * The string is modified in place, so (str) must point to writable
 * storage; it cannot be a constant (eg, a string literal).
 * Returns the decoded string.
 */
/********************\
** Test Functions **
\********************/
void lex_parse_test(char *buf, char *toks[]);
/*
 * Example code that tokenizes the buffer (buf) based upon the
 * NULL-terminated array of strings (toks) that defines the
 * set of recognized tokens.  (buf) and (toks) have the same meaning
 * as the corresponding arguments to lex_create().
 * NOTE(review): how the resulting tokens are reported (eg, printed) is
 * not visible from this header -- see the implementation.
 */
#endif /* !_LEX_H */
|