1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330
|
#ifndef __JREGEX_H__ /* file wrapper */
#define __JREGEX_H__
#define jregex_version 106 /* package version as an integer: 106 stands for 1.06 */
/*
* Jeffrey Friedl
* Omron Corporation
* Nagaokakyoshi, Japan 617
*
* jfriedl@nff.ncl.omron.co.jp
*
* This work is placed under the terms of the GNU General Public License
* (the "GNU Copyleft").
*
*
* Jeffrey's REGEX routines.
* October 1993
*
* To apply a regular expression to a string, do something like:
*
*
* int match(const unsigned char *pattern, const unsigned char *string)
* {
* regex_t compiled;
*
* if (regcomp(&compiled, pattern, 0) != 0)
* return 0; <-- bad pattern
* else
* {
* int doesmatch = regexec(&compiled, string, strlen(string));
* regfree(&compiled);
* return doesmatch;
* }
* }
*
* These routines are optimized so that the actual matching routine
* (regexec) is very fast, at the expense of the compile routine (regcomp).
*
* Patterns and strings consist of any mixture of 7-bit ASCII and 16 bit
* EUC Japanese.
*
* A pattern consists of exact text to be matched, as well as the special
* characters:
*
* ( ) For grouping.
* < > For beginning/end of word (actually, same as \b)
* ^ $ For beginning/end of line.
* * Zero or more of the previous thing.
* ? Zero or one of the previous thing.
* + One or more of the previous thing
* . Any character (ASCII or EUC) except newline.
* \D Any character (ASCII OR EUC) not \d (see below)
* \W Any character (ASCII OR EUC) not \w (see below)
* \S Any character (ASCII OR EUC) not \s (see below)
* \a Any ASCII character not \n
* \A Any multibyte character.
* \k Any katakana character, including .
* \K Any non-katakana character except \n.
* \h Any hiragana character.
* \H Any non-hiragana character except \n.
* \c Any jis208 kanji (kuten rows 16-84).
* \C Anything not \c except \n.
* \b Boundary between words.
*
* [ ] Indicate character classes. Within classes, none of the
* above are special. If the first character after the open
* bracket is '^', any character *not* specified in the
* class will match.
*
* \n Newline.
* \t Tab.
* \r Carriage Return.
* \f Form-feed.
* \d A digit [0-9]
* \w An ASCII word element [0-9a-zA-Z_]
* \s Whitespace [\t \n\r\f]
* \## \### Two or three digit octal specifier is that octal number.
* Creating octal values above 127 will result in undefined
* behavior. Probably unfun undefined behavior.
#ifndef NO_PAREN_INFO
* \# To match the parenthesized expression starting with
* the #th paren. For example "(a)\1" will match "aa".
#endif
*
* \ A backslash before a character (other than the backslash
* combos indicated above) causes that character to be taken
* raw... i.e. removes the specialness from '+', etc.
*
* The highest precedence is with '*', '?', and '+'. Then parens.
* Then concatenation (sequences of characters, classes, etc).
* Then '|'.
*
* Note that when case is folded, \k, \K, \h, and \H do NOT fold.
*/
/*
 * Handle for a compiled regular expression.  The layout (struct regexbuf)
 * is declared further down in this header; callers allocate a REGEX_T
 * themselves, fill it with regcomp() and release its contents with
 * regfree().
 */
typedef struct regexbuf regex_t;
/*
 * Compile a pattern.
 *
 *   r       - caller-provided REGEX_T to fill in.
 *   pattern - null-terminated pattern text.
 *   flags   - flag bits for the compile (see the flag definitions in this
 *             header); the usage example at the top of the file passes 0.
 *
 * Returns one of the REGCOMP_* status values: REGCOMP_SUCCESS (0) when
 * the pattern compiled, one of the other REGCOMP_* codes otherwise.
 */
extern int regcomp(regex_t *r, const unsigned char *pattern, unsigned flags);
/*
 * Status values returned by regcomp().
 * REGCOMP_SUCCESS is zero; every other value names the reason the
 * pattern was rejected.  Values 12 and 13 exist only when paren info
 * support is compiled in (NO_PAREN_INFO not defined).
 */
#define REGCOMP_SUCCESS                  0  /* pattern compiled */
#define REGCOMP_INTERNAL_ERROR           1  /* internal error */
#define REGCOMP_INVALID_DATA             2  /* null pointer passed in */
#define REGCOMP_EMPTY_PATTERN            3  /* empty pattern */
#define REGCOMP_UNCLOSED_CLASS           4  /* class is missing its ']' */
#define REGCOMP_UNMATCHED_OPEN_PAREN     5  /* more '(' than ')' */
#define REGCOMP_UNMATCHED_CLOSE_PAREN    6  /* fewer '(' than ')' */
#define REGCOMP_MISUSED_COUNT_SPEC       7  /* ill-used '+', '*' or '?' */
#define REGCOMP_INFINITE_PLUS            8  /* something like "(x?)+" */
#define REGCOMP_INFINITE_STAR            9  /* something like "(x?)*" */
#define REGCOMP_EMPTY_CLASS             10  /* an empty class: [] */
#define REGCOMP_BAD_BRACE               11  /* misuse of | */
#ifndef NO_PAREN_INFO
#define REGCOMP_PAREN_LEVEL_INVALID     12  /* \# used when paren # is not defined */
#define REGCOMP_NEED_SAVE_PAREN_INFO    13  /* pattern has internal paren references */
#endif
#define REGCOMP_EUC_IN_CLASS_RANGE      14  /* EUC can't be part of a class range */
#define REGCOMP_INVALID_OCTAL_VALUE     15  /* something like \678 */
#define REGCOMP_CORRUPTED_TEXT          16  /* bad character codes in the text */
/*
 * Flag bits accepted by regcomp() and regexec_setflags().
 * Each flag is a distinct bit, so flags may be OR'ed together.
 */

/* Turn on debugging. */
#define REGCOMP_DEBUG                    0x0001
#define REGEXEC_DEBUG                    0x0002
#define REGEX_DEBUG                      (REGCOMP_DEBUG | REGEXEC_DEBUG)

/* Alphabetic case does not matter when comparing. */
#define REGCOMP_IGNORE_ALPHA_CASE        0x0004

/* Kana case (hiragana vs katakana) does not matter when comparing. */
#define REGCOMP_IGNORE_KANA_CASE         0x0008

/* Both of the above. */
#define REGCOMP_IGNORE_CASE \
    (REGCOMP_IGNORE_ALPHA_CASE | REGCOMP_IGNORE_KANA_CASE)

/*
 * For callers that only want a binary "does this line match or not"
 * answer.  Various shortcuts may then be taken, such as ignoring the
 * final ``z*'' in the pattern ``xyz*'' (every line matched by ``xyz*''
 * is also matched by the simpler ``xy'').
 * Expect strange but generally predictable results in
 * regexec_match_start, regexec_match_end, and the paren info when this
 * flag is used.
 */
#define REGCOMP_JUST_MATCH               0x0010

/*
 * Ask regcomp to calculate the list of characters that any matching
 * line must have, and to make it available via regmusthave().
 */
#define REGCOMP_CALC_MUSTHAVE            0x0020

#ifndef NO_PAREN_INFO
/*
 * Ask regcomp to compile so that regexec will save info about what text
 * was matched by what parens.  The info is saved to regexec_paren_info
 * (but, of course, not if that variable is null).
 */
#define REGCOMP_SAVE_MATCHED_PAREN_INFO  0x0040
#endif

/*
 * Make regexec set
 *     regexec_match_at_start_of_word
 * and
 *     regexec_match_at_end_of_word
 * upon exit.
 */
#define REGCOMP_WANT_WORD_MATCH_INFO     0x0080

/*
 * If set, will cause to match "appropriately" either way.
 * When set, and will match each-other.
 * NOTE(review): the example characters in the two sentences above
 * (presumably Japanese text) were lost to an encoding problem and cannot
 * be recovered from this file; restore them from an intact copy.
 */
#define REGCOMP_FUZZY_KANA_REPETITION    0x0100

/* Run the regex in longest-leftmost (for the overall regex) mode. */
#define REGEXEC_LLM                      0x0200
/****************************************************/
/*
 * Execute the given compiled REGEX_T on the given LINE (of the given LENGTH)
 * and return true if it matches, false otherwise.
 *
 *   r    - pattern previously filled in by regcomp(); taken as a pointer
 *          to const.
 *   line - text to apply the pattern to.
 *   len  - length of LINE; the usage example at the top of this header
 *          passes strlen(string).
 *
 * Mind the sense of the result: non-zero means "matched", whereas for
 * regcomp() it is zero (REGCOMP_SUCCESS) that means success.
 */
extern unsigned int regexec(const regex_t *r,
const unsigned char *line,
unsigned len);
#ifndef NO_REGEXEC_MATCH_POINTS
/*
 * When regexec() reports a match (returns true), the following point to
 * the start and end of the text that was actually matched.
 * Undefined otherwise.
 *
 * NOTE(review): this comment used to read "When regexec returns
 * REGCOMP_SUCCESS", but REGCOMP_SUCCESS is 0 while regexec() is documented
 * to return true on a match -- confirm against the implementation.
 *
 * Patterns compiled with REGCOMP_JUST_MATCH may give surprising values
 * here (see the comment on that flag).
 * These variables are not declared when NO_REGEXEC_MATCH_POINTS is defined.
 */
extern const unsigned char *regexec_match_start;
extern const unsigned char *regexec_match_end;
#endif
/*
 * If requested via REGCOMP_WANT_WORD_MATCH_INFO, indicates if the match
 * started and/or ended at a word boundary.
 * According to that flag's description, regexec() sets both variables
 * upon exit.  NOTE(review): their contents when the flag was not given
 * are not specified in this header -- do not rely on them.
 */
extern int regexec_match_at_start_of_word;
extern int regexec_match_at_end_of_word;
#ifndef NO_PAREN_INFO
/*
 * One saved paren match: where the text matched by a set of parens
 * starts and ends.
 */
typedef struct {
    const unsigned char *match_start;
    const unsigned char *match_end;
} matched_paren_t;

/*
 * The caller sets these two when matched paren info should be saved:
 * the array to fill, and the number of entries it has room for.
 */
extern matched_paren_t *regexec_paren_info;
extern unsigned int regexec_paren_info_size;  /* entries in regexec_paren_info[] */

/* regexec sets this one to say how many entries are now valid. */
extern unsigned int regexec_paren_info_used;  /* entries in the array actually used */
#endif
/*
 * Used to set flags (such as REGEXEC_DEBUG) for regexec().
 *
 *   flags - the new flag value.
 *
 * Returns the previous flag value.  If debugging has not been compiled
 * in, the value ~0 (all bits set) is returned instead, so a caller that
 * intends to restore the old flags later should check for ~0 first.
 */
extern unsigned int regexec_setflags(unsigned int flags);
/*
 * Optional hook for memory errors.  If the user sets this to a function,
 * it will be called upon a memory error.  The function takes no
 * arguments and shouldn't return.
 * NOTE(review): what happens when the hook is left unset, or when it does
 * return, is not visible from this header.
 */
extern void (*regex_memory_error)(void);
/*
 * Free any (non-temporary) memory associated with the given REGEX_T.
 *
 *   r - regex previously filled in by regcomp().
 *
 * The REGEX_T itself is not freed, however, as it wasn't allocated here;
 * releasing that storage remains the caller's job.
 * Safe to re-free a free'd regex.
 */
extern void regfree(regex_t *r);
/*
 * Return a list of characters that will be part of every matching
 * line, if known.
 *
 *   r - regex filled in by regcomp().
 *
 * Three outcomes:
 *   - NULL: regcomp was not called with REGCOMP_CALC_MUSTHAVE.
 *   - "" (empty string): the flag was given but there are no known
 *     required characters (such as for "a|b").
 *   - otherwise: the required characters.
 * NOTE(review): the returned pointer is const; presumably the storage
 * belongs to the REGEX_T and goes away with regfree() -- confirm.
 */
extern const unsigned char *regmusthave(const regex_t *r);
/*
 * When there's an error in regcomp, REGCOMP_ERROR is the error
 * number, and REGCOMP_EPTR (if non-null) points into the pattern
 * string near where the error was realized.
 *
 * NOTE(review): the two declarations below carry no description in this
 * header.  By their names, regcomp_last_pattern is presumably the pattern
 * most recently passed to regcomp, and regcomp_error_report() presumably
 * returns a printable description of the last error -- confirm against
 * the implementation before relying on either.
 */
extern unsigned int regcomp_error;
extern const unsigned char *regcomp_eptr;
extern const unsigned char *regcomp_last_pattern;
extern const unsigned char *regcomp_error_report(void);
#ifndef NO_SHOWREGEX
/*
 * For debugging.  Takes a compiled REGEX_T (as a pointer to const) and
 * returns nothing.  Not declared when NO_SHOWREGEX is defined.
 * NOTE(review): presumably prints the compiled form of R; where the
 * output goes is not visible from this header.
 */
extern void showregex(const regex_t *r);
#endif
/*
 * Body of a REGEX_T.  Callers have no need to look inside; the layout is
 * published only because the outside world must be able to declare one.
 * Member order is part of the binary layout -- do not rearrange.
 */
struct regexbuf {
    unsigned char *buf;             /* NOTE(review): presumably the compiled pattern -- confirm */
    unsigned char *bufend;          /* NOTE(review): presumably the end of BUF -- confirm */
    unsigned int min_length_match;
    unsigned char *musthave;        /* NOTE(review): presumably what regmusthave() reports -- confirm */
#ifndef NO_PAREN_INFO
    unsigned char max_paren_level;      /* read via reg_max_paren_level_used() */
    unsigned char max_lpc_l;
    unsigned char max_lpc_c;
    unsigned char paren_info_required;  /* read via reg_max_paren_level_required() */
#endif
    /* Three adjacent one-bit flags. */
    unsigned anchor_to_start_of_line:1;
    unsigned fold_acase:1;
    unsigned fold_kcase:1;
};

#ifndef NO_PAREN_INFO
/* Accessors for the paren-related members; BUF is a pointer to a regex. */
#define reg_max_paren_level_used(BUF) ((BUF)->max_paren_level)
#define reg_max_paren_level_required(BUF) ((BUF)->paren_info_required)
#endif
#ifndef NO_REGEX_STATS
/*
 * Set of int counters, published through the single global object
 * `regex_stats'.  Not available when NO_REGEX_STATS is defined.
 * NOTE(review): this header does not say when each counter is updated;
 * see the implementation.
 */
extern struct regex_stats {
    int states_pushed;
    int states_popped;
    int max_state_depth;
    int parens_pushed;
    int parens_popped;
    int parens_entered;
    int parens_saved;
    int cycles;
    int matches;
    int failures;
    int tests;
} regex_stats;
#endif /* NO_REGEX_STATS */

/*
 * Declared whether or not NO_REGEX_STATS is defined.
 * NOTE(review): presumably resets the counters above -- confirm.
 */
extern void regex_reset_stats(void);
#endif /* file wrapper */
|