File: jregex.h

package info (click to toggle)
lookup 1.08b-9
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 1,112 kB
  • ctags: 1,308
  • sloc: ansic: 12,637; makefile: 245; perl: 174; sh: 53
file content (330 lines) | stat: -rw-r--r-- 10,980 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
#ifndef __JREGEX_H__ /* file wrapper */
#define __JREGEX_H__

/* Version of these regex routines as an integer: 106 stands for 1.06. */
#define jregex_version 106 /* 1.06 */

/*
 * Jeffrey Friedl
 * Omron Corporation			ʳ
 * Nagaokakyoshi, Japan			617Ĺ
 *
 * jfriedl@nff.ncl.omron.co.jp
 *
 * This work is placed under the terms of the GNU General Purpose License
 * (the "GNU Copyleft").
 *
 *
 * Jeffrey's REGEX routines.
 * October 1993
 * 
 * To apply a regular expression to a string, do something like:
 *
 * 
 *   int match(const unsigned char *pattern, const unsigned char *string)
 *   {
 *      regex_t compiled;
 * 
 *      if (regcomp(&compiled, pattern, 0) != 0)
 *         return 0;   <-- bad pattern 
 *      else
 *      {
 *         int doesmatch = regexec(&compiled, string, strlen(string));
 *         regfree(&compiled);
 *         return doesmatch;
 *      }
 *   }
 * 
 *   These routines are optimized so that the actual matching routine
 *   (regexec) is very fast, at the expense of the compile routine (regcomp).
 *
 *   Patterns and strings consist of any mixture of 7-bit ASCII and 16 bit
 *   EUC Japanese.
 *
 *   A pattern consists of exact text to be matched, as well as the special
 *   characters:
 *
 *         (  )      For grouping.
 *         <  >      For beginning/end of word (actually, same as \b)
 *         ^  $      For beginning/end of line.
 *         *         Zero or more of the previous thing.
 *         ?         Zero or one of the previous thing.
 *         +         One or more of the previous thing
 *         .         Any character (ASCII or EUC) except newline.
 *	   \D        Any character (ASCII OR EUC) not \d (see below)
 *	   \W        Any character (ASCII OR EUC) not \w (see below)
 *	   \S        Any character (ASCII OR EUC) not \s (see below)
 *	   \a	     Any ASCII character not \n
 *	   \A        Any multibyte character.
 *	   \k        Any katakana character, including .
 *	   \K	     Any non-katakana character except \n. 
 *	   \h	     Any hiragana character.
 *	   \H	     Any non-hiragana character except \n.
 *         \c        Any jis208 kanji (kuten rows 16-84).
 *         \C        Anything not \c except \n.
 *	   \b        Boundary between words.
 *
 *         [  ]      Indicate character classes. Within classes, none of the
 *                   above are special. If the first character after the open
 *                   bracket is '^', any character *not* specified in the
 *                   class will match. 
 *
 *	   \n	     Newline.
 *	   \t	     Tab.
 *	   \r	     Carriage Return.
 *	   \f        Form-feed.
 *	   \d	     A digit [0-9]
 *	   \w	     An ASCII word element [0-9a-zA-Z_]
 *	   \s	     Whitespace [\t \n\r\f]
 *	   \## \###  Two or three digit octal specifier is that octal number.
 *                   Creating octal values above 127 will result in undefined
 *		     behavior. Probably unfun undefined behavior.
#ifndef NO_PAREN_INFO
 *	   \#        To match the parenthesized expression starting with
 *		     the #th paren. For example "(a)\1" will match "aa".
#endif
 *
 *         \         A backslash before a character (other than the backslash
 *                   combos indicated above) causes that character to be taken
 *                   raw... i.e. removes the specialness from '+', etc.
 * 
 *   The highest precedence is with '*', '?', and '+'. Then parens.
 *   Then concatenation (sequences of characters, classes, etc).
 *   Then '|'.
 *
 *   Note that when case is folded, \k, \K, \h, and \H do NOT fold.
 */

/*
 * Type used to hold a compiled regular expression. The caller declares one
 * and passes its address to regcomp(), regexec() and regfree().
 */
typedef struct regexbuf regex_t;

/*
 * Compile a pattern into a REGEX_T.
 *
 *   r       - the REGEX_T to fill.
 *   pattern - the null-terminated pattern text.
 *   flags   - flag bits (see the REGCOMP_* / REGEXEC_* flag definitions
 *             in this header).
 *
 * Returns one of the REGCOMP_* status values; REGCOMP_SUCCESS (0) means
 * the pattern compiled.
 */
extern int regcomp(regex_t *r, const unsigned char *pattern, unsigned flags);

/*
 * Status codes returned by regcomp(). REGCOMP_SUCCESS is zero; every other
 * value names the kind of problem that was found.
 */
#define REGCOMP_SUCCESS                 0  /* pattern compiled               */
#define REGCOMP_INTERNAL_ERROR          1  /* internal error                 */
#define REGCOMP_INVALID_DATA            2  /* a null pointer was passed in   */
#define REGCOMP_EMPTY_PATTERN           3  /* the pattern is empty           */

#define REGCOMP_UNCLOSED_CLASS          4  /* class is missing its ']'       */
#define REGCOMP_UNMATCHED_OPEN_PAREN    5  /* more '(' than ')'              */
#define REGCOMP_UNMATCHED_CLOSE_PAREN   6  /* fewer '(' than ')'             */

#define REGCOMP_MISUSED_COUNT_SPEC      7  /* ill-used '+', '*' or '?'       */
#define REGCOMP_INFINITE_PLUS           8  /* something like "(x?)+"         */
#define REGCOMP_INFINITE_STAR           9  /* something like "(x?)*"         */
#define REGCOMP_EMPTY_CLASS            10  /* an empty class "[]"            */
#define REGCOMP_BAD_BRACE              11  /* misuse of '|'                  */

#ifndef NO_PAREN_INFO
#define REGCOMP_PAREN_LEVEL_INVALID    12  /* \# used, paren # not defined   */
#define REGCOMP_NEED_SAVE_PAREN_INFO   13  /* has internal paren references  */
#endif

#define REGCOMP_EUC_IN_CLASS_RANGE     14  /* EUC can't be part of a range   */
#define REGCOMP_INVALID_OCTAL_VALUE    15  /* something like \678            */
#define REGCOMP_CORRUPTED_TEXT         16  /* bad character codes in text    */

/* Flag bits accepted by regcomp() and regexec_setflags(). */

  /* Debugging output: for compiling, for executing, or for both. */
  #define REGCOMP_DEBUG                    0x0001
  #define REGEXEC_DEBUG                    0x0002
  #define REGEX_DEBUG                      (REGCOMP_DEBUG | REGEXEC_DEBUG)

  /* Alphabetic case does not matter when comparing. */
  #define REGCOMP_IGNORE_ALPHA_CASE        0x0004

  /* Kana case (hiragana vs katakana) does not matter when comparing. */
  #define REGCOMP_IGNORE_KANA_CASE         0x0008

  /* Both kinds of case folding at once. */
  #define REGCOMP_IGNORE_CASE              (REGCOMP_IGNORE_ALPHA_CASE | REGCOMP_IGNORE_KANA_CASE)

  /*
   * For callers that only need a yes/no "does this line match" answer.
   * Shortcuts may then be taken, such as dropping the final ``z*'' of the
   * pattern ``xyz*'' (every line matched by ``xyz*'' is also matched by
   * the simpler ``xy'').
   * Expect strange but generally predictable results in
   * regexec_match_start, regexec_match_end, and the paren info when
   * this is set.
   */
  #define REGCOMP_JUST_MATCH               0x0010

  /*
   * Ask regcomp to work out the list of characters that any matching
   * line must have; the list is then available via regmusthave().
   */
  #define REGCOMP_CALC_MUSTHAVE            0x0020

#ifndef NO_PAREN_INFO
  /*
   * Compile so that regexec saves info about what text was matched by
   * what parens. It is saved to regexec_paren_info (unless that variable
   * is null, in which case nothing is saved).
   */
  #define REGCOMP_SAVE_MATCHED_PAREN_INFO  0x0040
#endif

  /*
   * Have regexec set
   *    regexec_match_at_start_of_word
   * and
   *    regexec_match_at_end_of_word
   * upon exit.
   */
  #define REGCOMP_WANT_WORD_MATCH_INFO     0x0080

  /*
   * Fuzzy handling of kana repetition. The description as found reads:
   *    If set, will cause  to match "appropriately" either way.
   *    When set,  and  will match each-other.
   * NOTE(review): the characters being described are missing from that
   * text (presumably non-ASCII characters lost in a character-set
   * conversion) -- restore them from the upstream source.
   */
  #define REGCOMP_FUZZY_KANA_REPETITION    0x0100

  /* Run the regex in longest-leftmost (for overall regex) mode. */
  #define REGEXEC_LLM                      0x0200

/****************************************************/


/*
 * Execute a compiled regex against a line of text.
 *
 *   r    - the compiled REGEX_T to execute.
 *   line - the text to test.
 *   len  - the length of LINE (the usage example at the top of this file
 *          passes strlen() of the string).
 *
 * Returns true if the line matches, false otherwise.
 */
extern unsigned int regexec(const regex_t *r,
			    const unsigned char *line,
			    unsigned len);

#ifndef NO_REGEXEC_MATCH_POINTS
/*
 * When regexec reports a match (returns true), the following will point to
 * the start and end of the text that was actually matched.
 * Undefined otherwise.
 * Only declared when NO_REGEXEC_MATCH_POINTS is not defined.
 */
extern const unsigned char *regexec_match_start;
extern const unsigned char *regexec_match_end;
#endif

/*
 * If requested via REGCOMP_WANT_WORD_MATCH_INFO, regexec sets these upon
 * exit to indicate whether the match started and/or ended at a word
 * boundary.
 */
extern int regexec_match_at_start_of_word;
extern int regexec_match_at_end_of_word;

#ifndef NO_PAREN_INFO
/*
 * One record of text matched by a set of parens: pointers to where the
 * matched text starts and ends.
 * NOTE(review): whether match_end points at or just past the last matched
 * byte is not visible from this header -- confirm against regexec().
 */
typedef struct
{
    const unsigned char *match_start;
    const unsigned char *match_end;
} matched_paren_t;
/*
 * User fills in the following two if they want the matched paren info saved
 * (see REGCOMP_SAVE_MATCHED_PAREN_INFO). Nothing is saved while
 * regexec_paren_info is null.
 */
extern matched_paren_t *regexec_paren_info;
extern unsigned regexec_paren_info_size; /* entries in regexec_paren_info[] */

/* While regexec will fill in this one telling how many are now valid. */
extern unsigned regexec_paren_info_used; /* entries in above actually used */

#endif


/*
 * Set the flags (such as REGEXEC_DEBUG) used by regexec().
 *
 *   flags - the new flag bits.
 *
 * Returns the previous value. If debugging has not been compiled in,
 * the value ~0 is returned instead.
 */
extern unsigned int regexec_setflags(unsigned int flags);
			   
/*
 * Optional hook for memory errors. If the user sets this to a function,
 * that function will be called upon a memory error. It takes no arguments
 * and shouldn't return.
 */
extern void (*regex_memory_error)(void);

/*
 * Free any (non-temporary) memory associated with the given REGEX_T.
 *
 *   r - the REGEX_T whose memory is to be released.
 *
 * The REGEX_T itself is not freed, however, as it wasn't allocated here.
 * Safe to re-free a free'd regex.
 */
extern void regfree(regex_t *r);

/*
 * Return a list of characters that will be part of every matching
 * line, if known.
 *
 *   r - a REGEX_T compiled by regcomp() with REGCOMP_CALC_MUSTHAVE.
 *
 * Returns NULL if regcomp was not called with REGCOMP_CALC_MUSTHAVE.
 * If there are no known required characters (such as for "a|b"), an
 * empty string ("") is returned.
 */
extern const unsigned char *regmusthave(const regex_t *r);

/*
 * When there's an error in regcomp, regcomp_error is the error
 * number, and regcomp_eptr (if non-null) points into the pattern
 * string near where the error was realized.
 *
 * NOTE(review): regcomp_last_pattern and regcomp_error_report() are not
 * described in this header. Presumably the former is the pattern most
 * recently handed to regcomp() and the latter returns a printable
 * description of the error -- confirm against the implementation.
 */
extern unsigned int regcomp_error;
extern const unsigned char *regcomp_eptr;
extern const unsigned char *regcomp_last_pattern;
extern const unsigned char *regcomp_error_report(void);



#ifndef NO_SHOWREGEX
/*
 * For debugging. Takes a compiled REGEX_T; only declared when NO_SHOWREGEX
 * is not defined.
 * NOTE(review): presumably prints the compiled form of the regex; the
 * output destination is not visible from this header -- confirm.
 */
extern void showregex(const regex_t *r);
#endif


/*
 * Storage behind regex_t. Users have no need to look inside, but the
 * definition must be visible to the outside so that a regex_t can be
 * declared.
 *
 * NOTE(review): member meanings marked "presumably" are inferred from the
 * names only -- confirm against the implementation.
 */
struct regexbuf
{
    unsigned char *buf;              /* presumably start of the compiled form */
    unsigned char *bufend;           /* presumably end of the compiled form   */
    unsigned int   min_length_match;
    unsigned char *musthave;         /* presumably what regmusthave() reports */
#ifndef NO_PAREN_INFO
    unsigned char  max_paren_level;     /* read by reg_max_paren_level_used()     */
    unsigned char  max_lpc_l;
    unsigned char  max_lpc_c;
    unsigned char  paren_info_required; /* read by reg_max_paren_level_required() */
#endif
    unsigned int   anchor_to_start_of_line : 1;
    unsigned int   fold_acase : 1;
    unsigned int   fold_kcase : 1;
};

#ifndef NO_PAREN_INFO
/* Accessors for the paren bookkeeping; BUF is a pointer to a regex_t. */
#define reg_max_paren_level_used(BUF)     ((BUF)->max_paren_level)
#define reg_max_paren_level_required(BUF) ((BUF)->paren_info_required)
#endif

#ifndef NO_REGEX_STATS
  /*
   * Counters kept by the regex routines; every member is a plain int.
   * Only available when NO_REGEX_STATS is not defined.
   */
  struct regex_stats
  {
      int states_pushed;
      int states_popped;
      int max_state_depth;
      int parens_pushed;
      int parens_popped;
      int parens_entered;
      int parens_saved;
      int cycles;
      int matches;
      int failures;
      int tests;
  };

  /* The single statistics object, defined elsewhere. */
  extern struct regex_stats regex_stats;

#endif /* NO_REGEX_STATS */
/* Declared whether or not NO_REGEX_STATS is defined. */
extern void regex_reset_stats(void);

#endif /* file wrapper */