File: rxsuper.h

package info (click to toggle)
rplay 3.3.2-8
  • links: PTS
  • area: main
  • in suites: sarge, woody
  • size: 3,136 kB
  • ctags: 3,519
  • sloc: ansic: 42,159; makefile: 1,242; perl: 742; tcl: 345; sh: 312; java: 220
file content (446 lines) | stat: -rw-r--r-- 16,342 bytes parent folder | download | duplicates (15)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
/* classes: h_files */

#ifndef RXSUPERH
#define RXSUPERH

/*	Copyright (C) 1995, 1996 Tom Lord
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Library General Public License for more details.
 * 
 * You should have received a copy of the GNU Library General Public License
 * along with this software; see the file COPYING.  If not, write to
 * the Free Software Foundation, 59 Temple Place - Suite 330, 
 * Boston, MA 02111-1307, USA. 
 */

/*  lord	Sun May  7 12:40:17 1995	*/



#include "rxnfa.h"



/* This begins the description of the superstate NFA.
 *
 * The superstate NFA corresponds to the NFA in these ways:
 *
 * Superstate states correspond to sets of NFA states (nfa_states(SUPER)),
 *
 * Superstate edges correspond to NFA paths.
 *
 * The superstate has no epsilon transitions;
 * every edge has a character label, and a (possibly empty) side
 * effect label.   The side effect label corresponds to a list of
 * side effects that occur in the NFA.  These parts are referred
 * to as:   superedge_character(EDGE) and superedge_sides(EDGE).
 *
 * For a superstate edge EDGE starting in some superstate SUPER,
 * the following is true (in pseudo-notation :-):
 *
 *       exists DEST in nfa_states s.t. 
 *         exists nfaEDGE in nfa_edges s.t.
 *                 origin (nfaEDGE) == DEST
 *              && origin (nfaEDGE) is a member of nfa_states(SUPER)
 *              && exists PF in possible_futures(dest(nfaEDGE)) s.t.
 * 	                sides_of_possible_future (PF) == superedge_sides (EDGE)
 *
 * also:
 *
 *      let SUPER2 := superedge_destination(EDGE)
 *          nfa_states(SUPER2)
 *           == union of all nfa state sets S s.t.
 *                          exists PF in possible_futures(dest(nfaEDGE)) s.t.
 * 	                       sides_of_possible_future (PF) == superedge_sides (EDGE)
 *                          && S == dests_of_possible_future (PF) }
 *
 * Or in english, every superstate is a set of nfa states.  A given
 * character and a superstate implies many transitions in the NFA --
 * those that begin with an edge labeled with that character from a
 * state in the set corresponding to the superstate.
 * 
 * The destinations of those transitions each have a set of possible
 * futures.  A possible future is a list of side effects and a set of
 * destination NFA states.  Two sets of possible futures can be
 * `merged' by combining all pairs of possible futures that have the
 * same side effects.  A pair is combined by creating a new future
 * with the same side effect but the union of the two destination sets.
 * In this way, all the possible futures suggested by a superstate
 * and a character can be merged into a set of possible futures where
 * no two elements of the set have the same set of side effects.
 *
 * The destination of a possible future, being a set of NFA states, 
 * corresponds to a supernfa state.  So, the merged set of possible
 * futures we just created can serve as a set of edges in the
 * supernfa.
 *
 * The representation of the superstate nfa and the nfa is critical.
 * The nfa has to be compact, but has to facilitate the rapid
 * computation of missing superstates.  The superstate nfa has to 
 * be fast to interpret, lazilly constructed, and bounded in space.
 *
 * To facilitate interpretation, the superstate data structures are 
 * peppered with `instruction frames'.  There is an instruction set
 * defined below which matchers using the supernfa must be able to
 * interpret.
 *
 * We'd like to make it possible but not mandatory to use code
 * addresses to represent instructions (c.f. gcc's computed goto).
 * Therefore, we define an enumerated type of opcodes, and when
 * writing one of these instructions into a data structure, use
 * the opcode as an index into a table of instruction values.
 * 
 * Below are the opcodes that occur in the superstate nfa.
 *
 * The descriptions of the opcodes refer to data structures that are
 * described further below. 
 */

enum rx_opcode
{
  /* 
   * BACKTRACK_POINT is invoked when a character transition in 
   * a superstate leads to more than one edge.  In that case,
   * the edges have to be explored independently using a backtracking
   * strategy.
   *
   * A BACKTRACK_POINT instruction is stored in a superstate's 
   * transition table for some character when it is known that that
   * character crosses more than one edge.  On encountering this
   * instruction, the matcher saves enough state to backtrack to this
   * point later in the match.
   */
  rx_backtrack_point = 0,	/* data is (struct transition_class *) */

  /* 
   * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
   * There is one occurence of this instruction per rx_distinct_future.
   * This instruction is skipped if a rx_distinct_future has no side effects.
   */
  rx_do_side_effects = rx_backtrack_point + 1,

  /* data is (struct rx_distinct_future *) */

  /* 
   * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
   * destination superstate has been reclaimed (or was never built).
   * It recomputes the destination superstate.
   * RX_CACHE_MISS is also stored in a superstate transition table before
   * any of its edges have been built.
   */
  rx_cache_miss = rx_do_side_effects + 1,
  /* data is (struct rx_distinct_future *) */

  /* 
   * RX_NEXT_CHAR is called to consume the next character and take the
   * corresponding transition.  This is the only instruction that uses 
   * the DATA field of the instruction frame instead of DATA_2.
   * The comments about rx_inx explain this further.
   */
  rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */

  /* RX_BACKTRACK indicates that a transition fails.  Don't
   * confuse this with rx_backtrack_point.
   */
  rx_backtrack = rx_next_char + 1, /* no data */

  /* 
   * RX_ERROR_INX is stored only in places that should never be executed.
   */
  rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */

  rx_num_instructions = rx_error_inx + 1
};

/* The heart of the matcher is a `word-code-interpreter' 
 * (like a byte-code interpreter, except that instructions
 * are a full word wide).
 *
 * Instructions are not stored in a vector of code, instead,
 * they are scattered throughout the data structures built
 * by the regexp compiler and the matcher.  One word-code instruction,
 * together with the arguments to that instruction, constitute
 * an instruction frame (struct rx_inx).
 *
 * This structure type is padded by hand to a power of 2 because
 * in one of the dominant cases, we dispatch by indexing a table
 * of instruction frames.  If that indexing can be accomplished
 * by just a shift of the index, we're happy.
 *
 * Instructions take at most one argument, but there are two
 * slots in an instruction frame that might hold that argument.
 * These are called data and data_2.  The data slot is only
 * used for one instruction (RX_NEXT_CHAR).  For all other 
 * instructions, data should be set to 0.
 *
 * RX_NEXT_CHAR is the most important instruction by far.
 * By reserving the data field for its exclusive use, 
 * instruction dispatch is sped up in that case.  There is
 * no need to fetch both the instruction and the data,
 * only the data is needed.  In other words, a `cycle' begins
 * by fetching the field data.  If that is non-0, then it must
 * be the destination state of a next_char transition, so
 * make that value the current state, advance the match position
 * by one character, and start a new cycle.  On the other hand,
 * if data is 0, fetch the instruction and do a more complicated
 * dispatch on that.
 */

struct rx_inx 
{
  void * data;
  void * data_2;
  void * inx;
  void * fnord;
};

#ifndef RX_TAIL_ARRAY
#define RX_TAIL_ARRAY  1
#endif

/* A superstate corresponds to a set of nfa states.  Those sets are
 * represented by STRUCT RX_SUPERSET.  The constructors
 * guarantee that only one (shared) structure is created for a given set.
 */
struct rx_superset
{
  int refs;			/* This is a reference counted structure. */

  /* We keep these sets in a cache because (in an unpredictable way),
   * the same set is often created again and again.  
   *
   * When an NFA is destroyed, some of the supersets for that NFA
   * may still exist.  This can lead to false cache hits -- an apparent cache
   * hit on a superset that properly belongs to an already free NFA.
   *
   * When a cache hit appears to occur, we will have in hand the
   * nfa for which it may have happened.  Every nfa is given
   * its own sequence number.  The cache is validated
   * by comparing the nfa sequence number to this field:
   */
  int id;

  struct rx_nfa_state * car;	/* May or may not be a valid addr. */
  struct rx_superset * cdr;

  /* If the corresponding superstate exists: */
  struct rx_superstate * superstate;

  /* That is_final field of the constiuent nfa states which has the greatest magnitude. */
  int is_final;

  /* The OR of the corresponding fields of the constiuent nfa states. */
  int has_cset_edges;


  /* There is another bookkeeping problem.  It is expensive to 
   * compute the starting nfa state set for an nfa.  So, once computed,
   * it is cached in the `struct rx'.
   *
   * But, the state set can be flushed from the superstate cache.
   * When that happens, the cached value in the `struct rx' has
   * to be flushed.
   */
  struct rx * starts_for;

  /* This is used to link into a hash bucket so these objects can
   * be `hash-consed'.
   */
  struct rx_hash_item hash_item;
};

#define rx_protect_superset(RX,CON) (++(CON)->refs)

/* The terminology may be confusing (rename this structure?).
 * Every character occurs in at most one rx_super_edge per super-state.
 * But, that structure might have more than one option, indicating a point
 * of non-determinism. 
 *
 * In other words, this structure holds a list of superstate edges
 * sharing a common starting state and character label.  The edges
 * are in the field OPTIONS.  All superstate edges sharing the same
 * starting state and character are in this list.
 */
struct rx_super_edge
{
  struct rx_super_edge *next;
  struct rx_inx rx_backtrack_frame;
  int cset_size;
  rx_Bitset cset;
  struct rx_distinct_future *options;
};

/* A superstate is a set of nfa states (RX_SUPERSET) along
 * with a transition table.  Superstates are built on demand and reclaimed
 * without warning.  To protect a superstate from this ghastly fate,
 * use LOCK_SUPERSTATE. 
 */
struct rx_superstate
{
  int rx_id;			/* c.f. the id field of rx_superset */
  int locks;			/* protection from reclamation */

  /* Within a superstate cache, all the superstates are kept in a big
   * queue.  The tail of the queue is the state most likely to be
   * reclaimed.  The *recyclable fields hold the queue position of 
   * this state.
   */
  struct rx_superstate * next_recyclable;
  struct rx_superstate * prev_recyclable;

  /* The supernfa edges that exist in the cache and that have
   * this state as their destination are kept in this list:
   */
  struct rx_distinct_future * transition_refs;

  /* The list of nfa states corresponding to this superstate: */
  struct rx_superset * contents;

  /* The list of edges in the cache beginning from this state. */
  struct rx_super_edge * edges;

  /* A tail of the recyclable queue is marked as semifree.  A semifree
   * state has no incoming next_char transitions -- any transition
   * into a semifree state causes a complex dispatch with the side
   * effect of rescuing the state from its semifree state into a 
   * fully free state at the head of the queue.
   *
   * An alternative to this might be to make next_char more expensive,
   * and to move a state to the head of the recyclable queue whenever
   * it is entered.  That way, popular states would never be recycled.
   *
   * But unilaterally making next_char more expensive actually loses.
   * So, incoming transitions are only made expensive for states near
   * the tail of the recyclable queue.  The more cache contention
   * there is, the more frequently a state will have to prove itself
   * and be moved back to the front of the queue.  If there is less 
   * contention, then popular states just aggregate in the front of 
   * the queue and stay there.
   */
  int is_semifree;


  /* This keeps track of the size of the transition table for this
   * state.  There is a half-hearted attempt to support variable sized
   * superstates.
   */
  int trans_size;

  /* Indexed by characters... */
  struct rx_inx transitions[RX_TAIL_ARRAY];
};


/* A list of distinct futures define the edges that leave from a 
 * given superstate on a given character.  c.f. rx_super_edge.
 */
struct rx_distinct_future
{
  struct rx_distinct_future * next_same_super_edge[2];
  struct rx_distinct_future * next_same_dest;
  struct rx_distinct_future * prev_same_dest;
  struct rx_superstate * present;	/* source state */
  struct rx_superstate * future;	/* destination state */
  struct rx_super_edge * edge;


  /* The future_frame holds the instruction that should be executed
   * after all the side effects are done, when it is time to complete
   * the transition to the next state.
   *
   * Normally this is a next_char instruction, but it may be a
   * cache_miss instruction as well, depending on whether or not
   * the superstate is in the cache and semifree.
   * 
   * If this is the only future for a given superstate/char, and
   * if there are no side effects to be performed, this frame is
   * not used (directly) at all.  Instead, its contents are copied
   * into the transition table of the starting state of this dist. future
   * (a sort of goto elimination).
   */
  struct rx_inx future_frame;

  struct rx_inx side_effects_frame;
  struct rx_se_list * effects;
};

#define rx_lock_superstate(R,S)  ((S)->locks++)
#define rx_unlock_superstate(R,S) (--(S)->locks)

struct rx_cache;

#ifdef __STDC__
typedef void (*rx_morecore_fn)(struct rx_cache *);
#else
typedef void (*rx_morecore_fn)();
#endif

struct rx_cache
{
  struct rx_hash_rules superset_hash_rules;

  struct rx_superstate * lru_superstate;
  struct rx_superstate * semifree_superstate;

  struct rx_superset * empty_superset;

  int superstates;
  int semifree_superstates;
  int hits;
  int misses;

  int bytes_allowed;
  int bytes_used;

  int local_cset_size;
  void ** instruction_table;

  struct rx_hash superset_table;
};

#ifndef RX_DEFAULT_DFA_CACHE_SIZE
/* This is an upper bound on the number of bytes that may (normally)
 * be allocated for DFA states.  If this threshold would be exceeded,
 * Rx tries to flush some DFA states from the cache.
 *
 * This value is used whenever the rx_default_cache is used (for example,
 * with the Posix entry points).
 */
#define RX_DEFAULT_DFA_CACHE_SIZE (1 << 19)
#endif

extern struct rx_cache * rx_default_cache;


#ifdef __STDC__
extern char * rx_cache_malloc (struct rx_cache * cache, int size);
extern void rx_cache_free (struct rx_cache * cache, int size, char * mem);
extern void rx_release_superset (struct rx *rx,
				 struct rx_superset *set);
extern struct rx_superset * rx_superset_cons (struct rx * rx,
					      struct rx_nfa_state *car, struct rx_superset *cdr);
extern struct rx_superset * rx_superstate_eclosure_union (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl) ;
extern struct rx_superstate * rx_superstate (struct rx *rx,
					     struct rx_superset *set);
extern struct rx_inx * rx_handle_cache_miss (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data) ;

#else /* STDC */
extern char * rx_cache_malloc ();
extern void rx_cache_free ();
extern void rx_release_superset ();
extern struct rx_superset * rx_superset_cons ();
extern struct rx_superset * rx_superstate_eclosure_union ();
extern struct rx_superstate * rx_superstate ();
extern struct rx_inx * rx_handle_cache_miss ();

#endif /* STDC */

#endif  /* RXSUPERH */