1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446
|
/* classes: h_files */
#ifndef RXSUPERH
#define RXSUPERH
/* Copyright (C) 1995, 1996 Tom Lord
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Library General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public License
* along with this software; see the file COPYING. If not, write to
* the Free Software Foundation, 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/* lord Sun May 7 12:40:17 1995 */
#include "rxnfa.h"
/* This begins the description of the superstate NFA.
*
* The superstate NFA corresponds to the NFA in these ways:
*
* Superstate states correspond to sets of NFA states (nfa_states(SUPER)),
*
* Superstate edges correspond to NFA paths.
*
* The superstate has no epsilon transitions;
* every edge has a character label, and a (possibly empty) side
* effect label. The side effect label corresponds to a list of
* side effects that occur in the NFA. These parts are referred
* to as: superedge_character(EDGE) and superedge_sides(EDGE).
*
* For a superstate edge EDGE starting in some superstate SUPER,
* the following is true (in pseudo-notation :-):
*
* exists DEST in nfa_states s.t.
* exists nfaEDGE in nfa_edges s.t.
* origin (nfaEDGE) == DEST
* && origin (nfaEDGE) is a member of nfa_states(SUPER)
* && exists PF in possible_futures(dest(nfaEDGE)) s.t.
* sides_of_possible_future (PF) == superedge_sides (EDGE)
*
* also:
*
* let SUPER2 := superedge_destination(EDGE)
* nfa_states(SUPER2)
* == union of all nfa state sets S s.t.
* exists PF in possible_futures(dest(nfaEDGE)) s.t.
* sides_of_possible_future (PF) == superedge_sides (EDGE)
* && S == dests_of_possible_future (PF) }
*
* Or in english, every superstate is a set of nfa states. A given
* character and a superstate implies many transitions in the NFA --
* those that begin with an edge labeled with that character from a
* state in the set corresponding to the superstate.
*
* The destinations of those transitions each have a set of possible
* futures. A possible future is a list of side effects and a set of
* destination NFA states. Two sets of possible futures can be
* `merged' by combining all pairs of possible futures that have the
* same side effects. A pair is combined by creating a new future
* with the same side effect but the union of the two destination sets.
* In this way, all the possible futures suggested by a superstate
* and a character can be merged into a set of possible futures where
* no two elements of the set have the same set of side effects.
*
* The destination of a possible future, being a set of NFA states,
* corresponds to a supernfa state. So, the merged set of possible
* futures we just created can serve as a set of edges in the
* supernfa.
*
* The representation of the superstate nfa and the nfa is critical.
* The nfa has to be compact, but has to facilitate the rapid
* computation of missing superstates. The superstate nfa has to
* be fast to interpret, lazilly constructed, and bounded in space.
*
* To facilitate interpretation, the superstate data structures are
* peppered with `instruction frames'. There is an instruction set
* defined below which matchers using the supernfa must be able to
* interpret.
*
* We'd like to make it possible but not mandatory to use code
* addresses to represent instructions (c.f. gcc's computed goto).
* Therefore, we define an enumerated type of opcodes, and when
* writing one of these instructions into a data structure, use
* the opcode as an index into a table of instruction values.
*
* Below are the opcodes that occur in the superstate nfa.
*
* The descriptions of the opcodes refer to data structures that are
* described further below.
*/
enum rx_opcode
{
/*
* BACKTRACK_POINT is invoked when a character transition in
* a superstate leads to more than one edge. In that case,
* the edges have to be explored independently using a backtracking
* strategy.
*
* A BACKTRACK_POINT instruction is stored in a superstate's
* transition table for some character when it is known that that
* character crosses more than one edge. On encountering this
* instruction, the matcher saves enough state to backtrack to this
* point later in the match.
*/
rx_backtrack_point = 0, /* data is (struct transition_class *) */
/*
* RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
* There is one occurence of this instruction per rx_distinct_future.
* This instruction is skipped if a rx_distinct_future has no side effects.
*/
rx_do_side_effects = rx_backtrack_point + 1,
/* data is (struct rx_distinct_future *) */
/*
* RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
* destination superstate has been reclaimed (or was never built).
* It recomputes the destination superstate.
* RX_CACHE_MISS is also stored in a superstate transition table before
* any of its edges have been built.
*/
rx_cache_miss = rx_do_side_effects + 1,
/* data is (struct rx_distinct_future *) */
/*
* RX_NEXT_CHAR is called to consume the next character and take the
* corresponding transition. This is the only instruction that uses
* the DATA field of the instruction frame instead of DATA_2.
* The comments about rx_inx explain this further.
*/
rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
/* RX_BACKTRACK indicates that a transition fails. Don't
* confuse this with rx_backtrack_point.
*/
rx_backtrack = rx_next_char + 1, /* no data */
/*
* RX_ERROR_INX is stored only in places that should never be executed.
*/
rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
rx_num_instructions = rx_error_inx + 1
};
/* The heart of the matcher is a `word-code-interpreter'
* (like a byte-code interpreter, except that instructions
* are a full word wide).
*
* Instructions are not stored in a vector of code, instead,
* they are scattered throughout the data structures built
* by the regexp compiler and the matcher. One word-code instruction,
* together with the arguments to that instruction, constitute
* an instruction frame (struct rx_inx).
*
* This structure type is padded by hand to a power of 2 because
* in one of the dominant cases, we dispatch by indexing a table
* of instruction frames. If that indexing can be accomplished
* by just a shift of the index, we're happy.
*
* Instructions take at most one argument, but there are two
* slots in an instruction frame that might hold that argument.
* These are called data and data_2. The data slot is only
* used for one instruction (RX_NEXT_CHAR). For all other
* instructions, data should be set to 0.
*
* RX_NEXT_CHAR is the most important instruction by far.
* By reserving the data field for its exclusive use,
* instruction dispatch is sped up in that case. There is
* no need to fetch both the instruction and the data,
* only the data is needed. In other words, a `cycle' begins
* by fetching the field data. If that is non-0, then it must
* be the destination state of a next_char transition, so
* make that value the current state, advance the match position
* by one character, and start a new cycle. On the other hand,
* if data is 0, fetch the instruction and do a more complicated
* dispatch on that.
*/
struct rx_inx
{
void * data;
void * data_2;
void * inx;
void * fnord;
};
#ifndef RX_TAIL_ARRAY
#define RX_TAIL_ARRAY 1
#endif
/* A superstate corresponds to a set of nfa states. Those sets are
* represented by STRUCT RX_SUPERSET. The constructors
* guarantee that only one (shared) structure is created for a given set.
*/
struct rx_superset
{
int refs; /* This is a reference counted structure. */
/* We keep these sets in a cache because (in an unpredictable way),
* the same set is often created again and again.
*
* When an NFA is destroyed, some of the supersets for that NFA
* may still exist. This can lead to false cache hits -- an apparent cache
* hit on a superset that properly belongs to an already free NFA.
*
* When a cache hit appears to occur, we will have in hand the
* nfa for which it may have happened. Every nfa is given
* its own sequence number. The cache is validated
* by comparing the nfa sequence number to this field:
*/
int id;
struct rx_nfa_state * car; /* May or may not be a valid addr. */
struct rx_superset * cdr;
/* If the corresponding superstate exists: */
struct rx_superstate * superstate;
/* That is_final field of the constiuent nfa states which has the greatest magnitude. */
int is_final;
/* The OR of the corresponding fields of the constiuent nfa states. */
int has_cset_edges;
/* There is another bookkeeping problem. It is expensive to
* compute the starting nfa state set for an nfa. So, once computed,
* it is cached in the `struct rx'.
*
* But, the state set can be flushed from the superstate cache.
* When that happens, the cached value in the `struct rx' has
* to be flushed.
*/
struct rx * starts_for;
/* This is used to link into a hash bucket so these objects can
* be `hash-consed'.
*/
struct rx_hash_item hash_item;
};
#define rx_protect_superset(RX,CON) (++(CON)->refs)
/* The terminology may be confusing (rename this structure?).
* Every character occurs in at most one rx_super_edge per super-state.
* But, that structure might have more than one option, indicating a point
* of non-determinism.
*
* In other words, this structure holds a list of superstate edges
* sharing a common starting state and character label. The edges
* are in the field OPTIONS. All superstate edges sharing the same
* starting state and character are in this list.
*/
struct rx_super_edge
{
struct rx_super_edge *next;
struct rx_inx rx_backtrack_frame;
int cset_size;
rx_Bitset cset;
struct rx_distinct_future *options;
};
/* A superstate is a set of nfa states (RX_SUPERSET) along
* with a transition table. Superstates are built on demand and reclaimed
* without warning. To protect a superstate from this ghastly fate,
* use LOCK_SUPERSTATE.
*/
struct rx_superstate
{
int rx_id; /* c.f. the id field of rx_superset */
int locks; /* protection from reclamation */
/* Within a superstate cache, all the superstates are kept in a big
* queue. The tail of the queue is the state most likely to be
* reclaimed. The *recyclable fields hold the queue position of
* this state.
*/
struct rx_superstate * next_recyclable;
struct rx_superstate * prev_recyclable;
/* The supernfa edges that exist in the cache and that have
* this state as their destination are kept in this list:
*/
struct rx_distinct_future * transition_refs;
/* The list of nfa states corresponding to this superstate: */
struct rx_superset * contents;
/* The list of edges in the cache beginning from this state. */
struct rx_super_edge * edges;
/* A tail of the recyclable queue is marked as semifree. A semifree
* state has no incoming next_char transitions -- any transition
* into a semifree state causes a complex dispatch with the side
* effect of rescuing the state from its semifree state into a
* fully free state at the head of the queue.
*
* An alternative to this might be to make next_char more expensive,
* and to move a state to the head of the recyclable queue whenever
* it is entered. That way, popular states would never be recycled.
*
* But unilaterally making next_char more expensive actually loses.
* So, incoming transitions are only made expensive for states near
* the tail of the recyclable queue. The more cache contention
* there is, the more frequently a state will have to prove itself
* and be moved back to the front of the queue. If there is less
* contention, then popular states just aggregate in the front of
* the queue and stay there.
*/
int is_semifree;
/* This keeps track of the size of the transition table for this
* state. There is a half-hearted attempt to support variable sized
* superstates.
*/
int trans_size;
/* Indexed by characters... */
struct rx_inx transitions[RX_TAIL_ARRAY];
};
/* A list of distinct futures define the edges that leave from a
* given superstate on a given character. c.f. rx_super_edge.
*/
struct rx_distinct_future
{
struct rx_distinct_future * next_same_super_edge[2];
struct rx_distinct_future * next_same_dest;
struct rx_distinct_future * prev_same_dest;
struct rx_superstate * present; /* source state */
struct rx_superstate * future; /* destination state */
struct rx_super_edge * edge;
/* The future_frame holds the instruction that should be executed
* after all the side effects are done, when it is time to complete
* the transition to the next state.
*
* Normally this is a next_char instruction, but it may be a
* cache_miss instruction as well, depending on whether or not
* the superstate is in the cache and semifree.
*
* If this is the only future for a given superstate/char, and
* if there are no side effects to be performed, this frame is
* not used (directly) at all. Instead, its contents are copied
* into the transition table of the starting state of this dist. future
* (a sort of goto elimination).
*/
struct rx_inx future_frame;
struct rx_inx side_effects_frame;
struct rx_se_list * effects;
};
#define rx_lock_superstate(R,S) ((S)->locks++)
#define rx_unlock_superstate(R,S) (--(S)->locks)
struct rx_cache;
#ifdef __STDC__
typedef void (*rx_morecore_fn)(struct rx_cache *);
#else
typedef void (*rx_morecore_fn)();
#endif
struct rx_cache
{
struct rx_hash_rules superset_hash_rules;
struct rx_superstate * lru_superstate;
struct rx_superstate * semifree_superstate;
struct rx_superset * empty_superset;
int superstates;
int semifree_superstates;
int hits;
int misses;
int bytes_allowed;
int bytes_used;
int local_cset_size;
void ** instruction_table;
struct rx_hash superset_table;
};
#ifndef RX_DEFAULT_DFA_CACHE_SIZE
/* This is an upper bound on the number of bytes that may (normally)
* be allocated for DFA states. If this threshold would be exceeded,
* Rx tries to flush some DFA states from the cache.
*
* This value is used whenever the rx_default_cache is used (for example,
* with the Posix entry points).
*/
#define RX_DEFAULT_DFA_CACHE_SIZE (1 << 19)
#endif
extern struct rx_cache * rx_default_cache;
#ifdef __STDC__
extern char * rx_cache_malloc (struct rx_cache * cache, int size);
extern void rx_cache_free (struct rx_cache * cache, int size, char * mem);
extern void rx_release_superset (struct rx *rx,
struct rx_superset *set);
extern struct rx_superset * rx_superset_cons (struct rx * rx,
struct rx_nfa_state *car, struct rx_superset *cdr);
extern struct rx_superset * rx_superstate_eclosure_union (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl) ;
extern struct rx_superstate * rx_superstate (struct rx *rx,
struct rx_superset *set);
extern struct rx_inx * rx_handle_cache_miss (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data) ;
#else /* STDC */
extern char * rx_cache_malloc ();
extern void rx_cache_free ();
extern void rx_release_superset ();
extern struct rx_superset * rx_superset_cons ();
extern struct rx_superset * rx_superstate_eclosure_union ();
extern struct rx_superstate * rx_superstate ();
extern struct rx_inx * rx_handle_cache_miss ();
#endif /* STDC */
#endif /* RXSUPERH */
|