File: jregex.c

package info (click to toggle)
lookup 1.08b-15
links: PTS
area: main
in suites: sid
size: 1,784 kB
sloc: ansic: 12,638; makefile: 247; perl: 174; sh: 53
file content (4132 lines) | stat: -rw-r--r-- 128,588 bytes
parent folder | download | duplicates (2)
/*
 * Jeffrey Friedl
 * Omron Corporation			ʳ
 * Nagaokakyoshi, Japan			617Ĺ
 *
 * jfriedl@nff.ncl.omron.co.jp
 *
 * This work is placed under the terms of the GNU General Purpose License
 * (the "GNU Copyleft").
 *
 *
 * Jeffrey's REGEX routines with Japanese EUC support.
 * October 1993.
 *
 * This file is huge, but don't let it intimidate you. Most of it is comments.
 *
 * See "jregex.h" for overall usage info.
 *
 * Note: the term "ASCII" is synonymous with "JIS/ROMAN" as far as
 * this file is concerned.
 *
 * * * * * * * * * * * * * * * * * * * * * * * * * *
 *
 * The general idea is that the function
 *	regcomp()	-- ``regular expression compile''
 * accepts a regular expression (and some flags to control how the expression
 * is to be interpreted), and it fills in a STRUCT REGEXBUF (see jregex.h)
 * with an internal representation (a "compiled" form) of said regex.
 *
 * The user then gives this STRUCT REGEXBUF along with a string to check
 * the pattern against to
 * 	regexec()	-- ``regular expression execute''
 * which returns true if the string matches the pattern, false otherwise.
 * As byproducts, regexec may fill in some global information concerning
 * which text was matched by which sets of parenthesis, etc.
 *
 * * * * * * * * * * * * * * * * * * * * * * * * * *
 *
 * A STRUCT REGEXBUF is the compiled (internal form of the) regex as
 * returned to the user.
 *
 * BUF is the array of bytes representing the original regex. This is called
 * the "compiled form" or the "regex program" (since it is "executed").
 * See the enum TYPE below for a description of how these bytes are
 * interpreted. BUFEND points to the first byte after the array.
 *
 * MIN_LENGTH_MATCH indicates the length of the shortest string which
 * could possibly match (and if 0, all strings match).
 *
 * ANCHOR_TO_START_OF_LINE is true if the pattern only matches at the
 * beginning of a line, i.e. the pattern begins with a '^'. In this case,
 * the command for that match in the compiled regex is omitted, as this
 * flag specifies it.
 *
 * FOLD_ACASE is true if regcomp() was told to fold ASCII case (i.e. ignore
 * differences in case when comparing letters, such that 'a' would match 'A').
 *
 * FOLD_KCASE is true if regcomp() was told to fold kana case (i.e. ignore
 * "kananess", such that֤would match ֥).
 *
 * MUSTHAVE is created if REGCOMP_CALC_MUSTHAVE is given in the flags to
 * regcomp, and is a list of characters that are required for all successful
 * matches. The list isn't 100% guaranteed to be complete (i.e. there are
 * cases where a required character might not appear, but all characters
 * in the list are certainly required). This can be used by the user to
 * pre-screen lines, omitting lines not having all characters. Note that
 * when case folding is done, only the lower-case version of a character
 * appears in the list.
 *
 * MAX_PAREN_LEVEL is the number of sets of parentheses found in the
 * pattern.... it would be 3 for the pattern (a(b?))+c(de)?.
 *
 * PAREN_INFO_REQUIRED indicates the number of the highest set which is
 * referenced from within the pattern... it would be zero for the
 * MAX_PAREN_LEVEL example, and 1 for something like (['"])(\w+)\1.
 *
 * MAX_LPC_L and MAX_LPC_C are used in regexec to decide how much stack
 * space (for the execution of the regex) might be needed. See the discussion
 * elsewhere about these values.
 */

#include "config.h"
#include "assert.h"
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#ifdef DEBUG_MALLOC
# include <malloc.h>
#endif
#include "jregex.h"
#include "output.h"
#include "euc.h"
#include <stdio.h>

/*
TO BE DONE:

if not looking for longest, can invert push/check for anychar.
(allow things that PUSH the entire line to do things backwards)

optimize by having a firstchar

support shift-JIS as well as EUC

can sort EUC characters in classes for quicker lookup... even binary for
long enough ones.

combine adjacent EXACT/ONCE nodes into EXACT(N)/ONCE node.

find out what standards say about these things and try to conform.

Add {n,m} construct.

Partition so that can be compiled in more minimal configurations if one
doesn't need much of the support.

Test EUC Code Set #3 support.
*/


/*
 NO_REGEXEC_MATCH_POINTS
 remove support for regexec_match_start and regexec_match_end.
 A (very very) small optimization if not needed.

 DONT_WORRY_ABOUT_KATAKANA_DASH_BEING_PART_OF_A_WORD
 The word_boundary stuff has to pay special attention to the kana
 dash character֡since it's not on the same EUC page as the kana.
 To make sure something like֥ȡis considered one word, we have
 to do some special processing. Defining this omits that.

 NO_PAREN_INFO
 Define to remove support for regexec_paren_info, et. al.
*/


/***********************************************************************
 ***********************************************************************/

/*
 * Defining FAST_REGEXEC will remove the ability to do regexec debugging.
 * This will save on doing the "are we debugging" tests.
 *
 * Defining NDEBUG will remove other debugging support as well. It will
 * still leave showregex() there for the user.
 *
 * One can define NO_SHOWREGEX independently to omit this routine if you
 * know you'll not use it. I should split this out into a separate file.
 */

/*
 * DEBUGSTUFF(stuff) expands to STUFF in a debugging build and to nothing
 * when NDEBUG is defined.  An NDEBUG build additionally turns FAST_REGEXEC
 * on, unless it has been defined already.
 */
#ifdef NDEBUG
# ifndef FAST_REGEXEC
#  define FAST_REGEXEC
# endif
# define DEBUGSTUFF(stuff)        /* nothing */
#else
# define DEBUGSTUFF(stuff)        stuff
#endif

/*
 * STATS(stuff) expands to STUFF when statistics gathering is compiled in,
 * and to nothing when NO_REGEX_STATS is defined.
 *
 * regex_reset_stats() zeroes every byte of the global regex_stats record.
 * Under NO_REGEX_STATS there is no such record and the function is still
 * defined, but does nothing.
 */
#ifdef NO_REGEX_STATS
# define STATS(stuff)             /* nothing */
  void regex_reset_stats(void) {
     /* nothing */
  }
#else
# define STATS(stuff)             stuff
  struct regex_stats regex_stats;
  void regex_reset_stats(void) {
      /* memset() is ISO C; bzero() is a legacy call dropped from POSIX. */
      memset(&regex_stats, 0, sizeof(regex_stats));
  }
#endif

/*
 * FASTDEBUGSTUFF(stuff) keeps STUFF unless FAST_REGEXEC is defined, in
 * which case it expands to nothing.
 */
#ifndef FAST_REGEXEC
# define FASTDEBUGSTUFF(stuff)    stuff
#else
# define FASTDEBUGSTUFF(stuff)    /* nothing */
#endif

/*
 * When not compiling with GNU C, make the __volatile__ and __inline__
 * spellings used in this file vanish (unless something has already
 * defined them as macros).
 */
#ifndef __GNUC__
#  ifndef __volatile__
#    define __volatile__ /*nothing; for use with volatile functions */
#  endif
#  ifndef __inline__
#    define __inline__ /*nothing; for use with inline functions */
#  endif
#endif

/*
 * Number of elements in ARRAY.  ARRAY must be a real array object (not a
 * pointer); the argument is parenthesized so that any array-valued
 * expression may be passed.
 */
#define sizeof_array(array)	(sizeof(array) / sizeof((array)[0]))




/*******************
 *******************	Some general routines.
 *******************/

/*
 * EUC_CHAR_LENGTH may yield zero when handed an invalid character.
 * Where we would rather treat such a byte as a one-byte (unknown)
 * character, use CHAR_LENGTH instead: it maps that zero to 1 and passes
 * every other length through.  Note that C may be evaluated twice.
 */
#define CHAR_LENGTH(C) (EUC_CHAR_LENGTH(C) == 0 ? 1 : EUC_CHAR_LENGTH(C))



/*
 * Walk the (possibly mixed-width) characters in the range
 * [string_start, string_end) looking for the one-byte character BYTE0.
 * Returns a pointer to its first occurrence, or zero if it is not there.
 */
static __inline__ const unsigned char *
onebyte_char_in_string(unsigned char byte0,
		       const unsigned char *string_start,
		       const unsigned char *string_end)
{
    const unsigned char *cursor;
    unsigned width;

    for (cursor = string_start; cursor < string_end; cursor += width)
    {
	width = EUC_CHAR_LENGTH(cursor[0]);
	if (width == 0)
	{
	    /* Unknown lead byte: flag it, then step over a single byte. */
	    assert(0);
	    width = 1;
	}
	if (width == 1 && cursor[0] == byte0)
	    return cursor;
    }
    return 0;
}

/*
 * Walk the (possibly mixed-width) characters in the range
 * [string_start, string_end) looking for the two-byte character
 * BYTE0 BYTE1.  Returns a pointer to its first occurrence, or zero if it
 * is not there.
 */
static __inline__ const unsigned char *
twobyte_char_in_string(unsigned char byte0,
		       unsigned char byte1,
		       const unsigned char *string_start,
		       const unsigned char *string_end)
{
    const unsigned char *cursor;
    const unsigned char *last_start;
    unsigned width;

    if (string_start == string_end)
	return 0;

    /* A two-byte character cannot begin on the final byte. */
    last_start = string_end - 1;

    for (cursor = string_start; cursor < last_start; cursor += width)
    {
	width = EUC_CHAR_LENGTH(cursor[0]);
	if (width == 0)
	{
	    /* Unknown lead byte: flag it, then step over a single byte. */
	    assert(0);
	    width = 1;
	}
	/* Second byte is tested first so that a random non-matching
	   pair is rejected as early as possible. */
	if (width == 2 && cursor[1] == byte1 && cursor[0] == byte0)
	    return cursor;
    }
    return 0;
}

/*
 * Walk the (possibly mixed-width) characters in the range
 * [string_start, string_end) looking for the three-byte character
 * BYTE0 BYTE1 BYTE2.  Returns a pointer to its first occurrence, or zero
 * if it is not there.
 */
static __inline__ const unsigned char *
threebyte_char_in_string(unsigned char byte0,
			 unsigned char byte1,
			 unsigned char byte2,
			 const unsigned char *string_start,
			 const unsigned char *string_end)
{
    /*
     * A range shorter than three bytes cannot hold the character.
     * Checking the length up front also keeps the adjustment below from
     * forming a pointer in front of string_start (which a one-byte range
     * would otherwise do).
     */
    if (string_end - string_start < 3)
	return 0;

    string_end -= 2; /* no need to start checking at the last two bytes */
    while (string_start < string_end)
    {
	unsigned len = EUC_CHAR_LENGTH(string_start[0]);
	if (len == 0)
	{
	    /* Unknown lead byte: flag it, then step over a single byte. */
	    assert(0);
	    len = 1;
	}
	/* the order we check the bytes is designed to fail ASAP for
	   the random non-matching triple */
	if (3 == len && byte2 == string_start[2] &&
	    byte1 == string_start[1] && byte0 == string_start[0])
	        return string_start;
	string_start += len;
    }
    return 0;
}



/*
 * A regex buffer is an array of bytes (unsigned chars). These bytes
 * can be partitioned into sets of bytes representing "commands", such
 * as "match `A' exactly", etc.  The first byte in any set indicates
 * what type of command it is, which thereby indicates the number and
 * interpretation of any following bytes in the set.
 *
 * The type byte (first byte in a set) has two "fields". The upper six
 * bits indicate the basic type... match an exact ascii character,
 * match the end of a word, etc.  The lower two bits indicate how many
 * times the thing is to be done (once, at least once, zero or one
 * times, or zero or more times). These 'count' bits only make sense
 * for some of the major types... doesn't make sense for "end of
 * word", so are ignored in such cases.
 */

#define    COUNT_BITS      2	/* Width of the count field, in bits.   */
#define    ZERO_OK         0x1  /* Set: matching zero times is allowed. */
#define    MORE_THAN_1_OK  0x2  /* Set: matching repeatedly is allowed. */

/*
 * The four possible values of the two-bit count field.
 */
#define    ONCE            0
#define    ZERO_OR_ONE     ZERO_OK
#define    ONE_OR_MORE     MORE_THAN_1_OK
#define    ANY_NUMBER      (MORE_THAN_1_OK | ZERO_OK)

#define    NO_COUNT	   0 /* readability only, for types that ignore it */

/*
 * Build a type byte from a major type number and a count, and take one
 * apart again: the type sits above the COUNT_BITS low-order count bits.
 */
#define make_type_byte_with_count(type, count)  ((count) | ((type) << COUNT_BITS))
#define make_type_byte(type)     make_type_byte_with_count(type, NO_COUNT)
#define get_type_from_type_byte(type)   ((type) >> COUNT_BITS)
#define get_count_from_type_byte(type)  ((type) & ((1 << COUNT_BITS) - 1))

/*
 * Possible "command" types (upper 6 bits of first byte of each set).
 *
 * Each entry below describes what the command matches or does, and how
 * the command is laid out in the final compiled pattern.  Entries listed
 * before count_ok_limit honour the 'count' bits of the type byte; for
 * entries after it the count bits are ignored.
 */
enum TYPE
{
    UNUSED = 0,
        /* just to mark a do-nothing node.... not used in the final pattern */

    EXACT1,
        /* Match the exact byte which follows in the compiled pattern.
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	EXACT1
	 *	  byte n+1:	byte to match
	 */

    EXACT2,
        /* Match the exact two bytes which follow in the compiled pattern.
	 *
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	EXACT2
	 *	  byte n+1:	first byte to match
	 *	  byte n+2:	second byte to match
	 */
    EITHER_EXACT_2,
        /* Match the exact two bytes which follow in the compiled pattern,
	 * or the two bytes that follow that.
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	EITHER_EXACT_2
	 *	  byte n+1:	first byte of first possibility
	 *	  byte n+2:	second byte of first possibility
	 *	  byte n+3:	first byte of second possibility
	 *	  byte n+4:	second byte of second possibility
	 */

    EXACT3,
        /* Match the exact three bytes which follow in the compiled pattern.
	 *
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	EXACT3
	 *	  byte n+1:	first byte to match
	 *	  byte n+2:	second byte to match
	 *	  byte n+3:	third byte to match
	 */

    EXACT_ICASE_ASCII,
        /* Like EXACT1 except letter case is to be ignored. The byte to
	 * match is guaranteed to be a lower case letter.
	 *
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	EXACT_ICASE_ASCII
	 *	  byte n+1:	lower case one-byte character to match.
	 */

    EXACT_ICASE_KANA,
        /* Like EXACT2 except hiragana vs. katakana is ignored. The two-byte
	 * EUC character to be matched is guaranteed to be a hiragana.
	 *
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	EXACT_ICASE_KANA
	 *	  byte n+1:	first byte of EUC hiragana
	 *	  byte n+2:	second byte of EUC hiragana
	 */

    ANY_CHAR,
        /* Any character (ascii or EUC) except newline will match.
	 * No following bytes. Appearance in the final compiled pattern:
	 *	  byte n:     	ANY_CHAR
	 */

    ANY_ASCII,
        /* Any ASCII character except newline will match.
	 * No following bytes. Appearance in the final compiled pattern:
	 *	  byte n:     	ANY_ASCII
	 */

    ANY_MULTIBYTE,
        /* Any multibyte EUC character will match.
	 * No following bytes. Appearance in the final compiled pattern:
	 *	  byte n:     	ANY_MULTIBYTE
	 */

    ANY_KATAKANA,
        /* Any multibyte EUC katakana.
	 * No following bytes. Appearance in the final compiled pattern:
	 *	  byte n:     	ANY_KATAKANA
	 */

    ANY_NONKATAKANA,
        /* Anything not multibyte EUC katakana or ASCII newline will match.
	 * No following bytes. Appearance in the final compiled pattern:
	 *	  byte n:     	ANY_NONKATAKANA
	 */

    ANY_HIRAGANA,
        /* Any multibyte EUC hiragana
	 * No following bytes. Appearance in the final compiled pattern:
	 *	  byte n:     	ANY_HIRAGANA
	 */

    ANY_NONHIRAGANA,
        /* Anything not multibyte EUC hiragana or ASCII newline will match
	 * No following bytes. Appearance in the final compiled pattern:
	 *	  byte n:     	ANY_NONHIRAGANA
	 */

    ANY_KANJI,
        /* Any multibyte character in kuten rows 16-84.
	 * No following bytes. Appearance in the final compiled pattern:
	 *	  byte n:     	ANY_KANJI
	 */

    ANY_NONKANJI,
        /* Any character not in kuten rows 16-84 (except ASCII newline)
	 * No following bytes. Appearance in the final compiled pattern:
	 *	  byte n:     	ANY_NONKANJI
	 */

    CLASS,
	/*
	 * Appearance in the final compiled pattern:
	 *  byte n:           CLASS
	 *  byte n+1:         One-byte boolean value ``inverted''
	 *  byte n+2,
	 *  byte n+3:         Unsigned two-byte value ``size2''
	 *  byte n+4,
	 *  byte n+5:         Unsigned two-byte value ``size3''
	 *  next 128 bytes:   Array[0..127] of boolean values
	 *  next size2 bytes: List of two-byte EUC characters
	 *  next size3 bytes: List of three-byte EUC characters
	 *
	 * Represents a class (such as ``...[0-9a-f]...'').
	 *
	 * For ASCII, C is in the class if array[C] is true.
	 * EUC characters are in the class if they're found in the lists that
	 * follow 'array'.
	 *
	 * When 'inverted' is false, characters in the class match.
	 * When 'inverted' is true, characters NOT in the class match.
	 */

    count_ok_limit,
	/* This is just an enum marker... the 'count' bits *are*    */
	/* interpreted for type bytes with major types in the       */
	/* list above this member; ignored for those below.         */

    REGEX_MATCH,
        /* If this is reached, the regex matches. No following bytes.
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	REGEX_MATCH
	 */
#ifndef NO_REGEXEC_MATCH_POINTS
    REGEX_MATCH_NOTE_WORD,
        /* Exactly like REGEX_MATCH except that
	 *        regexec_match_at_start_of_word
	 *  and
	 *        regexec_match_at_end_of_word
	 *  will be set appropriately.
	 *
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	REGEX_MATCH_NOTE_WORD
	 */
#endif

    WORD_BOUNDARY,
	/* Matches the boundary between words. No following bytes.
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	WORD_BOUNDARY
	 */

    START_OF_LINE,
	/* Matches the start of a line. No following bytes.
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	START_OF_LINE
	 */

    END_OF_LINE,
	/* Matches the end of a line. No following bytes.
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	END_OF_LINE
	 */

    JUMP,
	/* Appearance in the final compiled pattern:
	 *	  byte n:     	JUMP
	 *	  byte n+1,
	 *	  byte n+2:	signed two-byte value ``jump offset''
	 *
	 * When this command executes, the "current location" in the compiled
	 * pattern buffer moves from 'n' to 'n + offset' and execution
	 * continues from there.
	 */
    PUSH,
	/* Appearance in the final compiled pattern:
	 *	  byte n:     	PUSH
	 *	  byte n+1,
	 *	  byte n+2:	signed two-byte value ``push offset''
	 *
	 * A PUSH indicates an alternative way to lead to a successful match.
	 *
	 * When reached, a PUSH indicates that although the subsequent commands
	 * in the compiled pattern may be used to nibble away at the line being
	 * matched, a proper match may *also* be achieved by continuing with
	 * the matching procedure at the pattern location n+offset.
	 *
	 * For example, the pattern 'ab?c' might result in a compiled
	 * pattern looking something like:
	 *	#1: EXACT1  `a'
	 *	#2: PUSH (refer to command #4)
	 *	#3: EXACT1  `b'
	 *	#4: EXACT1  `c'
	 *
	 * Consider trying to match the string "ac" (which will match).
	 * The 'a' will match with command #1; execution will then go
	 * to command #2 with the string-to-be-matched being the "c" left
	 * from the "ac" after the 'a' was nibbled off.
	 *
	 * The PUSH will, in effect, say: ``we'll try to continue with what
	 * follows, but if we end up failing sometime down the line, we can
	 * always try to continue to match the string as we have it at the
	 * moment ("c"), starting at command #4.''. This is done by pushing a
	 * #4/"c" combo onto a stack.
	 *
	 * The execution will then continue to #3, failing to match 'b'
	 * against "c". But rather than failing the whole string, it is
	 * noticed that there is a "try me" on the stack, and that state
	 * is popped off. Execution continues at #4 with "c".
	 *
	 * The EXACT1 command succeeds. The end of the pattern is reached
	 * so the whole regex succeeds.
	 */

    PUSH_JUMP,
	/* Just an optimization.
	 * Like a JUMP, except does an implicit PUSH of the following command.
	 *
	 * The following two are the same:
	 *
	 * 	PUSH (refer to `mark')
	 * 	JUMP somewhere
	 *      mark:
	 *
	 * -and-
	 *
	 * 	PUSH_JUMP somewhere
	 * 	mark:
	 *
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	PUSH_JUMP
	 *	  byte n+1,
	 *	  byte n+2:	signed two-byte value ``jump offset''
	 */


#ifndef NO_PAREN_INFO
    SAVE_OPEN_PAREN,
    SAVE_CLOSE_PAREN,
	/*
	 * If we've been asked to save paren info for this pattern, this will
	 * indicate that we need to note that we're entering or exiting a set
	 * of parens. Followed by one byte indicating the level of the parens.
	 *
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	SAVE_{OPEN,CLOSE}_PAREN
	 *	  byte n+1:	unsigned one-byte value ``paren level''
	 */

    SAVE_CLOSE_PAREN_PUSH_JUMP,
	/* Combo of SAVE_CLOSE_PAREN and PUSH_JUMP.
	 *
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	SAVE_CLOSE_PAREN_PUSH_JUMP
	 *	  byte n+1:	unsigned one-byte value ``save count''
	 *	  byte n+2:	unsigned one-byte value ``paren level''
	 *	  byte n+3,
	 *	  byte n+4:	signed two-byte value ``jump offset''
	 */

    PUSH_SAVE_OPEN_PAREN,
	/*
	 * Simply a PUSH followed by a SAVE_OPEN_PAREN.  First two
	 * bytes after is the PUSH value, third byte after is the
	 * paren level.
	 *
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	PUSH_SAVE_OPEN_PAREN
	 *	  byte n+1,
	 *	  byte n+2:	two-byte PUSH value (``push offset'')
	 *	  byte n+3:	one-byte value ``paren level''
	 */

    MATCH_PREV_PAREN,
	/*
	 * Next byte is paren number to match exactly.
	 *
	 * Appearance in the final compiled pattern:
	 *	  byte n:     	MATCH_PREV_PAREN
	 *	  byte n+1:	paren number to match
	 */
#endif

    OPEN_PAREN,
	/* NOT USED in the compiled pattern, but is used in the
	 * intermediate stage. See that description.
	 * Appearance in the final compiled pattern:
	 *	  (none -- intermediate stage only)
	 */

    ALT
	/* NOT USED in the compiled pattern, but is used in the
	 * intermediate stage. See that description.
	 * Appearance in the final compiled pattern:
	 *	  (none -- intermediate stage only)
	 */
};




/*
 * At various times we want to read and write short values that will
 * be unaligned. We have to emulate it ourselves if our processor
 * can't handle it. Define UNALIGNED_SHORT_ACCESS_OK if you know it's OK.
 *
 * The emulated form stores the value high byte first.  The bytes are
 * split out with unsigned arithmetic, because right-shifting a negative
 * signed value is implementation-defined.  Both VAL and PTR may be
 * evaluated more than once, so pass expressions without side effects.
 *
 * NOTE(review): the UNALIGNED_SHORT_ACCESS_OK form stores the value in
 * the host's native short layout instead; the two forms agree only with
 * themselves, which is all the in-memory regex buffer appears to need.
 */
#ifdef UNALIGNED_SHORT_ACCESS_OK
#  define write_short_val(ptr, val) (*(short *)(ptr) = (val))
#  define  read_short_val(ptr)      (*(short *)(ptr))
#else
# define write_short_val(ptr, val)                                           \
  macro_start {                                                              \
    ((unsigned char *)(ptr))[0] =                                            \
        (unsigned char)((((unsigned)(short)(val)) >> 8) & 0xff);             \
    ((unsigned char *)(ptr))[1] =                                            \
        (unsigned char)(((unsigned)(short)(val)) & 0xff);                    \
  } macro_end

# define read_short_val(ptr)                               \
     (short)((((const unsigned char *)(ptr))[0] << 8 |     \
              ((const unsigned char *)(ptr))[1]))
#endif /* UNALIGNED_SHORT_ACCESS_OK */



/*
 * The following #defines are to make reading some of the code a bit
 * easier (there are lots of little magic numbers floating around),
 * and are not meant to be changeable.  All are byte counts.
 */
#define SIZEOF_ASCII	  1		/* one byte */
#define SIZEOF_EUC        2		/* EUC is two bytes */

#define TYPE_BYTE_SIZE	  1            /* The type-byte takes up one byte. */
#define SHORT_VAL_SIZE	  2            /* A short val takes up two bytes.  */
#define ASCII_SPEC_SIZE  SIZEOF_ASCII  /* An ASCII spec in a regex buffer. */
#define EUC_SPEC_SIZE	  SIZEOF_EUC   /* An EUC spec in a regex buffer.   */

/*
 * Fixed part of a CLASS command: type byte, ``inverted'' byte, the two
 * two-byte list sizes, and the 128-entry ASCII array.
 */
#define DETERMINISTIC_CLASS_SIZE (1+1+2+2+128)

#define SIZEOF_PUSH_COMMAND     (TYPE_BYTE_SIZE + /*offset*/SHORT_VAL_SIZE)

#define SIZEOF_JUMP_COMMAND     (TYPE_BYTE_SIZE + /*offset*/SHORT_VAL_SIZE)

/*
 * PUSH_JUMP exists whether or not paren info is compiled in, and its
 * size is needed by EXTRA_FOR_PAREN_PLUS in NO_PAREN_INFO builds, so it
 * is defined unconditionally.
 */
#define SIZEOF_PUSH_JUMP_COMMAND (TYPE_BYTE_SIZE + /*offset*/SHORT_VAL_SIZE)

#ifndef NO_PAREN_INFO
# define PAREN_SPEC_SIZE   1   /* paren level val held in one byte */
# define PAREN_COUNT_SIZE  1   /* paren level count held in one byte */

# define SIZEOF_SAVE_OPEN_PAREN_COMMAND  (TYPE_BYTE_SIZE + PAREN_SPEC_SIZE)

# define SIZEOF_SAVE_CLOSE_PAREN_COMMAND (TYPE_BYTE_SIZE + PAREN_SPEC_SIZE)

# define SIZEOF_SAVE_CLOSE_PAREN_PUSH_JUMP  (                                \
        TYPE_BYTE_SIZE +                                                     \
        /* count byte */ PAREN_COUNT_SIZE +                                  \
        /* start byte */ PAREN_SPEC_SIZE +                                   \
        /* offset     */ SHORT_VAL_SIZE )

# define SIZEOF_PUSH_SAVE_OPEN_PAREN_COMMAND (                               \
        TYPE_BYTE_SIZE +                                                     \
        /* offset     */ SHORT_VAL_SIZE +                                    \
        /* paren val  */ PAREN_SPEC_SIZE)
#endif

/*
 * When a '(...)' pattern is compiled, there is a need for extra codes to
 * facilitate the saving of paren info (if compiled in and used) and/or
 * when the paren has '*', '+', or '?' tacked on.
 *
 * Each EXTRA_FOR_... value below is the number of bytes those extra
 * codes occupy, on top of the code for what is inside the parens.
 */

/*
 * In the case of raw parens (w/o '*', '+', or '?'), we need no extra
 * space if we're not saving paren info. If we are, we'd need to wrap
 * the parenthesized regex with SAVE_OPEN_PAREN and SAVE_CLOSE_PAREN.
 */
#ifndef NO_PAREN_INFO
# define EXTRA_FOR_RAW_PAREN   (SIZEOF_SAVE_OPEN_PAREN_COMMAND +             \
				SIZEOF_SAVE_CLOSE_PAREN_COMMAND)
#else
# define EXTRA_FOR_RAW_PAREN    0
#endif

/*
 * For the case of '(stuff)?' the pattern would look like
 *
 *      PUSH (refer to "mark:")
 *      <code for STUFF>
 *    mark:
 *
 * if not saving paren info, and if so:
 *
 *      PUSH (refer to "mark:")
 *	SAVE_OPEN_PAREN
 *      <code for STUFF>
 *	SAVE_CLOSE_PAREN
 *    mark:
 *
 * which is optimized to
 *
 *      PUSH_SAVE_OPEN_PAREN (refer to "mark:")
 *      <code for STUFF>
 *	SAVE_CLOSE_PAREN
 *    mark:
 *
 * NOTE(review): the NO_PAREN_INFO value below is spelled with
 * SIZEOF_JUMP_COMMAND although the extra command described above is a
 * PUSH; the two sizes are defined identically, so the value is the same.
 */
#ifndef NO_PAREN_INFO
# define EXTRA_FOR_PAREN_QUESTION 	(SIZEOF_PUSH_SAVE_OPEN_PAREN_COMMAND +\
					 SIZEOF_SAVE_CLOSE_PAREN_COMMAND)
#else
# define EXTRA_FOR_PAREN_QUESTION 	SIZEOF_JUMP_COMMAND
#endif

/*
 * For the case of '(stuff)+' the pattern will look like
 *
 *    mark:
 *      <code for STUFF>
 *      PUSH (refer to "mark2:")
 *      JUMP to "mark:"
 *    mark2:
 *
 * which is optimized to
 *      mark:
 *      <code for STUFF>
 *      PUSH_JUMP to "mark:"
 *
 * If we're saving paren info, this becomes
 *      mark:
 *	SAVE_OPEN_PAREN #
 *      <code for STUFF>
 *	SAVE_CLOSE_PAREN #
 *	<special push of paren info>
 *      PUSH_JUMP to "mark:"
 *
 * which is optimized to:
 *      mark:
 *	SAVE_OPEN_PAREN #
 *      <code for STUFF>
 *      SAVE_CLOSE_PAREN_PUSH_JUMP to "mark:"
 *
 * NOTE(review): the NO_PAREN_INFO value below relies on
 * SIZEOF_PUSH_JUMP_COMMAND being defined in NO_PAREN_INFO builds too.
 */
#ifndef NO_PAREN_INFO
# define EXTRA_FOR_PAREN_PLUS 		(SIZEOF_SAVE_CLOSE_PAREN_PUSH_JUMP + \
					 SIZEOF_SAVE_OPEN_PAREN_COMMAND)
#else
# define EXTRA_FOR_PAREN_PLUS 		(SIZEOF_PUSH_JUMP_COMMAND)
#endif

/*
 * For the case of '(stuff)*' the pattern will look like
 *
 *    mark:
 *      PUSH (refer to "mark2:")
 *      <code for STUFF>
 *      JUMP to "mark:"
 *    mark2:
 *
 * if not saving parens, and if so:
 *
 *      PUSH (refer to "mark2:")
 *    mark:
 *	SAVE_OPEN_PAREN
 *      <code for STUFF>
 *	SAVE_CLOSE_PAREN
 *	PUSH (refer to mark2:) with paren info
 *	JUMP to "mark:"
 *    mark2:
 *
 * which can be optimized down to
 *
 *      PUSH (refer to "mark2:")
 *    mark:
 *	SAVE_OPEN_PAREN
 *      <code for STUFF>
 *	SAVE_CLOSE_PAREN_PUSH_JUMP (push refers to mark2, jump refers to mark)
 *    mark2:
 *
 */
#ifndef NO_PAREN_INFO
# define EXTRA_FOR_PAREN_STAR	(SIZEOF_PUSH_COMMAND +                       \
				 SIZEOF_SAVE_OPEN_PAREN_COMMAND +            \
				 SIZEOF_SAVE_CLOSE_PAREN_PUSH_JUMP)
#else
# define EXTRA_FOR_PAREN_STAR	(SIZEOF_JUMP_COMMAND + SIZEOF_PUSH_COMMAND)
#endif

/*
 * Something like 'a|b|c' gets compiled into something like
 *
 *    PUSH (refer to "alt-1:")
 *    <code for a>
 *    JUMP to "donewithalt:"
 *  alt-1:
 *    PUSH (refer to "alt-2:")
 *    <code for b>
 *    JUMP to "donewithalt:"
 *  alt-2:
 *    <code for c>
 *  donewithalt:
 *
 * So each but the last alternative needs an extra PUSH and JUMP:
 */
#define EXTRA_FOR_EACH_BRACE	(SIZEOF_JUMP_COMMAND + SIZEOF_PUSH_COMMAND)


/*
 * During a compile (regcomp), we need some memory at times that we'll
 * want to free when the compile is done. TEMP_MEMORY points to a linked
 * list of memory to be freed.
 *
 * It is the head of that list: regex_malloc() pushes each new block on
 * the front, using a hidden pointer at the start of the block as the
 * link, and regex_free_temp_memory() pops and frees blocks until the
 * list is empty (zero).
 */
static char **temp_memory = 0;

/*
 * If the user sets this to a function, it will be called upon a memory
 * error. It shouldn't return.
 *
 * xmalloc() invokes it when malloc() fails; should it return anyway,
 * xmalloc() goes on to call die().
 */
void (*regex_memory_error)(void) = 0;

/*
 * malloc() wrapper used by the regex package.
 *
 * Returns SIZE bytes of fresh memory.  On allocation failure the user's
 * regex_memory_error hook (if set) is called first, and then die() is
 * called with an out-of-memory message.
 *
 * A zero-byte request is rounded up to one byte: malloc(0) is allowed to
 * return a null pointer, which must not be mistaken for running out of
 * memory.
 */
static void *xmalloc(unsigned size)
{
    /* No cast: malloc() is declared by <stdlib.h> and returns void *. */
    void *mem = malloc(size ? size : 1);
    if (mem == 0) {
	if (regex_memory_error)
	    (*regex_memory_error)();
	die("[regex package out of memory]\n");
    }
    return mem;
}

/*
 * Hand out SIZE bytes of temporary memory.  Every block obtained here is
 * threaded onto the temp_memory list through a hidden pointer-sized
 * header, and stays allocated until regex_free_temp_memory is called.
 */
static void *
regex_malloc(unsigned size)
{
    /* Ask for the caller's bytes plus room for the hidden link. */
    char **block = xmalloc(size + sizeof(block));

    /* Push the block on the front of the chain: its header takes the old
       head, and the block becomes the new head. */
    *(char ***)block = temp_memory;
    temp_memory = block;

    /* The caller's memory starts right after the header. */
    return block + 1;
}


/*
 * Release every block handed out by regex_malloc, leaving the
 * temp_memory list empty.
 */
static void
regex_free_temp_memory(void)
{
    for (;;)
    {
	char *doomed = (char *)temp_memory;

	if (doomed == 0)
	    break;

	/* Unhook the head block before it is given back. */
	temp_memory = (char **)*temp_memory;
	free(doomed);
    }
}

/*
 * regcomp_flags: set at the start of regcomp() to the user-defined flags
 * so that all functions called from within regcomp() can have access as
 * well.
 */
static unsigned regcomp_flags;


/*
 * part_of_word: an array (subscripted by an 'unsigned char' value) to
 * indicate if the character (or any EUC whose high byte is the subscript)
 * should be considered part of a word.
 *
 * Filled in by regex_init() with NOT_JREGEX_PART_OF_WORD,
 * JREGEX_PART_OF_WORD, or PART_OF_JAPANESE_WORD.  Unlike the other tables
 * here it is not static, so it is visible outside this file.
 */
unsigned char jregex_part_of_word[256];

/*
 * When doing a case-insensitive match, case_translation[c]
 * (when c is an unsigned char) will return the same 'c' except for
 * when it's an upper case letter, in which case it will return the
 * lower-case counterpart.
 *
 * regex_init() also maps the EUC_KATAKANA byte to the EUC_HIRAGANA byte.
 */
static unsigned char case_translation[256];

/* Byte values of interest in EUC text (hexadecimal). */
#define KATA_DASH_HI_BYTE	0xA1 /* byte#1 byte for EUC dash */
#define KATA_DASH_LO_BYTE	0xBC /* byte#2 byte for EUC dash */
#define ROMAN_HI_BYTE     	0xA3 /* high byte for alphabet EUC */
#define EUC_KATAKANA		0xA5
#define EUC_HIRAGANA		0xA4

#define LEAST_KANJI_HI_BYTE	0xB0
#define GREATEST_KANJI_HI_BYTE  0xF4

/* Values stored in jregex_part_of_word[]. */
#define NOT_JREGEX_PART_OF_WORD	0
#define JREGEX_PART_OF_WORD	1
#define PART_OF_JAPANESE_WORD	2

/*
 * Fill in the jregex_part_of_word and case_translation tables.  The work
 * is done on the first call only; later calls return immediately.
 */
static __inline__ void
regex_init(void)
{
    static int tables_ready = 0;
    int c;

    if (tables_ready)
	return;

    /*
     * Classify every possible byte value:
     *   - ASCII letters, digits and underscore, plus the high byte of
     *     the EUC Roman letters, are parts of (ordinary) words;
     *   - the high bytes of hiragana, katakana, the kana dash and the
     *     kanji rows are parts of their own kind of (Japanese) words;
     *   - everything else is not part of a word.
     */
    for (c = 0; c < sizeof_array(jregex_part_of_word); c++)
    {
	unsigned char kind = NOT_JREGEX_PART_OF_WORD;

	if ((c >= 'a' && c <= 'z') ||
	    (c >= 'A' && c <= 'Z') ||
	    (c >= '0' && c <= '9') ||
	    c == '_' ||
	    c == ROMAN_HI_BYTE)
	{
	    kind = JREGEX_PART_OF_WORD;
	}
	else if (c == EUC_HIRAGANA ||
		 c == EUC_KATAKANA ||
		 c == KATA_DASH_HI_BYTE ||
		 (c >= LEAST_KANJI_HI_BYTE && c <= GREATEST_KANJI_HI_BYTE))
	{
	    kind = PART_OF_JAPANESE_WORD;
	}
	jregex_part_of_word[c] = kind;
    }

    /*
     * case_translation[c] is c itself, except that upper case ASCII
     * letters map to their lower case counterparts ...
     */
    for (c = 0; c < sizeof_array(case_translation); c++)
    {
	if (c >= 'A' && c <= 'Z')
	    case_translation[c] = 'a' + (c - 'A');
	else
	    case_translation[c] = c;
    }
    /* ... and the katakana high byte maps to the hiragana one. */
    case_translation[EUC_KATAKANA] = EUC_HIRAGANA;

    tables_ready = 1; /* never redo the work */
}


/*
 * A byte string that grows as characters are appended to it with
 * add_char_to_string().
 */
struct dynamic_string
{
    unsigned char *buf;        /* storage for the string's bytes */
    unsigned short buf_used;   /* bytes of BUF in use; next write goes here */
    unsigned short buf_length; /* bytes of storage BUF currently has */
};
/* Number of bytes BUF grows by each time it runs out of room. */
#define STRING_BUFFER_INCREMENT	32

/*
 * Append the (one-, two- or three-byte) character at CHAR_PTR to DSTR,
 * enlarging DSTR's buffer by STRING_BUFFER_INCREMENT bytes when it has
 * no room left.
 *
 * The enlarged buffer comes from regex_malloc(); the old one is not
 * freed here, since it sits on the temporary-memory list like the new
 * one does.
 */
static __inline__ void
add_char_to_string(const unsigned char *char_ptr, struct dynamic_string *dstr)
{
    unsigned char_len = EUC_CHAR_LENGTH(char_ptr[0]);

    /*
     * A length of zero (invalid lead byte) still gets its first byte
     * stored below, so count that byte when checking for room; otherwise
     * a buffer that is exactly full would be overrun by one byte.
     */
    if (char_len == 0)
	char_len = 1;

    if (dstr->buf_used + char_len > dstr->buf_length)
    {
	unsigned char *old = dstr->buf;

	/* buf_length is an unsigned short: it must not wrap around. */
	assert((unsigned short)(dstr->buf_length + STRING_BUFFER_INCREMENT)
	       > dstr->buf_length);

	dstr->buf_length += STRING_BUFFER_INCREMENT;
	dstr->buf = regex_malloc(dstr->buf_length);

	/* Nothing to carry over from a string that is still empty (its
	   old buffer pointer may not even be valid yet). */
	if (dstr->buf_used != 0)
	    memcpy(dstr->buf, old, dstr->buf_used);
    }
    dstr->buf[dstr->buf_used++] = char_ptr[0];
    if (char_len > 1)
    {
	dstr->buf[dstr->buf_used++] = char_ptr[1];
	if (char_len > 2)
	    dstr->buf[dstr->buf_used++] = char_ptr[2];
    }
}

/*
 * Before writing the compiled regex buffer, we read the pattern and
 * create an intermediate representation, which is a tree of
 * the following structures.
 */
struct intermediate
{
    enum TYPE type;      	/* type of this node */
    unsigned char count; 	/* ONCE, ZERO_OR_ONE, etc... */

    /* What must match after this node matches. For example, in the
       pattern 'abc', there will be a node EXACT1("a") whose NEXT
       will point to an EXACT1("b") node whose NEXT will ..... */
    struct intermediate *next;

    /* ALT and OPEN_PAREN nodes are parent nodes. The children and their
       siblings (via NEXT) all have pointers back to the PARENT. */
    const struct intermediate *parent;

    /* The children of an ALT (various alternatives, as from 'a|b|c')
       have siblings (other alternatives) which use this pointer. */
    struct intermediate *nextalt;

    /* Maximum number of bytes this node and all subsequent children and
       siblings will need in the final compiled regex. */
    unsigned regex_bytes_needed;

    /* The minimum number of bytes that actually must be matched before
       a complete match can be achieved. For example, it would be 3, 2, and 1
       for the 'a', 'b', and 'c' nodes of the pattern "abc". This can be
       used for a regexec optimization... if there aren't that many bytes
       left in the string to match, we know it's a failure. */
    unsigned min_match_len;

    /* false for things that don't have to match any characters in a
       string, such as anything with '*' or '?' tacked onto the end. */
    int must_match;

    /* Depending upon what the TYPE is, one of the following might be valid */
    union {
	/* For ALT types. Points to the first node in a list (where the
	   list is then traversed via NEXTALT) of alternatives. */
	struct intermediate *alt;

	/*
	 * PAREN points to the first node of a list (then traversed via
	 * NEXT) of what's enclosed in one level of parentheses, the level
	 * number being in LEVEL.
	 */
	struct
	{
	    struct intermediate *paren;
            #ifndef NO_PAREN_INFO
	      unsigned char level;
	      unsigned char max_internal_level;
	      char real; /* false for /(?....)/ uses */
            #endif
	} paren_info;

	/* The character(s) to match for the EXACT* types */
	unsigned char exact[4];

	/* For classes, a bit more info... */
	struct class {
	    char inverted;	/* true if an inverted [^....] class */
	    unsigned char ascii[128]; /* ascii[c] true if c in class */
	    struct dynamic_string euc2; /* two-byte characters in the class */
	    struct dynamic_string euc3; /* three-byte characters in the class */
	} *class;

	#ifndef NO_PAREN_INFO
	/* For MATCH_PREV_PAREN nodes: zero-based number of the paren set
	   whose matched text must be matched again (\1 is stored as 0). */
	unsigned prev_paren_to_match;
	#endif
    } u;
};


#ifndef NO_PAREN_INFO
/*
 * Level of next set of parens in pattern as we read.
 */
static unsigned paren_level;

/*
 * One more than the level of the capturing paren set most recently
 * closed while reading the pattern (updated in nibble_list()).
 */
static unsigned paren_levels_finished;

/*
 * Largest zero-based paren number named by a \digit back-reference
 * seen so far in the pattern (updated in nibble_from_pattern()).
 */
static signed int highest_prev_to_match;

/*
 * Whenever, during compiled regex execution, we enter a new set of parens, we
 * have to save some info on an internal stack. With a pattern such as
 * "a(.)c" we'll only ever have one "instance" of a possible value for the
 * set of parens. However, with a pattern such as "a(.)*c" we'll have to
 * keep multiple instances. If the string to match is "ac1c2x", what will
 * finally be matched is "ac1c" and \1 would become "1". However, we don't
 * find that out until we've gotten to the end of the string and realized
 * that there wasn't another `c' to be had. At that point, the stack of
 * "instances" would have been that \1 might have ended up being "x", "2",
 * "c", "1", or "c".
 *
 * The upshot of this is that with a pattern such as this, you might need as
 * much as one stack element per character in the checked string. But with a
 * pattern such as "a((((.))))*c" we might need up to four stack elements
 * per character.
 *
 * Since it's faster to allocate the max possible space at the beginning of
 * the execution rather than continually waste time during the execution by
 * trying to dynamically keep the stack large enough, we want a method of
 * knowing the maximum number of levels possible per character (Levels Per
 * Character -- lpc).  This floating point value is represented in the
 * compiled regex as two integers L (levels) and C (characters) and is
 * re-computed as L/C.
 *
 * If 'len' is the length of a line to check against the regex,
 *	roundup(len / C) * L
 * is the maximum number of levels possibly required at any point during
 * subsequent execution of the compiled regex pattern.
 */
static double max_lpc;
static unsigned max_lpc_l, max_lpc_c;
#endif

/*
 * Which of REGEX_MATCH or REGEX_MATCH_NOTE_WORD we'll use for the
 * regex buffer we'll be creating.
 */
static enum TYPE this_regex_match;

/*
 * Will be set to an appropriate error code within the routines that make
 * up the compile code, to be returned by regcomp once control gets back
 * up there.
 */
unsigned int regcomp_error = 0;
/* Location in the pattern that goes with regcomp_error (set by ERROR_). */
const unsigned char *regcomp_eptr;
const unsigned char *regcomp_last_pattern;
const unsigned char *regcomp_error_report(void); /* forward */


/*
 * ERROR_(VAL, LOC, RETVAL): record error code VAL and pattern location LOC
 * in regcomp_error / regcomp_eptr, then return RETVAL from the function
 * the macro is used in.
 * NOTE(review): macro_start/macro_end are defined elsewhere -- presumably
 * the usual do { ... } while (0) wrapper; confirm.
 */
#define ERROR_(VAL, LOC, RETVAL)                                             \
macro_start {                                                                \
    regcomp_error = (VAL);                                                   \
    regcomp_eptr = (LOC);                                                    \
    return RETVAL;                                                           \
} macro_end

/* ERROR(VAL, LOC): as ERROR_, for functions that signal failure with 0. */
#define ERROR(VAL, LOC)  ERROR_((VAL), (LOC), 0)

/*
 * Allocate a new intermediate node structure and hand it back
 * with every byte cleared.
 */
static struct intermediate *
new_comp_struct(void)
{
    struct intermediate *node;

    node = regex_malloc(sizeof *node);
    bzero(node, sizeof *node);
    return node;
}

/*
 * Read one character from the pattern at *PTR_P, store its bytes in OUT,
 * and advance *PTR_P past everything that was consumed.
 *
 * A leading backslash is handled here:
 *   - backslash + two or three octal digits yields that byte value
 *     (an error if it exceeds 0xff);
 *   - \t \n \r \f yield tab, newline, carriage return and form feed;
 *   - backslash + anything else yields that following character itself.
 *
 * OUT must have room for three bytes.
 *
 * Returns the number of bytes stored in OUT (1, 2, or 3). On error the
 * code and location are recorded via ERROR() and 0 is returned.
 */
static int
next_quoted_character(const unsigned char **ptr_p,
		      unsigned char *out)
{
    int len;
    /* isodigit() and odigit_val() are left defined after this function:
       isodigit() is used again further down the file. */
    #define isodigit(c)   ((c) >= '0' && (c) <= '7')
    #define odigit_val(c) ((c) - '0')
    #define p		  (*ptr_p)

    if (*p == '\\')
    {
	p++; /* skip the backslash */

	/*
	 * A backslash at the very end of the pattern quotes nothing.
	 * Without this check the terminating NUL would be consumed as a
	 * literal character and *PTR_P would be advanced past the end
	 * of the pattern, so callers would go on to read beyond it.
	 */
	if (*p == '\0')
	    ERROR(REGCOMP_CORRUPTED_TEXT, p);

	/* accept a two- or three-digit octal value */
	if (isodigit(p[0]) && isodigit(p[1]))
	{
	    unsigned num = odigit_val(p[0]) * 8 + odigit_val(p[1]);
	    if (isodigit(p[2])) {
		num = num * 8 + odigit_val(p[2]);
		p++;
	    }
	    p += 2;
	    if (num > 0xff)
		ERROR(REGCOMP_INVALID_OCTAL_VALUE, p);
	    out[0] = num;
	    return 1;
	}

	switch (*p)
	{
	  case 't': p++; out[0] = '\t'; return 1; /* tab */
	  case 'n': p++; out[0] = '\n'; return 1; /* newline */
	  case 'r': p++; out[0] = '\r'; return 1; /* carriage return */
	  case 'f': p++; out[0] = '\f'; return 1; /* form feed */
	}
	/* any other quoted character is taken literally, below */
    }

    switch(len = EUC_CHAR_LENGTH(p[0]))
    {
      default:
	ERROR(REGCOMP_CORRUPTED_TEXT, p);

      case 3:
	out[2] = p[2];  	/*fallthrough to next case */
      case 2:
	out[1] = p[1];  	/*fallthrough to next case */
      case 1:
	out[0] = p[0];
    }
    p += len;
    return len;
    #undef p
}

/*
 * Function to parse a class [....] spec from a pattern.
 *
 * INTER is the intermediate node to fill in (it becomes a CLASS node, or,
 * when the class is simple enough, one of the EXACT* / EITHER_EXACT_2
 * nodes -- see the "optim" section at the end).
 * P points at the first character after the opening '[' of the class.
 *
 * Returns the pointer to the character just after the closing ']', or 0
 * (with the error recorded via ERROR()) if the class is malformed.
 */
__inline__ static const unsigned char *
nab_class(struct intermediate *inter, const unsigned char *p)
{
    unsigned char current[3];
    int char1_count = 0;
    int char2_count = 0;
    int char3_count = 0;
    int lastascii = -1;  /* previous plain ASCII char, for 'a-z' ranges */
    int ascii = 0; /* doesn't need initialized except to quiet gcc warnings */
    const unsigned char *pat;
    struct class *class = regex_malloc(sizeof(struct class));
    unsigned i; /* general use */

    inter->type = CLASS;
    inter->must_match = 1;
    bzero(inter->u.class = class, sizeof(struct class));

    /* If first char is '^', it's inverted. */
    if (*p == '^')
    {
	class->inverted = 1;
	p++; /* skip '^' */
    }
    pat = p;


    /* run through the class specification in the pattern */
    do
    {
	/* Look for special things that we don't want to count as
	 * regular characters */
	if (p[0] == '\\')
	{
	    switch (p[1])
	    {
	      case 'd':		/* \d means "digit" */
		for (i = '0'; i <= '9'; i++)
		    class->ascii[i] = 1;
		char1_count += 10;
		p += 2;
		lastascii = -1;
		continue;

	      case 'w':		/* \w means ASCII word */
		for (i = 0; i < 127; i++)
		    if (jregex_part_of_word[i]) {
			char1_count++;
			class->ascii[i] = 1;
		    }
		p += 2;
		lastascii = -1;
		continue;

	      case 's':		/* whitespace */
		class->ascii['\t'] =
		class->ascii['\n'] =
		class->ascii['\r'] =
		class->ascii['\f'] =
		class->ascii[' ' ] = 1;
		char1_count += 5;
		lastascii = -1;
		p += 2;
		continue;
	    }
	}

	/* a '-' right after a plain ASCII character introduces a range */
	if (lastascii != -1 && p[0] == '-')
	{
	    unsigned start, end;
	    p++; /* skip range indicator '-' */
	    if (*p == 0)
		ERROR(REGCOMP_CORRUPTED_TEXT, p);
	    if (next_quoted_character(&p, current) != 1)
		ERROR(REGCOMP_EUC_IN_CLASS_RANGE, p);
	    if (current[0] >= 0x80)
		ERROR(REGCOMP_CORRUPTED_TEXT, p);

	    /* accept the endpoints in either order */
	    start = (unsigned char)lastascii < current[0]
		? lastascii : current[0];
	    end   = (unsigned char)lastascii > current[0]
		? lastascii : current[0];

	    for (i = start; i <= end; i++)
	    {
		char1_count++;
		class->ascii[i] = 1;
		if (regcomp_flags & REGCOMP_IGNORE_ALPHA_CASE)
		{
		    if (islower(i))
			class->ascii[toupper(i)] = 1;
		    else if (isupper(i))
			class->ascii[tolower(i)] = 1;
		}
	    }
	    lastascii = -1;
	    continue;
	}

	if (*p == 0)
	    ERROR(REGCOMP_UNCLOSED_CLASS, p);

	switch(next_quoted_character(&p, current))
	{
	  default:
	    ERROR(REGCOMP_CORRUPTED_TEXT, p);

	  case 1:
	    if (current[0] & 0x80)
		ERROR(REGCOMP_CORRUPTED_TEXT, p);
	    char1_count++;
	    class->ascii[ascii = lastascii = current[0]] = 1;
	    if (regcomp_flags & REGCOMP_IGNORE_ALPHA_CASE)
	    {
		if (isupper(current[0]))
		    class->ascii[ascii = tolower(current[0])] = 1;
		else if (islower(current[0]))
		    class->ascii[toupper(current[0])] = 1;
	    }
	    continue;

	  case 2:
	    /* deal with kana folding */
	    if ((regcomp_flags & REGCOMP_IGNORE_KANA_CASE) &&
		current[0] == EUC_KATAKANA)
	    {
		current[0] = EUC_HIRAGANA;
	    }
	    /* only record each two-byte character once */
	    if (!twobyte_char_in_string(current[0], current[1],
	      class->euc2.buf, class->euc2.buf + class->euc2.buf_used))
	    {
		char2_count++;
		add_char_to_string(current, &class->euc2);
	    }
	    lastascii = -1;
	    continue;

	  case 3:
	    /*
	     * Only record each three-byte character once. The third byte
	     * is current[2] (current[] has just three elements) and the
	     * duplicates to look for live in the euc3 buffer.
	     */
	    if (!threebyte_char_in_string(current[0], current[1], current[2],
	      class->euc3.buf, class->euc3.buf + class->euc3.buf_used))
	    {
		char3_count++;
		add_char_to_string(current, &class->euc3);
	    }
	    lastascii = -1;
	    continue;
	}
    } while (*p && *p != ']');

    /* did we run off the end of the pattern specifier? */
    if (*p == 0)
	ERROR(REGCOMP_UNCLOSED_CLASS, pat);

    p++; /* skip past the close bracket */

    /*
     * Sanity check: each counted two-byte character occupies two bytes of
     * euc2 and each counted three-byte character three bytes of euc3.
     */
#if 1
    if (!(char2_count * 2 == class->euc2.buf_used) ||
	!(char3_count * 3 == class->euc3.buf_used))
    {
	outputf("char2_count=%d, class->euc2.buf_used = %d\n",
	       char2_count, class->euc2.buf_used);
	outputf("char3_count=%d, class->euc3.buf_used = %d\n",
	       char3_count, class->euc3.buf_used);
    }
#endif

    assert(char2_count * 2 == class->euc2.buf_used);
    assert(char3_count * 3 == class->euc3.buf_used);

    if (char1_count + char2_count + char3_count == 0)
	ERROR(REGCOMP_EMPTY_CLASS, pat);

    /* shortest thing the class can match, in bytes */
    if (class->inverted || char1_count)
	inter->min_match_len = 1;
    else if (char2_count)
	inter->min_match_len = 2;
    else
	inter->min_match_len = 3;

    inter->regex_bytes_needed = DETERMINISTIC_CLASS_SIZE +
	char2_count * 2 + char3_count * 3;


/* optim */

    if (!class->inverted &&
	char1_count == 0 && char3_count == 0 && char2_count == 2)
    {
	/*
	 * Exactly two two-byte characters: use an EITHER_EXACT_2 node.
	 * If ignoring case and one or both is kana, we can't do this.
	 */
	if (!(regcomp_flags & REGCOMP_IGNORE_KANA_CASE) ||
	    ((class->euc2.buf[0] != EUC_HIRAGANA) &&
	     (class->euc2.buf[0] != EUC_KATAKANA) &&
	     (class->euc2.buf[2] != EUC_HIRAGANA) &&
	     (class->euc2.buf[2] != EUC_KATAKANA)))
	{
	    inter->type = EITHER_EXACT_2; /* make an EITHER_EXACT_2 */
	    inter->u.exact[0] = class->euc2.buf[0];
	    inter->u.exact[1] = class->euc2.buf[1];
	    inter->u.exact[2] = class->euc2.buf[2];
	    inter->u.exact[3] = class->euc2.buf[3];
	    inter->regex_bytes_needed = TYPE_BYTE_SIZE + 2 * EUC_SPEC_SIZE;
	}
    }
    else if (!class->inverted && char1_count + char3_count + char2_count == 1)
    {
	/* well, just make this an EXACT */
	if (char1_count == 1)
	{
	    if ((regcomp_flags & REGCOMP_IGNORE_ALPHA_CASE) && isalpha(ascii))
		inter->type = EXACT_ICASE_ASCII;
	    else
		inter->type = EXACT1;
	    inter->u.exact[0] = ascii;
	    inter->regex_bytes_needed = TYPE_BYTE_SIZE + ASCII_SPEC_SIZE;
	}
	else if (char2_count == 1)
	{
	    /* katakana was already folded to hiragana above when
	       ignoring kana case, so only hiragana needs checking */
	    if ((regcomp_flags & REGCOMP_IGNORE_KANA_CASE) &&
		(class->euc2.buf[0] == EUC_HIRAGANA))
		    inter->type = EXACT_ICASE_KANA;
	    else
		inter->type = EXACT2;
	    inter->u.exact[0] = class->euc2.buf[0];
	    inter->u.exact[1] = class->euc2.buf[1];
	    inter->regex_bytes_needed = TYPE_BYTE_SIZE + EUC_SPEC_SIZE;
	}
	else if (char3_count == 1)
	{
	    inter->type = EXACT3;
	    inter->u.exact[0] = class->euc3.buf[0];
	    inter->u.exact[1] = class->euc3.buf[1];
	    inter->u.exact[2] = class->euc3.buf[2];
	    inter->regex_bytes_needed = TYPE_BYTE_SIZE + 3;
	} else {
	    assert(0);
	}
    }
/* end optim */

    return p;
}

/*
 * Walk the NEXT chain starting at I and OR into each node's must_match
 * the must_match of every node after it. Returns the resulting
 * must_match of I itself.
 */
static __inline__ int set_mustmatch(struct intermediate *i)
{
    kibishii_assert(i != 0);
    if (i->next != 0)
	i->must_match |= set_mustmatch(i->next);
    return i->must_match;
}

/*
 * PAT is the current position in the pattern, for functions that take
 * it as a pointer-to-pointer parameter named PP.
 */
#define PAT	(*pp)

/* forward */
static struct intermediate *nibble_from_pattern(const unsigned char **pp);

/*
 * Read a list of nodes from the pattern at PAT, up to the ')' that closes
 * PARENT's group or, when PARENT is 0 (top level), up to the end of the
 * pattern. Alternatives separated by '|' are gathered under an ALT node.
 *
 * Returns the first node of the list (the head node carries the list's
 * regex_bytes_needed / min_match_len / must_match totals), or an ALT node
 * if alternatives were seen, or a dummy UNUSED node for an empty group.
 * Returns 0 on error, with the error recorded via ERROR().
 */
static struct intermediate *
nibble_list(const unsigned char **pp, const struct intermediate *parent)
{
    /*
     * The kanji repetition mark (JIS X 0208 row 1, cell 25) as EUC-JP
     * bytes. Spelled out numerically so that it does not depend on the
     * encoding this source file happens to be stored in.
     */
    static const unsigned char kanji_repeat_mark[2] = { 0xa1, 0xb9 };

    const unsigned char *start_PAT = PAT;
    struct intermediate *listhead = 0; /* head of current list.           */
    struct intermediate *lastlist;     /* last element in the list [x]    */

    struct intermediate *alt = 0;      /* encompassing ALT node, if exists */
    struct intermediate *lastalt;      /* last alternative in list [x]    */
    /*
     * [x] -- "may be used uninitialized" Ok.
     */

    struct intermediate *to_be_returned = 0; /* what we'll eventually return */

    /* Fold the totals of the alternative just finished (in LISTHEAD)
       into the enclosing ALT node. Requires ALT and LISTHEAD non-null. */
    #define update_stats_for_current_alternative()                           \
    macro_start {                                                            \
	set_mustmatch(listhead);                                             \
	if (alt->min_match_len > listhead->min_match_len)                    \
	    alt->min_match_len = listhead->min_match_len;                    \
	alt->regex_bytes_needed +=                                           \
	    listhead->regex_bytes_needed + EXTRA_FOR_EACH_BRACE;             \
	alt->must_match &= listhead->must_match;                             \
    } macro_end


    /*
     * Nibble away at the pattern, adding to our alternative until
     * we hit the next alternative, or run out of pattern space,
     * or leave a level of parenthesis.
     */
    for (;;)
    {
	struct intermediate *new;

	/* if have a '|' now, it's because of a "||", which is illegal */
	if (PAT[0] == '|')
	    ERROR(REGCOMP_BAD_BRACE, PAT);

	if (PAT[0] == ')')
	{
	    /*
	     * We'll be leaving, so update status info in the ALT node
	     * from the currently-being-built alternative (whose stats
	     * are in LISTHEAD)
	     */
	    if (alt && listhead)
		update_stats_for_current_alternative();

	    if (parent == 0)
		ERROR(REGCOMP_UNMATCHED_CLOSE_PAREN, PAT);

	    #ifndef NO_PAREN_INFO
	    if (parent->u.paren_info.real)
	        paren_levels_finished = parent->u.paren_info.level +1;
	    #endif

	    if (to_be_returned == 0)
	    {
		/* make a dummy empty node */
		to_be_returned = new_comp_struct();
		to_be_returned->type = UNUSED;
	    }
	    PAT++; /* skip past closed paren */
	    return to_be_returned;
	}

#if 1
	/* a '$' ending the whole pattern is not part of the last alternative */
	if (PAT[0] == '$' && PAT[1] == '\0' && alt)
	{
	    /*
	     * We'll be leaving, so update status info in the ALT node
	     * from the currently-being-built alternative (whose stats
	     * are in LISTHEAD)
	     */
	    if (listhead)
		update_stats_for_current_alternative();

	    if (parent != 0)
		return to_be_returned;
	    else
	    {
		/* continue the list after the ALT node itself */
		lastlist = alt;
		listhead = to_be_returned;
		alt = 0;
	    }
	}
#endif

	/* get next bit of the pattern */
        if (new = nibble_from_pattern(pp), new == 0)
	    return 0;
	new->parent = alt ? alt : parent;

	/* the end-of-pattern node (or a newline) closes the alternatives:
	   it follows the ALT node rather than joining the last alternative */
	if (alt && (new->type == this_regex_match ||
	    (new->type == EXACT1 && new->u.exact[0] == '\n')))
	{
	    if (listhead)
		update_stats_for_current_alternative();
	    listhead = to_be_returned;
	    lastlist = alt;
	    alt = 0;
	}

	if (listhead == 0)
	{
	    /* first node of a new list (or of a new alternative) */
	    listhead = lastlist = new;
	    if (alt)
	    {
		lastalt->nextalt = listhead;
		lastalt = new;
	    }
	} else {
	    if (regcomp_flags & REGCOMP_FUZZY_KANA_REPETITION)
	    {
		/*
		 * Allow the kanji repetition mark to match where
		 * appropriate. When set, a doubled kanji and that kanji
		 * followed by the repetition mark will match each other.
		 *
		 * If the new and the last node are both EXACT2 and the
		 * last one was a regular kanji (high byte 0xb0 or greater)
		 * and the new node is either the same character or the
		 * repetition mark, turn the new one into a
		 * char-or-repetition-mark node.
		 */
		if (lastlist->type == EXACT2 && new->type == EXACT2 &&
		    lastlist->u.exact[0] >= 0xb0 &&
		    ((new->u.exact[0] == lastlist->u.exact[0] &&
		      new->u.exact[1] == lastlist->u.exact[1]) ||
		     (new->u.exact[0] == kanji_repeat_mark[0] &&
		      new->u.exact[1] == kanji_repeat_mark[1])))
		{
		    new->type = EITHER_EXACT_2;
		    new->u.exact[0] = lastlist->u.exact[0];
		    new->u.exact[1] = lastlist->u.exact[1];
		    new->u.exact[2] = kanji_repeat_mark[0];
		    new->u.exact[3] = kanji_repeat_mark[1];
		    new->regex_bytes_needed = TYPE_BYTE_SIZE + 2*EUC_SPEC_SIZE;
		}
	    }
	    /* append to the list; the head node accumulates the totals */
	    lastlist->next = new;
	    lastlist = new;
	    listhead->regex_bytes_needed += new->regex_bytes_needed;
	    listhead->min_match_len += new->min_match_len;
	    listhead->must_match |= new->must_match;
	}

	if (new->type == this_regex_match)
	{
	    /* hit the end of the pattern */
	    if (parent)
		ERROR(REGCOMP_UNMATCHED_OPEN_PAREN, start_PAT - 1);

	    /*
	     * For an empty pattern the match node is the very first node
	     * read and nothing has been chosen for return yet; return the
	     * match node itself rather than handing a null pointer to
	     * set_mustmatch().
	     */
	    if (to_be_returned == 0)
		to_be_returned = listhead;

	    set_mustmatch(to_be_returned);
	    return to_be_returned;
	}

	if (to_be_returned == 0)
	    to_be_returned = listhead;

	if (PAT[0] == '|')
	{
	    /*
	     * Starting another alternative.
	     */
	    if (alt != 0)
	    {
		/* for non-first alternatives */
		update_stats_for_current_alternative();
	    }
	    else
	    {
		/*
		 * Hitting this '|' has told us that the stuff we'd just
		 * nibbled up was all a first-alternative, with the 2nd
		 * (and possibly more) to follow.
		 *
		 * We have to make a new ALT node, and make everything to
		 * the left (in the pattern) that we just read the first
		 * child of that ALT node, with the next alternative(s)
		 * that child's siblings.
		 *
		 * We have to install the ALT node in place of what was
		 * removed to become the first child.
		 */
		struct intermediate *ptr;
		/* make new structure for alt node */
		alt = new_comp_struct();
		alt->type = ALT;
		alt->parent = parent; /* set its parent */

		/*
		 * Now, rather than returning a whole list, we'll return
		 * just the ALT node, whose children will be the various
		 * lists....
		 */
		if (to_be_returned == listhead)
		    to_be_returned = alt;


		/* note the first alternative */
		set_mustmatch(listhead);
		lastalt = alt->u.alt = listhead;

		/* alt node now becomes parent for all elements in that list */
		for (ptr = listhead; ptr; ptr = ptr->next)
		    ptr->parent = alt;

		alt->min_match_len      = listhead->min_match_len;
		alt->regex_bytes_needed = listhead->regex_bytes_needed;
		alt->must_match         = listhead->must_match;
	    }

	    listhead = 0;
	    PAT++; /* skip past '|' */
	}
    }
}

/*
 * Nibble one node worth of the (pointer to a pointer to the) pattern.
 * If the pattern is sitting on something enclosed in parens, the entire
 * thing is nibbled recursively and the one OPEN_PAREN node (with its children
 * being the "entire thing") is returned.
 *
 * A trailing '+', '*' or '?' is consumed too and recorded in the node's
 * COUNT. At the end of the pattern a node of type this_regex_match is
 * returned.
 *
 * Returns the new node, or 0 on error (recorded via ERROR()).
 */
static struct intermediate *
nibble_from_pattern(const unsigned char **pp)
{
    unsigned char this[3];
    struct intermediate *new = new_comp_struct();

    /*
     * We don't worry about freeing up the node memory if we abort early...
     * regex_free_temp_memory() will take care of it all for us.
     */

    /* look for special things in the pattern */
    switch (PAT[0])
    {
      case '\0':
	new->type = this_regex_match;
	new->regex_bytes_needed = TYPE_BYTE_SIZE;
	new->min_match_len = 0;
	return new;

      case '|':	/* should never run into this here */
      case ')':	/* should never run into this here */
	assert(0);
	break;

      case '.':
	new->type = ANY_CHAR;
	new->regex_bytes_needed = TYPE_BYTE_SIZE;
	new->min_match_len = 1;
	++PAT; /* skip past '.' */
	goto allow_starplus; /* can have a +, *, or ? appended */

      case '<':
      case '>':
	++PAT; /* skip past '<' or '>' */
        word_boundary:
	new->type = WORD_BOUNDARY;
	new->must_match = 1;
	new->regex_bytes_needed = TYPE_BYTE_SIZE;
	return new;

      case '$':
	new->type = END_OF_LINE;
	new->must_match = 1;
	new->regex_bytes_needed = TYPE_BYTE_SIZE;
	++PAT; /* skip past '$' */
	return new;

      case '^':
	new->type = START_OF_LINE;
	new->must_match = 1;
	new->regex_bytes_needed = TYPE_BYTE_SIZE;
	++PAT; /* skip past '^' */
	return new;

      case '*':
      case '+':
      case '?':
	/* these should never appear here */
	ERROR(REGCOMP_MISUSED_COUNT_SPEC, PAT);
	/* notreached */

      case '[':
	if (PAT = nab_class(new, PAT+1), PAT == 0)
	    return 0;
	goto allow_starplus; /* can have a +, *, or ? appended */

      case '(':
	++PAT; /* skip past open paren */
	{
	    /* perl5's grouping-only parens: (?: ... ) */
	    int grouping_only = (PAT[0] == '?' && PAT[1] == ':');

	    if (grouping_only)
		PAT += 2; /* skip past the "?:" */

	    /* the paren bookkeeping fields exist only when paren info
	       is compiled in, so only touch them under that guard */
	    #ifndef NO_PAREN_INFO
	    new->u.paren_info.real = !grouping_only;
	    if (new->u.paren_info.real)
		new->u.paren_info.level = paren_level++;
	    #endif
	}
	new->type = OPEN_PAREN;


	/* nibble entire contents of group */
	if ((new->u.paren_info.paren = nibble_list(pp,new)) == 0)
	    return 0;

	/* first child node will have list totals */
	new->must_match         = new->u.paren_info.paren->must_match;
	new->min_match_len      = new->u.paren_info.paren->min_match_len;
	new->regex_bytes_needed = new->u.paren_info.paren->regex_bytes_needed;

	#ifndef NO_PAREN_INFO
        if (new->u.paren_info.real)
	{
	    new->u.paren_info.max_internal_level = paren_level - 1;

	    /*
	     * If we will be writing the two commands for noting the
	     * position of a paren start and end, allot the space
	     * for them.
	     */
	    if (regcomp_flags & REGCOMP_SAVE_MATCHED_PAREN_INFO)
		new->regex_bytes_needed += TYPE_BYTE_SIZE * 2 + 1 * 2;
	}
	#endif
	goto allow_starplus; /* can have a +, *, or ? appended */

      case '\\':
	/* backslash + non-zero digit + non-octal-digit == match paren */
	if (PAT[1] != '0' && isdigit(PAT[1]) && !isodigit(PAT[2]))
	{
	    /* match paren #%d exactly */
	    #ifndef NO_PAREN_INFO
	    if ((regcomp_flags & REGCOMP_SAVE_MATCHED_PAREN_INFO) == 0)
	    #endif
		ERROR(REGCOMP_NEED_SAVE_PAREN_INFO, PAT);

	    #ifndef NO_PAREN_INFO
	      new->type = MATCH_PREV_PAREN;
	      new->u.prev_paren_to_match = PAT[1] - '0' - 1;

	      /*
	       * prev_paren_to_match is zero-based while
	       * paren_levels_finished is one more than the level of the
	       * most recently closed paren set, so the valid values are
	       * 0 .. paren_levels_finished-1.
	       */
	      if (new->u.prev_paren_to_match >= paren_levels_finished)
		  ERROR(REGCOMP_PAREN_LEVEL_INVALID, PAT);

	      if ((signed int)new->u.prev_paren_to_match>highest_prev_to_match)
		  highest_prev_to_match = new->u.prev_paren_to_match;

	      new->regex_bytes_needed = TYPE_BYTE_SIZE + PAREN_SPEC_SIZE;
	      new->min_match_len = 0; /* actually could figure this out */
	      new->must_match = 1;
	      PAT += 2; /* skip past \digit */
	      goto allow_starplus; /* Can't have a +, *, or ? appended, but
				    * we'll get an error if there is one */
            #endif
	}

	switch(PAT[1])
	{
	  case 'b':
	    PAT += 2;
	    goto word_boundary;

	  case 'a':	/* ASCII (except newline) character */
	  case 'H':	/* anything not hiragana */
	  case 'K':	/* anything not katakana */
	  case 'C':	/* anything not kanji */
	    switch(PAT[1]) {
	      case 'a': new->type = ANY_ASCII; break;
	      case 'H': new->type = ANY_NONHIRAGANA; break;
	      case 'K': new->type = ANY_NONKATAKANA; break;
	      case 'C': new->type = ANY_NONKANJI; break;
	    }
	    new->regex_bytes_needed = TYPE_BYTE_SIZE;
	    new->min_match_len = 1;
	    PAT += 2; /* skip past '\a', etc. */
	    goto allow_starplus; /* can have a +, *, or ? appended */

	  case 'A':	/* multibyte character */
	  case 'h':	/* any (multibyte) hiragana */
	  case 'k':	/* any (multibyte) katakana */
	  case 'c':	/* any (multibyte) kanji */
	    switch (PAT[1]) {
	      case 'A': new->type = ANY_MULTIBYTE; break;
	      case 'h': new->type = ANY_HIRAGANA; break;
	      case 'k': new->type = ANY_KATAKANA; break;
	      case 'c': new->type = ANY_KANJI; break;
	    }
	    new->regex_bytes_needed = TYPE_BYTE_SIZE;
	    new->min_match_len = 2;
	    PAT += 2; /* skip past '\A', etc. */
	    goto allow_starplus; /* can have a +, *, or ? appended */


	  case 'd': /* digit */
	  case 'w': /* ascii word */
	  case 's': /* whitespace */
	  case 'D': /* not digit */
	  case 'W': /* not ascii word */
	  case 'S': /* not whitespace */
	    {
		/* we construct a character class specifier
		   which is the same and use that... */
		int inverted = isupper(PAT[1]);
		unsigned char class[5]; /* room for "^\x]" plus the NUL */
		int i = 0;
		if (inverted) {
			class[i++] = '^';
		}
		class[i++] = '\\';
		class[i++] = inverted ? tolower(PAT[1]) : PAT[1];
		class[i++] = ']';
		class[i++] = '\0';
		if (nab_class(new, class) == 0)
		    return 0;
		/* new->u.class->inverted = inverted; */
		new->min_match_len = 1;
		PAT += 2; /* skip past \d, etc. */
		goto allow_starplus; /* can have a +, *, or ? appended */
	    }
	}
    }

    /* was nothing special.... nab the next character as-is */
    switch(next_quoted_character(&PAT, this))
    {
      default:
	ERROR(REGCOMP_CORRUPTED_TEXT, PAT);

      case 1:
	{
	    unsigned char c = this[0];
	    /* if ignoring alpha-case, and this is a letter... */
	    if ((regcomp_flags & REGCOMP_IGNORE_ALPHA_CASE)
		&& isascii(c) && isalpha(c))
	    {
		/* the letter is stored in lower case */
		new->type = EXACT_ICASE_ASCII;
		if (isupper(c))
		    c = tolower(c);
	    } else {
		new->type = EXACT1;
	    }
	    new->u.exact[0] = c;
	}
	new->min_match_len = 1;
	new->regex_bytes_needed = TYPE_BYTE_SIZE + 1;
	break;

      case 2:
	{
	    unsigned char hi = this[0];

	    /* if ignoring kana case, and this is a kana.... */
	    if ((regcomp_flags & REGCOMP_IGNORE_KANA_CASE) &&
		(hi == EUC_HIRAGANA || hi == EUC_KATAKANA))
	    {
		/* the kana is stored as hiragana */
		new->type = EXACT_ICASE_KANA;
		hi = EUC_HIRAGANA;
	    } else {
		new->type = EXACT2;
	    }
	    new->u.exact[0] = hi;
	    new->u.exact[1] = this[1];
	}
	new->min_match_len = 2;
	new->regex_bytes_needed = TYPE_BYTE_SIZE + 2;
	break;

      case 3:
	new->type = EXACT3;
	new->u.exact[0] = this[0];
	new->u.exact[1] = this[1];
	new->u.exact[2] = this[2];
	new->min_match_len = 3;
	new->regex_bytes_needed = TYPE_BYTE_SIZE + 3;
	break;
    }

    goto allow_starplus;  /* can have a +, *, or ? appended */


    /***************************************************/


  allow_starplus:

    /* Whatever just preceded is allowed to have a +, *, or ? after
       it. If it's there, act appropriately.  If it's a simple node,
       it'll just need to have its COUNT updated, but if it's an OPEN
       PAREN type, we'll have to note the extra space we'll need for
       that in the pattern.  Also, for * and ?, since even a null string
       will match something marked with one of these, we can note that the
       must_match and min_match_len are false and zero, respectively. */

    switch (PAT[0])
    {
      case '+': new->count = ONE_OR_MORE;
	++PAT; /* skip the '+' */
	if (new->type == OPEN_PAREN) {
	    /* a group that can match nothing can't be repeated */
	    if (new->min_match_len == 0)
		ERROR(REGCOMP_INFINITE_PLUS, PAT);
	    new->regex_bytes_needed += EXTRA_FOR_PAREN_PLUS;
	}
	break;

      case '*': new->count = ANY_NUMBER;
	++PAT; /* skip the '*' */
	if (new->type == OPEN_PAREN) {
	    /* a group that can match nothing can't be repeated */
	    if (new->min_match_len == 0)
		ERROR(REGCOMP_INFINITE_STAR, PAT);
	    new->regex_bytes_needed += EXTRA_FOR_PAREN_STAR;
	}
	new->min_match_len = 0;
	new->must_match = 0;
	break;

      case '?': new->count = ZERO_OR_ONE;
	++PAT; /* skip the '?' */
	new->min_match_len = 0;
	new->must_match = 0;
	if (new->type == OPEN_PAREN)
	    new->regex_bytes_needed += EXTRA_FOR_PAREN_QUESTION;
	break;

      default:
	new->count = ONCE;
	if (new->type == OPEN_PAREN)
	    new->regex_bytes_needed += EXTRA_FOR_RAW_PAREN;
	break;
    }

    /* note that we must match anything with a non-zero minimum match length */
    if (new->min_match_len != 0)
	new->must_match = 1;

    return new;
}


/* forward */
static void write_regex_buffer(const struct intermediate *,
			       unsigned char **,
			       int,
			       unsigned char **);

/* I should probably just make this a global.... */
/* OUTBUF is the current write position in the compiled regex, for
   functions that take it as a pointer-to-pointer parameter named BB. */
#define outbuf  (*bb)

/*
 * [kept as its own function purely for readability...
 *  there is a single caller]
 *
 * Emit the compiled form of the CLASS node INTER at the output position
 * *BB, leaving *BB just past what was written. The layout is:
 *   type byte (CLASS combined with the node's count),
 *   inverted flag,
 *   short: bytes of two-byte characters,
 *   short: bytes of three-byte characters,
 *   the ASCII membership table,
 *   the two-byte characters, then the three-byte characters.
 */
static __inline__ void
write_regex_buffer_for_class(const struct intermediate *inter,
			     unsigned char **bb)
{
    const struct class *cls = inter->u.class;
    unsigned n;

    /* header: type byte and inverted flag */
    *outbuf++ = make_type_byte_with_count(CLASS, inter->count);
    *outbuf++ = cls->inverted;

    /* sizes of the two multibyte lists */
    write_short_val(outbuf, cls->euc2.buf_used);
    outbuf += SHORT_VAL_SIZE;
    write_short_val(outbuf, cls->euc3.buf_used);
    outbuf += SHORT_VAL_SIZE;

    /* ASCII membership table */
    n = 0;
    while (n < sizeof(cls->ascii))
    {
	*outbuf++ = cls->ascii[n];
	n++;
    }

    /* two-byte characters */
    n = 0;
    while (n < cls->euc2.buf_used)
    {
	*outbuf++ = cls->euc2.buf[n];
	n++;
    }

    /* three-byte characters */
    n = 0;
    while (n < cls->euc3.buf_used)
    {
	*outbuf++ = cls->euc3.buf[n];
	n++;
    }
}

/*
 * Compact, in place, the characters of [RANGE1_START, RANGE1_END) down to
 * those that also occur in [RANGE2_START, RANGE2_END). Characters are
 * one, two, or three bytes long as given by EUC_CHAR_LENGTH of the lead
 * byte. Returns a pointer just past the last byte kept.
 */
static __inline__ unsigned char *
consolidate_musthave(unsigned char *range1_start,
		     const unsigned char *range1_end,
		     const unsigned char *range2_start,
		     const unsigned char *range2_end)
{
    unsigned char *keep = range1_start;       /* where the next survivor goes */
    const unsigned char *scan = range1_start; /* character being examined */

    for (; scan < range1_end; )
    {
	unsigned width; /* bytes in the character at SCAN */
	unsigned k;
	int wanted = 0; /* nonzero if the character is in range 2 too */

	switch(EUC_CHAR_LENGTH(scan[0]))
	{
	  case 1:
	    width = 1;
	    if (onebyte_char_in_string(scan[0], range2_start, range2_end))
		wanted = 1;
	    break;

	  case 2:
	    width = 2;
	    if (twobyte_char_in_string(scan[0], scan[1],
				       range2_start, range2_end))
		wanted = 1;
	    break;

	  case 3:
	    width = 3;
	    if (threebyte_char_in_string(scan[0], scan[1], scan[2],
					 range2_start, range2_end))
		wanted = 1;
	    break;

	  default:
	    /* unexpected length: step over a single byte, keep nothing */
	    assert(0);
	    width = 1;
	    break;
	}

	if (wanted)
	    for (k = 0; k < width; k++)
		*keep++ = scan[k];

	scan += width;
    }

    return keep;
}


/*
 * Emit the compiled code for INTER, one member of a chain of alternatives
 * linked through ->nextalt, followed (recursively) by the rest of the chain.
 *
 * Layout produced for every alternative except the last:
 *
 *	PUSH  <offset to the next alternative>
 *	...code for this alternative...
 *	JUMP  <offset past all the alternatives>   (when noomit is true)
 *   or REGEX_MATCH                                 (when noomit is false)
 *
 * The last alternative is emitted bare, with no PUSH and no JUMP.
 *
 * NOOMIT is handed down by the caller: write_regex_buffer() passes its
 * own "must_not_omit" value.  When it is false, whatever follows the
 * alternatives need not match, so each alternative can end the match
 * directly with a REGEX_MATCH instead of jumping past its siblings.
 *
 * MUSTHAVE, when nonzero, points at a cursor into a buffer collecting
 * the characters that any matching string is required to contain.  A
 * character is required by a set of alternatives only when every one of
 * them requires it, so on the way back out of the recursion the
 * characters recorded for this alternative are intersected (via
 * consolidate_musthave) with those recorded for all the later
 * alternatives, and the cursor is left at the end of what survived.
 */
static void
write_regex_buffer_for_alt(const struct intermediate *inter,
			   unsigned char **bb,
			   int noomit,
			   unsigned char **musthave)
{
    unsigned char *push_slot;        /* where this alternative's PUSH goes  */
    unsigned char *jump_slot = 0;    /* where its trailing JUMP goes, if any */
    unsigned char *own_set_start;    /* start of this alt's must-have chars  */
    unsigned char *later_set_start;  /* start of later alts' must-have chars */

    if (musthave)
	own_set_start = *musthave;
    else
	own_set_start = 0;

    /* The final alternative needs no scaffolding at all. */
    if (inter->nextalt == 0) {
	write_regex_buffer(inter, &outbuf, noomit, musthave);
	return;
    }

    /* Reserve room for the PUSH; its offset is not known yet. */
    push_slot = outbuf;
    outbuf += TYPE_BYTE_SIZE + SHORT_VAL_SIZE;

    /* The body of this alternative. */
    write_regex_buffer(inter, &outbuf, noomit, musthave);

    if (musthave)
	later_set_start = *musthave;
    else
	later_set_start = 0;

    if (noomit)
    {
	/* Reserve room for the JUMP over the remaining alternatives. */
	jump_slot = outbuf;
	outbuf += TYPE_BYTE_SIZE + SHORT_VAL_SIZE;
    }
    else
    {
	/* Nothing of consequence follows: succeed right here. */
	*outbuf++ = make_type_byte(this_regex_match);
    }

    /* The next alternative starts here, so the PUSH can now be filled in. */
    *push_slot = make_type_byte(PUSH);
    write_short_val(&push_slot[1], outbuf - push_slot);

    /* Everything after this alternative. */
    write_regex_buffer_for_alt(inter->nextalt, &outbuf, noomit, musthave);

    if (noomit)
    {
	/* All alternatives are out; aim the reserved JUMP just past them. */
	*jump_slot = make_type_byte(JUMP);
	write_short_val(&jump_slot[1], outbuf - jump_slot);
    }

    /* Not tracking must-have characters?  Then we are finished. */
    if (own_set_start == 0)
	return;

    /*
     * Keep, out of [own_set_start, later_set_start), only the characters
     * also present in [later_set_start, *musthave), and move the cursor
     * to the end of the survivors.
     */
#ifndef NDEBUG
    if (regcomp_flags & REGCOMP_DEBUG)
    {
	const unsigned char *scan;

	output("intersection of [");
	for (scan = own_set_start; scan < later_set_start; scan++) {
	    if (*scan)
		outchar(*scan);
	}
	output("] and [");
	for (scan = later_set_start; scan < *musthave; scan++) {
	    if (*scan)
		outchar(*scan);
	}
	output("] yields ");
    }
#endif

    *musthave = consolidate_musthave(own_set_start, later_set_start,
				     later_set_start, *musthave);

#ifndef NDEBUG
    if (regcomp_flags & REGCOMP_DEBUG)
    {
	const unsigned char *scan;

	outchar('[');
	for (scan = own_set_start; scan < *musthave; scan++) {
	    if (*scan)
		outchar(*scan);
	}
	output("]\n");
    }
#endif
}


/*
 * Main compiled-pattern writing routine.
 *
 * Walks the sibling list starting at INTER (linked through ->next) and
 * appends the compiled form of each node to the output buffer.  Nested
 * groups and alternatives are handled by recursion (directly, or through
 * write_regex_buffer_for_alt / write_regex_buffer_for_class).
 *
 *   inter    - first node of the intermediate list to emit; must be nonzero.
 *   bb       - address of the output cursor.  NOTE(review): the body only
 *              ever names "outbuf", which is not declared here; presumably
 *              a file-level macro for *bb -- confirm.
 *   noomit   - see below.
 *   musthave - if nonzero, address of a cursor into a buffer receiving the
 *              bytes of characters that any matching string must contain.
 *
 * If noomit ("no omit") is true, our parent is followed by something that
 * must match, so we can't optimize by omitting "useless" commands that
 * trail the "useful" ones. These "useless" ones are ones which will match
 * anything (i.e. even the null string) and can never change the truthfulness
 * of a match or fail... i.e. 'abc' vs 'abc(xyz)?'. In the latter, the
 * final "(xyz)?" is completely useless.
 */
static void
write_regex_buffer(const struct intermediate *inter,
		   unsigned char **bb,
		   int noomit,
		   unsigned char **musthave)
{
    assert(inter != 0);

    while (inter)
    {
	/*
	 * We can't do any omitting if any of:
	 *   - our parent told us not to (via noomit).
	 *   - the flag isn't set to allow us to.
	 *   - there is a "next" sibling and it must match
	 */
	int must_not_omit = noomit || !(regcomp_flags & REGCOMP_JUST_MATCH) ||
	    (inter->next && inter->next->must_match);

	/*
	 * But if we *can* omit, and this doesn't need to match, we
	 * can just output a REGEX_MATCH.  Returning here also drops every
	 * remaining sibling in the list.
	 */
	if (!must_not_omit && !inter->must_match)
	{
	    *outbuf++ = make_type_byte(this_regex_match);
	    return;
	}

	switch(inter->type)
	{
	  default:
	    outputf("<<unknown type %d at line %d>>\n", inter->type, __LINE__);
	    break;

	  case UNUSED:
	    break;

	  case REGEX_MATCH:
	  case REGEX_MATCH_NOTE_WORD:
	  case WORD_BOUNDARY:
	  case END_OF_LINE:
	  case START_OF_LINE:
	    /* pretty simple stuff... */
	    *outbuf++ = make_type_byte(inter->type);
	    break;

	  case ANY_CHAR:
	  case ANY_ASCII:
	  case ANY_MULTIBYTE:
	  case ANY_KATAKANA:
	  case ANY_NONKATAKANA:
	  case ANY_HIRAGANA:
	  case ANY_NONHIRAGANA:
	  case ANY_KANJI:
	  case ANY_NONKANJI:
	    /* one byte: the type and the repeat count packed together */
	    *outbuf++ = make_type_byte_with_count(inter->type, inter->count);
	    break;

	  case OPEN_PAREN:
	  {
	    /* must treat each COUNT differently */
	    /*
	     * PAREN_INFO(stuff) emits STUFF only when this group's extent
	     * is to be recorded: it must be a "real" group and the caller
	     * must have asked for REGCOMP_SAVE_MATCHED_PAREN_INFO.
	     */
            #ifdef NO_PAREN_INFO
              #define PAREN_INFO(stuff) { /* nothing */ }
	    #else
	      int real = inter->u.paren_info.real &&
		         (regcomp_flags & REGCOMP_SAVE_MATCHED_PAREN_INFO);
	      #define PAREN_INFO(stuff) if (real) { stuff; }
	    #endif

	    switch (inter->count)
	    {
	      case ONCE:
		PAREN_INFO(
		    *outbuf++ = make_type_byte(SAVE_OPEN_PAREN);
		    *outbuf++ = inter->u.paren_info.level;
		)

		/* matched exactly once, so its required chars are ours too */
		write_regex_buffer(inter->u.paren_info.paren,
				   &outbuf, must_not_omit, musthave);

		PAREN_INFO(
		    *outbuf++ = make_type_byte(SAVE_CLOSE_PAREN);
		    *outbuf++ = inter->u.paren_info.level;
		)
		break;

	      case ANY_NUMBER:
		{
		    /*
		     * Any number means that zero is also ok, so first do a
		     * push so that it can backtrack to skipping this part
		     * of the regex entirely.  Then put the regex, and a
		     * pushjump back to the regex to try again.
		     */
		    unsigned char *push_loc = outbuf;
		    unsigned char *mark1_loc;
		    unsigned char *jump_loc;

		    *outbuf++ = make_type_byte(PUSH);
		    outbuf += SHORT_VAL_SIZE;

		    /* loop target: where each repetition restarts */
		    mark1_loc = outbuf;

		    PAREN_INFO(
			mark1_loc = outbuf;
			*outbuf++ = make_type_byte(SAVE_OPEN_PAREN);
			*outbuf++ = inter->u.paren_info.level;
		    )

		    /*
		     * write the group regex; musthave is passed as 0 since
		     * the group may match zero times, so nothing inside it
		     * is required.
		     */
		    write_regex_buffer(inter->u.paren_info.paren, &outbuf,
				       must_not_omit, 0);

                    #ifndef NO_PAREN_INFO
		    if (real)
		    {
			/*
			 * l: number of paren levels from this group through
			 *    its deepest internal one.
			 * c: minimum match length of the group's contents.
			 * The largest l/c ratio seen (and its l and c) is
			 * remembered in max_lpc / max_lpc_l / max_lpc_c.
			 */
			unsigned l = inter->u.paren_info.max_internal_level -
			             inter->u.paren_info.level + 1;
			unsigned c = inter->u.paren_info.paren->min_match_len;
			double lpc = l / (double)c;
			jump_loc = outbuf;
			*outbuf++ = make_type_byte(SAVE_CLOSE_PAREN_PUSH_JUMP);
			*outbuf++ = l;
			*outbuf++ = inter->u.paren_info.level;

			if (lpc > max_lpc)
			{
			    max_lpc = lpc;
			    max_lpc_l = l;
			    max_lpc_c = c;
			}
		    }
		    else
		    #endif
		    {
		        jump_loc = outbuf;
			*outbuf++ = make_type_byte(PUSH_JUMP);
		    }
		    /*
		     * backward (negative) offset, measured from the type
		     * byte of the jump instruction to the loop target
		     */
		    write_short_val(outbuf, mark1_loc - jump_loc);
		    outbuf += SHORT_VAL_SIZE;

		    /* fill in the offset of the PUSH */
		    write_short_val(&push_loc[TYPE_BYTE_SIZE],
				    outbuf - push_loc);
		    break;
		}
		
	      case ZERO_OR_ONE:
		{
		    /*
		     * Since zero is OK, start off with a push past
		     * the regex, then the regex.
		     */
		    unsigned char *marker = outbuf;


                    #ifndef NO_PAREN_INFO
		    if (real)
		    {
			/* combined opcode: type, short offset, paren level */
			*outbuf = make_type_byte(PUSH_SAVE_OPEN_PAREN);
			outbuf += TYPE_BYTE_SIZE + SHORT_VAL_SIZE;
			*outbuf++ = inter->u.paren_info.level;
		    } else
                    #endif
		    {
			/* write the PUSH and remember where it is */
			*outbuf = make_type_byte(PUSH);
			outbuf += TYPE_BYTE_SIZE + SHORT_VAL_SIZE;
		    }

		    /* write the group regex (optional, so no musthave) */
		    write_regex_buffer(inter->u.paren_info.paren, &outbuf,
				       must_not_omit, 0);

		    PAREN_INFO(
			*outbuf++ = make_type_byte(SAVE_CLOSE_PAREN);
			*outbuf++ = inter->u.paren_info.level;
		    )

		    /* fill in the offset of the PUSH */
		    write_short_val(&marker[TYPE_BYTE_SIZE], outbuf - marker);
		    break;
		}
		
	      case ONE_OR_MORE:
		{
		    /*
		     * After doing the regex once, do a push to indicate
		     * that the once has been achieved, then jump to try
		     * again. This is optimized as a PUSH_JUMP
		     */
		    unsigned char *marker = outbuf;
		    unsigned char *base;

		    PAREN_INFO(
			*outbuf++ = make_type_byte(SAVE_OPEN_PAREN);
			*outbuf++ = inter->u.paren_info.level;
		    )

		    /*
		     * write the group regex; it matches at least once, so
		     * its required characters are passed on via musthave
		     */
		    write_regex_buffer(inter->u.paren_info.paren, &outbuf,
				       must_not_omit, musthave);
		    
		    /* base: type byte of the closing jump instruction */
		    base = outbuf;
                    #ifndef NO_PAREN_INFO
		    if (real)
		    {
			/* same l/c bookkeeping as in the ANY_NUMBER case */
			unsigned l = inter->u.paren_info.max_internal_level -
			             inter->u.paren_info.level + 1;
			unsigned c = inter->u.paren_info.paren->min_match_len;
			double lpc = l/(double)c;
			*outbuf++ = make_type_byte(SAVE_CLOSE_PAREN_PUSH_JUMP);
			*outbuf++ = l;
			*outbuf++ = inter->u.paren_info.level;

			if (lpc > max_lpc)
			{
			    max_lpc = lpc;
			    max_lpc_l = l;
			    max_lpc_c = c;
			}
		    }
		    else
		    #endif
		    {
			*outbuf++ = make_type_byte(PUSH_JUMP);
		    }

		    /* backward offset from the jump to the group's start */
		    write_short_val(outbuf, marker - base);
		    outbuf += SHORT_VAL_SIZE;
		    break;
		}
	    }
	    break;
	  }

	  case CLASS:
	    /* just call our helper routine */
	    write_regex_buffer_for_class(inter, &outbuf);
	    break;

	  case EXACT1:
	  case EXACT_ICASE_ASCII:
	    *outbuf++ = make_type_byte_with_count(inter->type, inter->count);
	    *outbuf++ = inter->u.exact[0];
	    /* a char is only required if its count doesn't permit zero */
	    if (musthave && !(inter->count & ZERO_OK))
		*(*musthave)++ = inter->u.exact[0];
	    break;

	  case EXACT2:
	  case EXACT_ICASE_KANA:
	    *outbuf++ = make_type_byte_with_count(inter->type, inter->count);
	    *outbuf++ = inter->u.exact[0];
	    *outbuf++ = inter->u.exact[1];
	    if (musthave && !(inter->count & ZERO_OK)) {
		*(*musthave)++ = inter->u.exact[0];
		*(*musthave)++ = inter->u.exact[1];
	    }
	    break;

	  case EXACT3:
	    *outbuf++ = make_type_byte_with_count(inter->type, inter->count);
	    *outbuf++ = inter->u.exact[0];
	    *outbuf++ = inter->u.exact[1];
	    *outbuf++ = inter->u.exact[2];
	    if (musthave && !(inter->count & ZERO_OK)) {
		*(*musthave)++ = inter->u.exact[0];
		*(*musthave)++ = inter->u.exact[1];
		*(*musthave)++ = inter->u.exact[2];
	    }
	    break;

	  case EITHER_EXACT_2:
	    /*
	     * Two two-byte characters, four bytes in all.  Nothing is added
	     * to musthave here (presumably because either character alone
	     * satisfies the node, so neither one is required).
	     */
	    *outbuf++ = make_type_byte_with_count(inter->type, inter->count);
	    *outbuf++ = inter->u.exact[0];
	    *outbuf++ = inter->u.exact[1];
	    *outbuf++ = inter->u.exact[2];
	    *outbuf++ = inter->u.exact[3];
	    break;

	  case ALT:
	    {
		/*
		 * Deal with a bunch (2+) of alternatives.
		 * The 'final' bit is just an optimization. All alternatives
		 * but the last end with a jump to whatever follows all the
		 * alternatives. If we determine that there's nothing
		 * important after this, we'll just skip the jump and put
		 * a REGEX_MATCH there.
		 */
		write_regex_buffer_for_alt(inter->u.alt, &outbuf,
					   must_not_omit, musthave);
	    }
	    break;

 	  case MATCH_PREV_PAREN:
	    /* type byte followed by the number of the group to re-match */
	    *outbuf++ = make_type_byte(MATCH_PREV_PAREN);
	    *outbuf++ = inter->u.prev_paren_to_match;
	    break;
	}
	inter = inter->next;
    }
}

#ifndef NDEBUG
/*
 * For debugging.
 */
static void
show_intermediate_pattern(const struct intermediate *inter, unsigned level)
{
    if (inter == 0) {
	unsigned i;
	for (i = 0; i < level; i++)
	    outchar('|');
	output(" <<NULL>>\n");
	return;
    }
    
    while (inter)
    {
	const char *countmemo;
	unsigned i;

	for (i = 0; i < level; i++)
	    outchar('0'+i);

	outputf("[%c r=%02d m=%02d] ", inter->must_match ? '!': ' ',
	       inter->regex_bytes_needed, inter->min_match_len);

	switch(inter->count)
	{
	  case ONCE:        countmemo = ""    ; 	break;
	  case ANY_NUMBER:  countmemo = " (*)"; 	break;
	  case ZERO_OR_ONE: countmemo = " (?)"; 	break;
	  case ONE_OR_MORE: countmemo = " (+)";		break;
	  default:	    countmemo = "????????????";	break;
	}

	switch(inter->type)
	{
	  default:
	    outputf("<<unknown type %d at line %d>>\n", inter->type, __LINE__);
	    break;

	  case UNUSED:         output("<nothing>\n"); break;
	  case REGEX_MATCH:    output("regex match\n"); break;
	  case REGEX_MATCH_NOTE_WORD:
	    output("regex match (note word)\n"); break;
	  case ANY_CHAR:       outputf("any char%s\n",        countmemo);break;
	  case ANY_ASCII:      outputf("any ASCII%s\n",       countmemo);break;
	  case ANY_MULTIBYTE:  outputf("any multibyte%s\n",   countmemo);break;
	  case ANY_KATAKANA:   outputf("any katakana%s\n",    countmemo);break;
	  case ANY_NONKATAKANA:outputf("any non-katakana%s\n",countmemo);break;
	  case ANY_HIRAGANA:   outputf("any hiragana%s\n",    countmemo);break;
	  case ANY_NONHIRAGANA:outputf("any non-hiragana%s\n",countmemo);break;
	  case ANY_KANJI:      outputf("any kanji%s\n",       countmemo);break;
	  case ANY_NONKANJI:   outputf("any non-kanji%s\n",   countmemo);break;
	  case WORD_BOUNDARY:  output("word boundary\n");                break;
	  case END_OF_LINE:    output("end of line\n");                  break;
	  case START_OF_LINE:  output("start of line\n");                break;

	  case OPEN_PAREN:
	    #ifndef NO_PAREN_INFO
	    outputf("group [level %u - %u]%s\n", inter->u.paren_info.level,
		   inter->u.paren_info.max_internal_level, countmemo);
	    #else
	    outputf("group %s\n", countmemo);
	    #endif
	    show_intermediate_pattern(inter->u.paren_info.paren, level+1);
	    break;

        #ifndef NO_PAREN_INFO
	  case MATCH_PREV_PAREN:
	    outputf("match previous paren group %d\n",
		   inter->u.prev_paren_to_match);
	    break;
        #endif

	  case CLASS:
	    {
		if (inter->u.class->inverted)
		    output("inverted ");
		outputf("class%s", countmemo);

		for (i = 0; i < 128; i++)
		{
		    if (inter->u.class->ascii[i])
			if (isprint(i))
			    outchar(i);
			else
			    outputf("\\%03o", i);
		}

		for (i = 0; i < inter->u.class->euc2.buf_used; i++)
		    outchar(inter->u.class->euc2.buf[i]);

		for (i = 0; i < inter->u.class->euc3.buf_used; i++)
		    outchar(inter->u.class->euc3.buf[i]);
		output("\n");
	    }
	    break;

	  case EXACT1:
	    outputf("exact1%c%s\n", inter->u.exact[0], countmemo);
	    break;

	  case EXACT_ICASE_ASCII:
	    outputf("ignore-case ascii%c%s\n",inter->u.exact[0],countmemo);
	    break;

	  case EXACT2:
	    outputf("exact2%c%c%s\n",
		    inter->u.exact[0], inter->u.exact[1], countmemo);
	    break;

	  case EITHER_EXACT_2:
	    outputf("exact2%c%c*or*%c%c%s\n",
		    inter->u.exact[0], inter->u.exact[1],
		    inter->u.exact[2], inter->u.exact[3], countmemo);
	    break;

	  case EXACT_ICASE_KANA:
	    outputf("kana (ignore case)%c%c%s\n",
		    inter->u.exact[0], inter->u.exact[1], countmemo);
	    break;

	  case EXACT3:
	    outputf("exact3%c%c%c%s\n", inter->u.exact[0],
		   inter->u.exact[1], inter->u.exact[2], countmemo);
	    break;


	  case ALT:
	    {
		struct intermediate *ptr = inter->u.alt;
		output("start of alternatives\n");
		for (;;)
		{
		    show_intermediate_pattern(ptr, level + 1);
		    for (i = 0; i < level; i++)
			outchar('|');
		    if (ptr->nextalt == 0)
			break;
		    output("---------------------\n");
		    ptr = ptr->nextalt;
		}
		output("end of alternatives\n");
		break;
	    }
	}
	inter = inter->next;
    }
}
#endif

/*
 * Wow, this is it.
 * Given a null-terminated pattern, a REGEX_T to fill, and some flags,
 * return one of the REGCOMP_* return values (i.e. REGCOMP_SUCCESS)
 *
 *   r       - regex to fill in; it is zeroed before anything else is done.
 *   pattern - null-terminated pattern text.
 *   flags   - REGCOMP_* option bits; saved in regcomp_flags for the helpers.
 *
 * A null R or PATTERN yields REGCOMP_INVALID_DATA and an empty pattern
 * REGCOMP_EMPTY_PATTERN; both are also stored in regcomp_error.  On
 * success r->buf .. r->bufend hold the compiled program and, when
 * REGCOMP_CALC_MUSTHAVE was given, r->musthave holds a null-terminated
 * list of the characters a matching line must contain.  Both buffers come
 * from xmalloc().
 */
int
regcomp(regex_t *r, const unsigned char *pattern, unsigned flags)
{
    const struct intermediate *compiled;
    const unsigned char *orig_pattern = pattern;
    int retval;

    /* remembered so regcomp_error_report() can quote the pattern later */
    regcomp_last_pattern = pattern;

    if (pattern == 0 || r == 0)
	return regcomp_error = REGCOMP_INVALID_DATA;

    bzero(r, sizeof(*r));

    regex_init();           /* Make sure this has been done.   */
    regcomp_flags = flags;  /* So everyone else can know, too. */
    regcomp_error = 0;      /* No false alarms, please.        */

   #ifndef NO_PAREN_INFO
    /* reset the per-compile paren bookkeeping */
    paren_level = paren_levels_finished = 0;
    highest_prev_to_match = -1;
    max_lpc = max_lpc_c = max_lpc_l = 0;
   #endif

    /* which flavour of "match" opcode the writers should emit */
    this_regex_match = (flags & REGCOMP_WANT_WORD_MATCH_INFO)
	                         ? REGEX_MATCH_NOTE_WORD
				 : REGEX_MATCH;

    DEBUGSTUFF(if (flags & REGCOMP_DEBUG)
	          outputf("FLAGS %x PATTERN %s\n", flags, pattern);)

    if (pattern[0] == '\0')
	return regcomp_error = REGCOMP_EMPTY_PATTERN;

    compiled = nibble_list(&pattern, 0);    /* process that baby */

    if (regcomp_error)
	retval = regcomp_error;
    else if (compiled == 0)
	retval = REGCOMP_INTERNAL_ERROR;
    else
    {
	/*
	 * It compiled well into the intermediate form. Now output to
	 * the final compiled form.
	 */
	unsigned char *buffer = xmalloc(compiled->regex_bytes_needed);
	/*
	 * musthave is only initialized when REGCOMP_CALC_MUSTHAVE is set;
	 * every later read of it is guarded by a test of r->musthave.
	 */
	unsigned char *musthave, **mh_ptr = 0;

	/*
	 * Simple optimization: if the regex (or every top-level alternative)
	 * begins with START_OF_LINE, then set r->anchor_to_start_of_line.
	 */
	if (compiled->type == START_OF_LINE)
	    r->anchor_to_start_of_line = 1;
	else if (compiled->type == ALT) 
	{
	    const struct intermediate *ptr = compiled->u.alt;
	    r->anchor_to_start_of_line = 1;
	    while (ptr) {
		if (ptr->type != START_OF_LINE) {
		    r->anchor_to_start_of_line = 0;
		    break;
		}
		ptr = ptr->nextalt;
	    }
	}

	DEBUGSTUFF(if (flags & REGCOMP_DEBUG)
	               show_intermediate_pattern(compiled, 1);)

	r->fold_acase = (flags & REGCOMP_IGNORE_ALPHA_CASE) ? 1 : 0;
	r->fold_kcase = (flags & REGCOMP_IGNORE_KANA_CASE)  ? 1 : 0;

	if ((flags & REGCOMP_CALC_MUSTHAVE) == 0)
	    r->musthave = 0;
	else {
	    /* sized from the pattern text: one byte per pattern byte + null */
	    r->musthave = xmalloc((unsigned)strlen((void*)orig_pattern)+1);
	    musthave = r->musthave;
	    mh_ptr = &musthave;
	}

	/*
	 * write_regex_buffer() advances both cursors it is handed:
	 * buffer ends up just past the compiled program, and musthave
	 * (via mh_ptr) just past the collected must-have bytes.
	 */
	r->buf = buffer;
	write_regex_buffer(compiled, &buffer, /*noomit*/0, mh_ptr);
	r->bufend = buffer;
	r->min_length_match = compiled->min_match_len;

       #ifndef NO_PAREN_INFO
	r->max_paren_level = paren_level;
	r->max_lpc_l = max_lpc_l;
	r->max_lpc_c = max_lpc_c;
	r->paren_info_required = highest_prev_to_match + 1;
	assert(r->paren_info_required == 0 ||
	       (flags & REGCOMP_SAVE_MATCHED_PAREN_INFO));
       #endif

	/*
	 * squish out repeats and empties from the musthave list.
	 * Skipped when nothing at all was collected (cursor never moved).
	 * The first character is always kept; each later one is copied
	 * down to DEST only if it isn't already in [start, dest).
	 */
	if (r->musthave && r->musthave != musthave)
	{
	    unsigned char *start = r->musthave;
	    unsigned char *end = musthave; /* changed in write_regex_buffer */
	    unsigned char *ptr   = start + EUC_CHAR_LENGTH(start[0]);
	    unsigned char *dest  = ptr;

	    while (ptr < end) switch(EUC_CHAR_LENGTH(ptr[0]))
	    {
	      default:
		/* unexpected width: complain, then step over at least 1 byte */
		warn("[internal error %s:%d; width=%d, first byte=\\%03o]\n",
		     __FILE__, __LINE__, EUC_CHAR_LENGTH(ptr[0]), ptr[0]);
		ptr += EUC_CHAR_LENGTH(ptr[0]) ? EUC_CHAR_LENGTH(ptr[0]) : 1;
		break;

	      case 1:
		if (!onebyte_char_in_string(ptr[0], start, dest))
		    *dest++ = ptr[0];
		ptr += 1;
		break;
		
	      case 2:
		if (!twobyte_char_in_string(ptr[0], ptr[1], start, dest))
		{
		    *dest++ = ptr[0];
		    *dest++ = ptr[1];
		}
		ptr += 2;
		break;
		
	      case 3:
		if (!threebyte_char_in_string(ptr[0], ptr[1], ptr[2],
					      start, dest))
		{
		    *dest++ = ptr[0];
		    *dest++ = ptr[1];
		    *dest++ = ptr[2];
		}
		ptr += 3;
		break;
	    }
	    musthave = dest;
	}
	if (r->musthave)
	    *musthave = 0; /* cap off end of list */
	retval = REGCOMP_SUCCESS;
    }

    DEBUGSTUFF(if (flags & REGCOMP_DEBUG)
		   output((const char *)regcomp_error_report());)

    regex_free_temp_memory(); /* make sure to free our temp'ly-used memory */
    return retval;
}

/*
 * Short descriptions of the regcomp() result codes.
 * regcomp_error_report() indexes this table directly with regcomp_error,
 * so the entries must stay in the same order as the numeric values of
 * the REGCOMP_* codes (presumably declared in jregex.h -- keep the two
 * in step when adding a code).
 */
const char *regcomp_errstr[] =
{
    "success",
    "internal error",
    "invalid data",
    "empty pattern",
    "unmatched [",
    "unmatched open paren",
    "unmatched close paren",
    "misused +,*, or ?",
    "object of + could be empty",
    "object of * could be empty",
    "empty class",
    "misused |",
    "nonexistent paren'ed expression",
    "need SAVE_PAREN_INFO with this pattern",
    "Japanese character in class range",
    "invalid octal value",
    "corrupted text"
};

/*
 * Build a human-readable description of the outcome of the most recent
 * regcomp() call, based on regcomp_error, regcomp_last_pattern and (for
 * pattern errors) regcomp_eptr.
 *
 * Returns a pointer to the message.  Except for the empty-pattern case,
 * which returns a string constant, the text lives in a buffer owned by
 * this routine: it is released and replaced on the next call, so the
 * caller must neither free it nor keep it across calls.
 *
 * A null regcomp_last_pattern (no regcomp() call yet, or one made with a
 * null pattern) is reported as an empty pattern rather than being handed
 * to a %s conversion.
 */
const unsigned char *regcomp_error_report(void)
{
    static unsigned char *report = 0;
    const char *pat = regcomp_last_pattern
	                  ? (const char *)regcomp_last_pattern : "";
    int pat_len = (int)strlen(pat);
    const char *str;

    /* drop the message built by the previous call, if any */
    if (report)
    {
	free((void*)report);
	report = 0;
    }

    switch(regcomp_error)
    {
      default:
	/* "+ 10" leaves room for the digits of the error code */
	str = "regcomp%sreturns error code %d.\n";
	report = xmalloc(strlen(str) + pat_len + 10 + 1);
	sprintf((char *)report, str, pat, regcomp_error);
	return report;

      case REGCOMP_SUCCESS:
	str = "regcomp%sreturns success\n";
	report = xmalloc(strlen(str) + pat_len + 1);
	sprintf((char *)report, str, pat);
	return report;

      case REGCOMP_INVALID_DATA:
	/*
	 * Show the (possibly null) pattern pointer itself.  It is converted
	 * to an integer type that matches the conversion specifier; room is
	 * reserved for two hex digits per byte of that type.
	 */
	str = "regcomp (pat=0x%08lx) barfs on bad data\n";
	report = xmalloc(strlen(str) + 2 * sizeof(unsigned long) + 1);
	sprintf((char *)report, str, (unsigned long)regcomp_last_pattern);
	return report;

      case REGCOMP_EMPTY_PATTERN:
	return (const unsigned char *)"regcomp barfs on empty pattern\n";

      case REGCOMP_UNMATCHED_OPEN_PAREN:
      case REGCOMP_UNMATCHED_CLOSE_PAREN:
      case REGCOMP_MISUSED_COUNT_SPEC:
      case REGCOMP_UNCLOSED_CLASS:
      case REGCOMP_EUC_IN_CLASS_RANGE:
      case REGCOMP_INTERNAL_ERROR:
      case REGCOMP_INFINITE_PLUS:
      case REGCOMP_INFINITE_STAR:
      case REGCOMP_PAREN_LEVEL_INVALID:
      case REGCOMP_NEED_SAVE_PAREN_INFO:
      case REGCOMP_CORRUPTED_TEXT:
      case REGCOMP_BAD_BRACE:
      case REGCOMP_INVALID_OCTAL_VALUE:
      case REGCOMP_EMPTY_CLASS:
	/*
	 * Message, pattern, and (when the error position lies within the
	 * pattern) a marker line pointing at it.  The allocation covers
	 * the formatted text plus the marker: a fixed lead-in, up to
	 * pat_len dashes, and the caret.
	 */
	str = "regcomp error: %s\n"
	    " pattern%s\n";
	report = xmalloc(strlen(str) +
			 strlen(regcomp_errstr[regcomp_error])
			 + pat_len + 20 + pat_len + 1);
	sprintf((char *)report, str, regcomp_errstr[regcomp_error], pat);
	if (regcomp_eptr - regcomp_last_pattern <= pat_len)
	{
	    int i = regcomp_eptr - regcomp_last_pattern;
	    char *tail = (char *)report + strlen((char *)report);

	    /* append in place rather than rescanning with strcat each time */
	    strcpy(tail, "  before --");
	    tail += strlen(tail);
	    while (i-- > 0)
		*tail++ = '-';
	    strcpy(tail, "^\n");
	}
	return report;
    }
}

/*
 * Release the memory regcomp() attached to R: the compiled program
 * (r->buf) and the must-have list (r->musthave).  R itself is left
 * alone, since it was not allocated here.  Both pointers are cleared
 * afterwards, so calling this again on the same regex -- or passing a
 * null R -- is harmless.
 */
void regfree(regex_t *r)
{
    if (r == 0)
	return;

    /* free() ignores null pointers, so no per-field test is needed */
    free(r->buf);
    free(r->musthave);

    r->musthave = 0;
    r->buf = 0;
}

/*
 * Return R's must-have list: the null-terminated string regcomp() builds
 * when given REGCOMP_CALC_MUSTHAVE, or 0 when no list was requested.
 * A null R also yields 0, in keeping with regfree()'s tolerance of null.
 * The string still belongs to R and goes away with regfree().
 */
const unsigned char *regmusthave(const regex_t *r)
{
    return r ? r->musthave : 0;
}

#ifndef NO_SHOWREGEX
/*
 * Another debug routine... this time prints a real compiled pattern.
 */
static void
showbuf(const unsigned char *origb,
	const unsigned char *b,
	const unsigned char *b_end,
	const char *margin)
{
    while (b < b_end)
    {
	enum TYPE type = get_type_from_type_byte(*b);
	unsigned char count = get_count_from_type_byte(*b);

	if (margin)
	    output(margin);
	outputf("  %3ld ", b - origb);
	b += TYPE_BYTE_SIZE;

	switch (type)
	{
	  default:
	    outputf("unknown code [%x|%x]", type, count);
	    break;

	  case REGEX_MATCH_NOTE_WORD:
	    output("match (note word)");
	    break;

	  case REGEX_MATCH:
	    output("match");
	    break;

	  case EXACT_ICASE_ASCII:
	    outputf("exact ascii (ignore case) [%c] ", *b);
	    b += ASCII_SPEC_SIZE;
	    break;

	  case EXACT1:
	    outputf("exact1%c", *b);
	    b += ASCII_SPEC_SIZE;
	    break;

	  case ANY_CHAR: output("ANY CHAR "); break;
	  case ANY_ASCII: output("ANY ASCII"); break;
	  case ANY_MULTIBYTE: output("ANY MULTIBYTE"); break;
	  case ANY_KATAKANA: output("ANY KATAKANA"); break;
	  case ANY_NONKATAKANA: output("ANY NON-KATAKANA"); break;
	  case ANY_HIRAGANA: output("ANY HIRAGANA"); break;
	  case ANY_NONHIRAGANA: output("ANY NON-HIRAGANA"); break;
	  case ANY_KANJI: output("ANY KANJI"); break;
	  case ANY_NONKANJI: output("ANY NON-KANJI"); break;

	  case EXACT_ICASE_KANA:
	    outputf("exact kana (ignore case) %c%c", b[0], b[1]);
	    b += EUC_SPEC_SIZE;
	    break;

	  case EXACT2:
	    outputf("exact2 %c%c", b[0], b[1]);
	    b += EUC_SPEC_SIZE;
	    break;

	  case EITHER_EXACT_2:
	    outputf("exact2 %c%c*or*%c%c", b[0], b[1], b[2], b[3]);
	    b += 2 * EUC_SPEC_SIZE;
	    break;

	  case EXACT3:
	    outputf("exact3 %c%c%c", b[0], b[1], b[3]);
	    b += 3;
	    break;

	  case CLASS:
	    {
		int i;
		int inverted = b[0];
		unsigned short euc2_len = read_short_val(b+1);
		unsigned short euc3_len = read_short_val(b+3);

		outputf("%sclass", inverted ? "inverted " : "");

		b += 5; /* skip above stuff */
		for (i = 0; i < 128; i++)
		{
		    if (b[i])
		    {
			if (isprint(i))
			    outchar(i);
			else
			    outputf("\\%03o", i);
		    }
		}
		b += 128;

		while (euc2_len-- != 0)
		    outchar(*b++);
		while (euc3_len-- != 0)
		    outchar(*b++);
		output("");
	    }
	    break;
	    
	  case WORD_BOUNDARY: output("word boundary"); break;
	  case START_OF_LINE: output("start of line"); break;
	  case END_OF_LINE:   output("end of line");   break;

	  case PUSH:
	    outputf("push %ld", (b - TYPE_BYTE_SIZE) - origb +
		   read_short_val(b));
	    b += SHORT_VAL_SIZE;
	    break;

	  case JUMP:
	    outputf("jump to %ld", (b - TYPE_BYTE_SIZE) - origb +
		   read_short_val(b));
	    b += SHORT_VAL_SIZE;
	    break;

	  case PUSH_JUMP:
	    outputf("pushjump to %ld", (b - TYPE_BYTE_SIZE) - origb +
		   read_short_val(b));
	    b += SHORT_VAL_SIZE;
	    break;

      #ifndef NO_PAREN_INFO
	  case SAVE_CLOSE_PAREN_PUSH_JUMP:
	    outputf("save close paren %d, count of %d, pushjump to %ld",
		   b[1], b[0],
		   (b - TYPE_BYTE_SIZE) - origb + read_short_val(b+2));
	    b += PAREN_COUNT_SIZE + PAREN_SPEC_SIZE + SHORT_VAL_SIZE;
	    break;

	  case PUSH_SAVE_OPEN_PAREN:
	    outputf("push to %ld, save open paren %d",
		   (b - TYPE_BYTE_SIZE) - origb + read_short_val(b),
		   b[SHORT_VAL_SIZE]);
	    b += SHORT_VAL_SIZE + PAREN_SPEC_SIZE;
	    break;

	  case SAVE_OPEN_PAREN:
	    outputf("save open paren %d", b[0]);
	    b += PAREN_SPEC_SIZE;
	    break;

	  case SAVE_CLOSE_PAREN:
	    outputf("save close paren %d", b[0]);
	    b += PAREN_SPEC_SIZE;
	    break;

	  case MATCH_PREV_PAREN:
	    outputf("match paren group %d", b[0]);
	    b += PAREN_SPEC_SIZE;
	    break;
      #endif
	}

	if (type < count_ok_limit) switch (count)
	{
	  default: outputf("<<count %x>>", b[-1]); break;
	  case ONCE:        output("  "); break;
	  case ZERO_OR_ONE: output(" ?"); break;
	  case ONE_OR_MORE: output(" +"); break;
	  case ANY_NUMBER:  output(" *"); break;
	}
	outchar('\n');
    }
}

/*
 * Debug dump of a compiled regex: first the summary fields regcomp()
 * filled in (minimum match length, start-of-line anchoring, must-have
 * characters and, unless NO_PAREN_INFO, the paren statistics), then a
 * disassembly of the compiled program itself.
 */
void
showregex(const regex_t *r)
{
    outputf("Minimum length match: %d\n", r->min_length_match);

    if (r->anchor_to_start_of_line) {
	output("START OF LINE ONLY\n");
    }

    if (r->musthave != 0) {
	outputf("A line must have [%s]\n", r->musthave);
    }

#ifndef NO_PAREN_INFO
    outputf("Max paren level %d, max lpc is %d/%d.\n",
	   r->max_paren_level, r->max_lpc_l, r->max_lpc_c);

    if (r->paren_info_required) {
	outputf("requires paren info for %d parens\n", r->paren_info_required);
    }
#endif

    /* offsets in the listing are relative to the start of the program */
    showbuf(r->buf, r->buf, r->bufend, "");
}
#endif


/* compile routines above */

/****************************************************************************/
/****************************************************************************/
/****************************************************************************/

/* execute routines below */


#ifndef FAST_REGEXEC
  /* Only present in non-FAST_REGEXEC builds; purpose not evident here. */
  unsigned int special_debug;
#endif

/* Option bits (REGEXEC_*) consulted by regexec(); none set by default. */
static unsigned int regexec_flags = 0;

/*
 * Install FLAGS as the current regexec option bits and hand back the
 * bits that were in effect beforehand, so a caller can restore them.
 */
unsigned int regexec_setflags(unsigned flags)
{
    const unsigned int previous = regexec_flags;

    regexec_flags = flags;
    return previous;
}

#ifndef NO_REGEXEC_MATCH_POINTS
/*
 * Globals describing the most recent match: where the matched text
 * starts and ends, and whether those ends fall on word boundaries.
 * NOTE(review): presumably filled in by regexec() on a successful match
 * -- the assignments are outside this part of the file; confirm there.
 */
const unsigned char *regexec_match_start;
const unsigned char *regexec_match_end;
int regexec_match_at_start_of_word, regexec_match_at_end_of_word;
#endif

#ifndef NO_PAREN_INFO
  /*
   * Storage for per-paren-group match results.
   *   regexec_paren_info      - array of matched_paren_t entries (each
   *                             has match_start / match_end pointers).
   *   regexec_paren_info_size - number of entries in that array.
   *   regexec_paren_info_used - NOTE(review): presumably how many entries
   *                             the last regexec() filled in; confirm.
   * Unless NO_DEFAULT_PAREN_INFO is defined, a built-in array of
   * DEFAULT_PAREN_INFO_SIZE entries (10 unless overridden) is supplied;
   * otherwise the pointer starts out null with size 0.
   */
  unsigned regexec_paren_info_used = 0;
  #ifndef NO_DEFAULT_PAREN_INFO
     #ifndef DEFAULT_PAREN_INFO_SIZE
     #  define DEFAULT_PAREN_INFO_SIZE 10
     #endif
     matched_paren_t default_regexec_paren_info[DEFAULT_PAREN_INFO_SIZE];
     matched_paren_t *regexec_paren_info = &default_regexec_paren_info[0];
     unsigned regexec_paren_info_size = DEFAULT_PAREN_INFO_SIZE;
  #else
     matched_paren_t *regexec_paren_info = 0;
     unsigned regexec_paren_info_size = 0;
  #endif

  #ifndef FAST_REGEXEC
  /*
   * Debug helper: print the current state of regexec_paren_info[num],
   * tagged with MESSAGE (a null MESSAGE is treated as "").  Reports an
   * index at or beyond regexec_paren_info_size as out of range, an entry
   * with either end unset as undefined, and otherwise the matched text.
   */
  static void report_new_pareninfo(const char *message, unsigned num)
  {
      const char *note = message ? message : "";

      if (num >= regexec_paren_info_size)
      {
	  outputf("paren_info[%d] %s OUT OF RANGE (%d)\n",
		 num, note, regexec_paren_info_size);
	  return;
      }

      if (regexec_paren_info[num].match_start == 0 ||
	  regexec_paren_info[num].match_end == 0)
      {
	  outputf("paren_info[%d] %s <undefined>\n", num, note);
	  return;
      }

      /* both ends known: show exactly the bytes between them */
      outputf("paren_info[%d] %s, now [%.*s]\n", num, note,
	     (int)(regexec_paren_info[num].match_end -
		   regexec_paren_info[num].match_start),
	     regexec_paren_info[num].match_start);
  }
  #endif /* FAST_REGEXEC */
#endif /* NO_PAREN_INFO */


/* XXX - need to deal here with three-byte EUC */
#ifdef DONT_WORRY_ABOUT_KATAKANA_DASH_BEING_PART_OF_A_WORD
#define word_boundary(line) (jregex_part_of_word[(line)[-1]] != \
			     jregex_part_of_word[(line)[0]])
#else
/*
 * Return nonzero if there is a word boundary between the byte before
 * LINE and the byte at LINE, judged by whether the two sides differ in
 * jregex_part_of_word[].  Reads line[-1] (and line[-2] when line[-1] has
 * its high bit set), so LINE must not be at the very start of the text.
 *
 * Special case: a katakana dash (KATA_DASH_HI_BYTE, KATA_DASH_LO_BYTE)
 * immediately before LINE is treated as joined to what follows when the
 * byte at LINE is EUC_KATAKANA or KATA_DASH_HI_BYTE -- no boundary.
 */
static __inline__ int word_boundary(const unsigned char *line)
{
    unsigned cur_byte = line[0];
    unsigned prev_byte = line[-1];

    if ((prev_byte & 0x80) != 0)
    {
	/*
	 * High bit set: line[-1] is taken as the second byte of a
	 * two-byte character, whose lead byte at line[-2] is the one
	 * to classify.
	 */
	unsigned char prev_low = prev_byte;
	prev_byte = line[-2];

	if (prev_byte == KATA_DASH_HI_BYTE && prev_low == KATA_DASH_LO_BYTE)
	{
	    if (cur_byte == EUC_KATAKANA || cur_byte == KATA_DASH_HI_BYTE)
		return 0;
	}
    }

    return jregex_part_of_word[prev_byte] != jregex_part_of_word[cur_byte];
}
#endif


/*
 * Execute the given compiled REGEX_T on the given LINE (of the given LENGTH)
 * and return true if it matches, false otherwise.
 *
 *   r     the compiled pattern to run.
 *   line  the text to search.
 *   len   number of bytes of LINE to consider.
 *
 * Returns 1 on a match and 0 on failure.  0 is also returned when the
 * pattern needs more paren-info slots than regexec_paren_info_size provides
 * (r->paren_info_required), or when an unknown opcode is found in the
 * compiled pattern (after reporting it through outputf).
 *
 * Matching is attempted from the start of LINE and, on failure, restarted
 * one character further along until no viable start position remains.
 * When REGEXEC_LLM is set in regexec_flags, a successful match does not
 * return at once: the remaining backtrack states for that start position
 * are tried as well, and only longer matches replace the recorded one.
 *
 * Side effects (each subject to its NO_... compile-time switch): sets
 * regexec_match_start / regexec_match_end, the
 * regexec_match_at_{start,end}_of_word flags, regexec_paren_info[] and
 * regexec_paren_info_used.
 *
 * The backtracking stacks live in function-static, grow-only buffers, so
 * the function keeps state between calls and is not reentrant.
 */
unsigned int
regexec(const regex_t *r, const unsigned char *line, unsigned len)
{
    const unsigned char *b = r->buf;               /* compiled pattern */
    const unsigned char *orig_line = line;         /* just to save */
    const unsigned char *end_of_line = &line[len]; /* note end of line */
    #define bol(line) ((line) == orig_line)
    #define eol(line) ((line) >= end_of_line)

  #ifndef NO_LLM_SUPPORT
    /* llm is true if doing a longest leftmost match */
    const unsigned char llm = (regexec_flags & REGEXEC_LLM) ? 1 : 0;
    const unsigned char *longest_match_so_far = 0; /* for when llm is true */
  #endif

    /*
     * True if LINE sits on a word boundary.  Exactly one test is applied:
     * at the start of the text only the first character is examined, at
     * the end only the last one, and word_boundary() is consulted only for
     * interior positions -- it reads (line)[-1] and (line)[0], neither of
     * which may be touched outside [orig_line, end_of_line).
     */
    #define is_word_boundary(line)                                           \
    (                                                                        \
       (orig_line != end_of_line) &&                                         \
       (                                                                     \
          bol(line) ? jregex_part_of_word[(line)[0]] :                       \
          eol(line) ? jregex_part_of_word[end_of_line[-1]] :                 \
                      word_boundary(line)                                    \
       )                                                                     \
    )

    /*
     * Since we know the length of the shortest string possible, we can
     * figure out the latest place in our string where we can possibly
     * start a match from. However, if we can only match from the start
     * of a line, the LATEST_START will then be the start of the line.
     * Whichever is the case, note it.
     */
    const unsigned char *latest_start = r->anchor_to_start_of_line ? line :
	                                  end_of_line - r->min_length_match;
    /*
     * The LINE_HEAD will be from where we try to apply the regex, and what,
     * if failing, we bump up to try again from a new position (until we run
     * out of string [i.e. try to start beyond latest_start] and finally
     * fail completely).
     */
    const unsigned char *line_head = line;

    /*
     * Backtracking stack.  It is static and only ever grows, so it is
     * reused from one call to the next.
     */
    static unsigned maxstates = 0;
    static struct statestruct {
	const unsigned char *b;
	const unsigned char *line;
        #ifndef NO_PAREN_INFO
	 signed short highest_paren_seen;
	 unsigned char pushed_paren_count;
	 unsigned char pushed_paren_start;
	#endif
    } *statestack_start = 0, *statestack_end;
    struct statestruct *state;

    /*
     * We'll need at most one more than one state per char in the input
     * line -- Mmm, maybe need two per char -- actually, probably dependent
     * on max paren depth.
     */
    unsigned int maxstates_wanted = 2 * len + 10; /* extra for good measure */

  #ifndef NO_PAREN_INFO
    /* -1 means "no parenthesis seen yet", hence the signed type. */
    signed short highest_paren_seen = -1;
    static unsigned parenstate_size = 0;
    static const unsigned char **parenstate_base = 0;
    #ifndef NDEBUG
    static const unsigned char **parenstate_end;
    #endif
    const unsigned char **parenstate; /* "used uninitialized" OK here. */

    const unsigned char **max_parenstate;

    if (r->max_lpc_l)
    {
	/* make sure we have plenty of paren-state stuff */
	unsigned max_parenstates_needed = 2 /* 2 for good measure */ + 2 *
	    ((len + r->max_lpc_c - 1)/r->max_lpc_c * r->max_lpc_l);
	if (max_parenstates_needed > parenstate_size)
	{
	    if (parenstate_base)
		free(parenstate_base);
	    parenstate_size = max_parenstates_needed;
	    parenstate_base = xmalloc(sizeof(unsigned char *)*parenstate_size);
	    #ifndef NDEBUG
	    parenstate_end = &parenstate_base[parenstate_size];
	    #endif
	}
	parenstate = parenstate_base;
    }

    max_parenstate = parenstate_base;

    /*
     * Save the current start/end of paren NUM on the paren-state stack
     * (two slots per paren) and mark that paren as not yet closed.
     */
    #define push_paren_state(NUM)                                            \
    macro_start {                                                            \
	unsigned int _num_ = (NUM);                                          \
        STATS(regex_stats.parens_pushed++);                                  \
	FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) {                  \
	    outputf("push_paren_state #%ld (%d) %lx\n",                      \
		   parenstate - parenstate_base, _num_,                      \
		   (unsigned long)regexec_paren_info[_num_].match_start);    \
	    report_new_pareninfo("push", _num_);                             \
	})                                                                   \
	assert(r->max_lpc_l);                                                \
	assert(parenstate < parenstate_end);                                 \
	parenstate[0] = regexec_paren_info[_num_].match_start;               \
	parenstate[1] = regexec_paren_info[_num_].match_end;                 \
	parenstate += 2;                                                     \
	if (parenstate > max_parenstate)                                     \
	    max_parenstate = parenstate;                                     \
	regexec_paren_info[_num_].match_end = 0;                             \
    } macro_end

    /* Undo one push_paren_state(): restore start/end of paren NUM. */
    #define pop_paren_state(NUM)                                             \
    macro_start {                                                            \
	unsigned int _num_ = (NUM);                                          \
        STATS(regex_stats.parens_popped++);                                  \
	parenstate -= 2;                                                     \
	assert(r->max_lpc_l);                                                \
	assert(parenstate >= parenstate_base);                               \
        regexec_paren_info[_num_].match_start = parenstate[0];               \
        regexec_paren_info[_num_].match_end   = parenstate[1];               \
	FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) {                  \
	    outputf("pop_paren_state #%ld (%d) %lx\n", parenstate -          \
		   parenstate_base, _num_, (unsigned long)parenstate[0]);    \
            report_new_pareninfo("pop", _num_);                              \
	})                                                                   \
    } macro_end

 #endif


    if (maxstates_wanted > maxstates)
    {
	/* need more memory. First get rid of any we had before */
	if (statestack_start)
	    free(statestack_start);
	maxstates = maxstates_wanted;
	statestack_start = xmalloc(sizeof(struct statestruct) * maxstates);
	#ifndef NDEBUG
	statestack_end = &statestack_start[maxstates];
	#endif
    }
    state = &statestack_start[0];

    #define state_count()   (state - statestack_start)
    #define raw_push_state(B, L)                                             \
    macro_start {                                                            \
	STATS(regex_stats.states_pushed++;                                   \
	      if (state_count() > regex_stats.max_state_depth)               \
	          regex_stats.max_state_depth = state_count());              \
	FASTDEBUGSTUFF(                                                      \
	   assert(state < statestack_end);                                   \
	   if (regexec_flags & REGEXEC_DEBUG)                                \
	       outputf("state #%ld now <%ld,%ld>\n",                         \
		         state_count(), (B) - r->buf, (L) - orig_line);)     \
	state->b = (B);                                                      \
	state->line = (L);                                                   \
	state++;                                                             \
    } macro_end

    #define raw_pop_state(B, L)                                              \
    macro_start {                                                            \
	--state;                                                             \
	(B) = state->b;                                                      \
	(L) = state->line;                                                   \
	STATS(regex_stats.states_popped++);                                  \
    } macro_end

  #ifndef NO_PAREN_INFO
    /*
     * Push a backtrack state and, with it, the saved values of COUNT
     * parens beginning at paren START (clipped to the size of
     * regexec_paren_info).  The macro works purely on its own copies of
     * its arguments; it does not depend on, or modify, any variable of
     * the invoking code.
     */
    #define push_state_with_paren_info(B, L, COUNT, START)                   \
    macro_start {                                                            \
	unsigned int _count = (COUNT); /* get arg */                         \
	unsigned int _start = (START); /* get arg */                         \
	state->highest_paren_seen = highest_paren_seen;                      \
	if (_start >= regexec_paren_info_size)                               \
	    state->pushed_paren_count = 0; /* nothing to save */             \
	else                                                                 \
	{                                                                    \
	    unsigned _end_ = _start + _count;                                \
 	    if (_end_ > regexec_paren_info_size)                             \
		_count -= _end_ - regexec_paren_info_size;                   \
                                                                             \
	    state->pushed_paren_start = _start;                              \
	    state->pushed_paren_count = _count;                              \
            do {                                                             \
		push_paren_state(_start++);                                  \
	    } while (--_count);                                              \
	}                                                                    \
	raw_push_state(B, L);                                                \
    } macro_end

    #define push_state(B, L)                                                 \
    macro_start {                                                            \
	state->highest_paren_seen = highest_paren_seen;                      \
	state->pushed_paren_count = 0;                                       \
	raw_push_state(B, L);                                                \
    } macro_end

    /*
     * Pop a backtrack state, restoring highest_paren_seen and any paren
     * values that were saved with it (in reverse order of the pushes).
     * The range clamp is purely defensive: the push side has already
     * clipped the count, so the last index is expected to be in range.
     */
    #define pop_state(B, L)                                                  \
    macro_start {                                                            \
        raw_pop_state(B, L);                                                 \
	highest_paren_seen = state->highest_paren_seen;                      \
                                                                             \
	if (state->pushed_paren_count)                                       \
	{                                                                    \
	    int _count = state->pushed_paren_count;                          \
	    unsigned int _end = state->pushed_paren_start + _count - 1;      \
	    if (_end >= regexec_paren_info_size) {                           \
		/* last valid index is size-1, not size */                   \
		_count -= _end - (regexec_paren_info_size - 1);              \
		_end = regexec_paren_info_size - 1;                          \
	    }                                                                \
	    if (_count > 0) {                                                \
		pop_paren_state(_end);                                       \
		while (--_count > 0)                                         \
		    pop_paren_state(--_end);                                 \
	    }                                                                \
	}                                                                    \
    } macro_end
  #else /* NO_PAREN_INFO */
    #define pop_state(B, L)	raw_pop_state(B, L);
    #define push_state(B, L)	raw_push_state(B, L);
  #endif

    /*
     * We make ample use of the preprocessor here, since there is much
     * regularity in what goes on while checking the regex to the string.
     * Much code is repeated to avoid having to make decisions at regex
     * match time.... should speed things up quite a bit.
     */

    /*
     * The four macros do_ONCE, do_ONE_OR_MORE, do_ANY_NUMBER, and 
     * do_ZERO_OR_ONE represent the basic way the matching is done for
     * each of a simple match, `+', `*', and `?'.
     *
     * The arguments to these macros are, for any one kind of test,
     *
     *    EXTRA_REGEX_BYTES: the number of bytes (besides the type-byte)
     *                       that this instance of the regex command set uses.
     *
     *    TEST               the test that indicates if the regex matches
     *                       at this point in the string.
     *
     *    STRING_INCR        How much to bump along the string when a match
     *                       is successful.
     */
    #ifndef NO_REGEX_STATS
    #  define do_TEST(TEST)       (regex_stats.tests++,              (TEST))
    #  define do_TEST_noEOL(TEST) (regex_stats.tests++, (!eol(line)&&(TEST)))
    #else
    #  define do_TEST(TEST)                      (TEST)
    #  define do_TEST_noEOL(TEST) (!eol(line) && (TEST))
    #endif


    #define do_ONCE(EXTRA_REGEX_BYTES, TEST, STRING_INCR)                    \
        /* if the test fails, we have no match */                            \
	if (!do_TEST_noEOL(TEST))                                            \
	    goto nomatch;                                                    \
                                                                             \
	/* Otherwise, bump along the regex buffer pointer     */             \
	/* the string to reflect that the match has succeeded */             \
	b_if_match = b + TYPE_BYTE_SIZE + (EXTRA_REGEX_BYTES);               \
	line += (STRING_INCR);                                               \
	goto match;


    #define do_ONE_OR_MORE(EXTRA_REGEX_BYTES, TEST, STRING_INCR)             \
        /* if the test fails, we have no match */                            \
	if (!do_TEST_noEOL(TEST))                                            \
	    goto nomatch;                                                    \
                                                                             \
        /* at this point we know we'll match. */                             \
	b_if_match = b + TYPE_BYTE_SIZE + (EXTRA_REGEX_BYTES);               \
                                                                             \
        /* Since it's "or more", continue matching and bumping along    */   \
        /* the string so long as the test matches. We'll push the state */   \
        /* after all but the first match, since all but the first are   */   \
        /* optional and we may have to backtrack and retry w/o having   */   \
        /* taken the match                                              */   \
	while (line += STRING_INCR, do_TEST_noEOL(TEST))                     \
	    push_state(b_if_match, line);                                    \
        STATS(                                                               \
	      regex_stats.states_pushed++;                                   \
              regex_stats.states_popped++;                                   \
	)                                                                    \
	goto match;

    #define do_ANY_NUMBER(EXTRA_REGEX_BYTES, TEST, STRING_INCR)              \
        STATS(                                                               \
	      regex_stats.states_pushed++;                                   \
              regex_stats.states_popped++;                                   \
	)                                                                    \
                                                                             \
        /* since it's any number, zero is fine to, so we *know* we match */  \
	b_if_match = b + TYPE_BYTE_SIZE + (EXTRA_REGEX_BYTES);               \
                                                                             \
	/* nibble along string, pushing state, so long as we match */        \
	if (do_TEST_noEOL(TEST)) do {                                        \
	    push_state(b_if_match, line);                                    \
	} while (line += (STRING_INCR), do_TEST_noEOL(TEST));                \
	/* if we failed because we were at the end of the line, there */     \
        /* was an effective push/test/fail/pop that we optimized out, */     \
        /* so  we account for them here                               */     \
	goto match;

    #define do_ZERO_OR_ONE(EXTRA_REGEX_BYTES, TEST, STRING_INCR)             \
	/* since zero is fine, we *know* we match */                         \
	b_if_match = b + TYPE_BYTE_SIZE + (EXTRA_REGEX_BYTES);               \
	/* but if the test is OK, push state and nibble */                   \
	if (do_TEST_noEOL(TEST)) {                                           \
	    push_state(b_if_match, line);                                    \
	    line += (STRING_INCR);                                           \
	}                                                                    \
        STATS( else {                                                        \
	      regex_stats.states_pushed++;                                   \
              regex_stats.states_popped++;                                   \
	})                                                                   \
	goto match;

    /*
     * The following just applies the four above to the case statement
     * below. A, B, and C are EXTRA_REGEX_BYTES, TEST, and STRING_INC,
     * but I couldn't make it fit prettily (-:
     * 
     * For fun, check out what the output of the preprocessor looks like.
     */
    #define case_major(MAJOR, PRE, A, B, C)                                  \
	case make_type_byte_with_count(MAJOR, ONCE       ):                  \
	{ PRE; do_ONCE(A,B,C) }                                              \
	case make_type_byte_with_count(MAJOR, ZERO_OR_ONE):                  \
	{ PRE; do_ZERO_OR_ONE(A,B,C) }                                       \
	case make_type_byte_with_count(MAJOR, ONE_OR_MORE):                  \
	{ PRE; do_ONE_OR_MORE(A,B,C) }                                       \
	case make_type_byte_with_count(MAJOR, ANY_NUMBER ):                  \
	{ PRE; do_ANY_NUMBER (A,B,C) }

    #ifndef NO_REGEXEC_MATCH_POINTS
    regexec_match_start = line;
    #endif

    #ifndef NO_PAREN_INFO
    if (regexec_paren_info == 0)
	regexec_paren_info_size = 0; /* no info, so no size */
    else {
	/* clear out what we might potentially set */
	int i = regexec_paren_info_size < r->max_paren_level ?
	        regexec_paren_info_size : r->max_paren_level;
        while (i-- > 0) {
	    regexec_paren_info[i].match_end = 0;
	    FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG)
			   report_new_pareninfo("init", i));
	}
    }
    if (r->paren_info_required > regexec_paren_info_size)
	return 0; /* we need the paren state */
    #endif

    STATS(
        if (!r->anchor_to_start_of_line) {
	      latest_start = end_of_line;
	}
    )

    for (;;)
    {
	const unsigned char *b_if_match;

        STATS(regex_stats.cycles++);
	FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG)
	           {
	              outputf("%ld -----------------------------: [%.*s]\n",
			state_count(), (int)(end_of_line - line), line);
		      showbuf(r->buf, b, r->bufend, ">>> ");
		  })

	switch(b[0])
	{
      #ifndef NO_REGEXEC_MATCH_POINTS
	  case make_type_byte(REGEX_MATCH_NOTE_WORD):
            #ifndef NO_LLM_SUPPORT
	        if (llm && (line <= longest_match_so_far))
		    goto nomatch;
            #endif
	    regexec_match_at_start_of_word =
		is_word_boundary(regexec_match_start);
	    regexec_match_at_end_of_word =
		is_word_boundary(line);

	    /**** FALLTHROUGH ****/
      #endif

	  case make_type_byte(REGEX_MATCH):
            #ifndef NO_LLM_SUPPORT
	        if (llm && (line <= longest_match_so_far)) {
		    FASTDEBUGSTUFF(
		       if (regexec_flags & REGEXEC_DEBUG)
		          outputf("** pattern matches, but is too short **\n");
                    )
		    goto nomatch;
	        }
		longest_match_so_far = line;
            #endif

	    FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG)
		    outputf("*** pattern matches [llm=%d]***\n", llm);)

            #ifndef NO_REGEXEC_MATCH_POINTS
	    regexec_match_end = line;
	    #endif

            #ifndef NO_PAREN_INFO
	    regexec_paren_info_used = highest_paren_seen + 1;
	    FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) {
		int i;
		outputf("regexec_paren_info_used is %d\n",
		       regexec_paren_info_used);
		for (i = 0; i < regexec_paren_info_used; i++)
		    report_new_pareninfo("return", i);
	    })
	    #endif

            #ifndef NO_LLM_SUPPORT
	        if (llm)
		    goto nomatch; /* look for more */
            #endif
	    return 1; /* match! */

	  default:
	    outputf("<<unknown code @%ld[%x]>>\n", b - r->buf, b[0]);
	    return 0;

	  case_major(EXACT1,		                      /* Major type. */
		     ,                                        /* Prep space. */
		     1,			     /* Extra bytes in regex buffer. */
		     b[1] == line[0],	                 /* The test itself. */
		     1)			/* Bytes to move in string if match. */

	  case_major(EXACT_ICASE_ASCII,	                      /* Major type. */
		     ,                                        /* Prep space. */
		     ASCII_SPEC_SIZE,	     /* Extra bytes in regex buffer. */
		     b[1] == case_translation[line[0]],	     /* Test itself. */
		     SIZEOF_ASCII)	/* Bytes to move in string if match. */

	  case_major(EXACT2,                                  /* Major type. */
		     ,                                        /* Prep space. */
		     EUC_SPEC_SIZE,    	     /* Extra bytes in regex buffer. */
		     b[2] == line[1] && b[1] == line[0],            /* Test. */
		     SIZEOF_EUC)        /* Bytes to move in string if match. */

	  case_major(EITHER_EXACT_2,                          /* Major type. */
		     ,                                        /* Prep space. */
		     EUC_SPEC_SIZE * 2 ,     /* Extra bytes in regex buffer. */
		     (b[2] == line[1] && b[1] == line[0]) ||
		     (b[4] == line[1] && b[3] == line[0]),          /* Test. */
		     SIZEOF_EUC)        /* Bytes to move in string if match. */

	  case_major(EXACT_ICASE_KANA,                        /* Major type. */
		     ,                                        /* Prep space. */
		     EUC_SPEC_SIZE,          /* Extra bytes in regex buffer. */
		     b[2] == line[1] && (line[0] == EUC_HIRAGANA || /* Test. */
					 line[0] == EUC_KATAKANA),  /* Test. */
		     SIZEOF_EUC)        /* Bytes to move in string if match. */

	  case_major(EXACT3,                                  /* Major type. */
		     ,                                        /* Prep space. */
		     3,                      /* Extra bytes in regex buffer. */
		     b[1] == line[0] &&                             /* Test. */
		     b[3] == line[2] &&                             /* Test. */
		     b[2] == line[1],                               /* Test. */
		     3)                 /* Bytes to move in string if match. */

	  case_major(ANY_CHAR,                                /* Major type. */
		     ,                                        /* Prep space. */
		     0,                      /* Extra bytes in regex buffer. */
		     line[0] != '\n',                               /* Test. */
		     CHAR_LENGTH(line[0]))        /* Bytes to move if match. */

	  case_major(ANY_ASCII,                               /* Major type. */
		     ,                                        /* Prep space. */
		     0,                      /* Extra bytes in regex buffer. */
		     (line[0] & 0x80) == 0 && line[0] != '\n',      /* Test. */
		     SIZEOF_ASCII)      /* Bytes to move in string if match. */

          case_major(ANY_MULTIBYTE,                           /* Major type. */
		     unsigned Len,                             /* prep space */
		     0,                      /* Extra bytes in regex buffer. */
		     (Len = CHAR_LENGTH(line[0]), Len > 1),         /* Test. */
		     Len)                         /* Bytes to move if match. */
		     
	  case_major(ANY_KATAKANA,                            /* Major type. */
		     ,                                        /* Prep space. */
		     0,                      /* Extra bytes in regex buffer. */
		     (line[0] == EUC_KATAKANA ||                    /* Test. */
		      (line[0] == KATA_DASH_HI_BYTE &&              /* Test. */
		       line[1] == KATA_DASH_LO_BYTE)),              /* Test. */
		     SIZEOF_EUC)        /* Bytes to move in string if match. */

	  case_major(ANY_NONKATAKANA,                         /* Major type. */
		     ,                                        /* Prep space. */
		     0,                      /* Extra bytes in regex buffer. */
		     !(line[0] == EUC_KATAKANA ||     
		      (line[0] == KATA_DASH_HI_BYTE &&              /* Test. */
		       line[1] == KATA_DASH_LO_BYTE)) &&            /* Test. */
		     line[0] != '\n',                               /* Test. */
		     CHAR_LENGTH(line[0]))        /* Bytes to move if match. */

	  case_major(ANY_HIRAGANA,                            /* Major type. */
		     ,                                        /* Prep space. */
		     0,                      /* Extra bytes in regex buffer. */
		     line[0] == EUC_HIRAGANA,                       /* Test. */
		     SIZEOF_EUC)        /* Bytes to move in string if match. */

	  case_major(ANY_NONHIRAGANA,                         /* Major type. */
		     ,                                        /* Prep space. */
		     0,                      /* Extra bytes in regex buffer. */
		     line[0] != EUC_HIRAGANA && line[0] != '\n',    /* Test. */
		     CHAR_LENGTH(line[0]))        /* Bytes to move if match. */

	  case_major(ANY_KANJI   ,                            /* Major type. */
		     ,                                        /* Prep space. */
		     0,                      /* Extra bytes in regex buffer. */
		     line[0] >= LEAST_KANJI_HI_BYTE &&
		     line[0] <= GREATEST_KANJI_HI_BYTE,             /* Test. */
		     SIZEOF_EUC)        /* Bytes to move in string if match. */

	  case_major(ANY_NONKANJI,                            /* Major type. */
		     ,                                        /* Prep space. */
		     0,                      /* Extra bytes in regex buffer. */
		     ((line[0] < LEAST_KANJI_HI_BYTE && line[0] != '\n') ||
		      line[0] > GREATEST_KANJI_HI_BYTE),            /* Test. */
		     CHAR_LENGTH(line[0]))        /* Bytes to move if match. */

	  case_major(CLASS,                      /* Major type. */

		     unsigned short euc2_length = read_short_val(b+2);
		     unsigned short euc3_length = read_short_val(b+4);
		     unsigned char c;
		     unsigned Len;
		     int inverted = b[1];
		     int foldkana = r->fold_kcase;
		     ,

		     /* pattern space size for this command, less type byte */
		     DETERMINISTIC_CLASS_SIZE - 1 + euc2_length + euc3_length,

		     
		     ((Len = EUC_CHAR_LENGTH(c = line[0])),
		     (
		      ((Len == 1) && (b+6)[c]) ||
		      ((Len == 2) && euc2_length && twobyte_char_in_string
		       ((foldkana ? case_translation[c] : c), line[1],
			b+6+128, b+6+128+euc2_length)) ||
		      ((Len == 3) && euc3_length && threebyte_char_in_string
		       (c, line[1], line[2],
			b+6+128+euc2_length, b+6+128+euc2_length+euc3_length))
		      ) == !inverted),

		     CHAR_LENGTH(line[0]))

	  case make_type_byte(WORD_BOUNDARY):
	  {	
	      if (!do_TEST(is_word_boundary(line)))
		  goto nomatch;
	      b_if_match = b + TYPE_BYTE_SIZE;
	      goto match;
	  }

	  case make_type_byte(START_OF_LINE):
	    if (!do_TEST(bol(line)))
		goto nomatch;
	    b_if_match = b + TYPE_BYTE_SIZE;
	    goto match;

	  case make_type_byte(END_OF_LINE):
	    if (do_TEST(!eol(line) && line[0] != '\n'))
		goto nomatch;
	    b_if_match = b + TYPE_BYTE_SIZE;
	    goto match;

	  case make_type_byte(PUSH):
	    push_state(b + read_short_val(b+TYPE_BYTE_SIZE), line);
	    b += SIZEOF_PUSH_COMMAND;
	    continue;
	    
	  case make_type_byte(JUMP):
	    b += read_short_val(b+TYPE_BYTE_SIZE);
	    continue;

	  case make_type_byte(PUSH_JUMP):
	    push_state(b + TYPE_BYTE_SIZE + SHORT_VAL_SIZE, line);
	    b += read_short_val(b+TYPE_BYTE_SIZE);
	    continue;

      #ifndef NO_PAREN_INFO

	  case make_type_byte(MATCH_PREV_PAREN):
	  {
	      unsigned level = b[TYPE_BYTE_SIZE];
	      unsigned Len;
	      assert(level < regexec_paren_info_size);
	      /*
	       * Compare as signed: highest_paren_seen is -1 until a paren
	       * has been entered, and an unsigned comparison would turn
	       * that -1 into a huge value and never reject anything.
	       */
	      if ((signed int)level > highest_paren_seen ||
		  regexec_paren_info[level].match_start == 0 ||
                  regexec_paren_info[level].match_end == 0)
		      goto nomatch;
	      Len = regexec_paren_info[level].match_end -
		    regexec_paren_info[level].match_start;
	      if (Len != 0)
	      {
		  const unsigned char *A = line;
		  const unsigned char *A_end = A + Len;
		  const unsigned char *B =
		      regexec_paren_info[level].match_start;

		  if (A_end> end_of_line)
		      goto nomatch;

		  FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG)
		      outputf("content len is %d, strings are [%.*s] [%.*s]\n",
			     Len, (int)Len, A, (int)Len, B););

		  /*
		   * Compare character by character, honoring the pattern's
		   * ASCII and kana case-folding settings.  Non-ASCII
		   * characters are stepped over two bytes at a time.
		   */
		  do
		  {
		      if (isascii(*A))
		      {
			  if (r->fold_acase == 0 || !isalpha(*A))
			  {
			      if (*A != *B)
				  goto nomatch;
			  } else {
			      if (!isascii(*B) || !isalpha(*B))
				  goto nomatch;
			      if ((islower(*B) ? *B : tolower(*B)) !=
				  (islower(*A) ? *A : tolower(*A)))
				  goto nomatch;
			  }
			  A++;
			  B++;
		      } else {
			  if (r->fold_kcase == 0 ||
			      (A[0] != EUC_HIRAGANA &&
			       A[0] != EUC_KATAKANA))
			  {
			      if (A[0] != B[0] || A[1] != B[1])
				  goto nomatch;
			  } else {
			      if ((B[0] != EUC_HIRAGANA &&
				   B[0] != EUC_KATAKANA) || A[1] != B[1])
				  goto nomatch;
			  }
			  A += 2;
			  B += 2;
		      }
		  } while (A < A_end);
		  line += Len;
	      }
	      b_if_match = b + TYPE_BYTE_SIZE + PAREN_SPEC_SIZE;
	      goto match;
	  }

	  case make_type_byte(SAVE_OPEN_PAREN):
	  {
	      unsigned level = b[TYPE_BYTE_SIZE];
	      STATS(regex_stats.parens_entered++);
	      if (level < regexec_paren_info_size)
	      {
		  regexec_paren_info[level].match_start = line;
		  regexec_paren_info[level].match_end   = 0;
		  if ((signed int)level > highest_paren_seen)
		      highest_paren_seen = level;
		  FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) {
		       outputf("highest_paren_seen now %d\n",
			      highest_paren_seen);
		       report_new_pareninfo("open", level);});
	      }

	      b_if_match = b + TYPE_BYTE_SIZE + 1;
	      goto match;
	   }

	  case make_type_byte(PUSH_SAVE_OPEN_PAREN):
	  {
	      unsigned level = b[TYPE_BYTE_SIZE+SHORT_VAL_SIZE];

	      push_state(b + read_short_val(b+TYPE_BYTE_SIZE), line);

	      STATS(regex_stats.parens_entered++);
	      if (level < regexec_paren_info_size)
	      {
		  regexec_paren_info[level].match_start = line;
		  regexec_paren_info[level].match_end   = 0;
		  if ((signed int)level > highest_paren_seen)
		      highest_paren_seen = level;
		  FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) {
		      outputf("highest_paren_seen now %d\n",
			     highest_paren_seen);
		      report_new_pareninfo("open2", level);})
	      }

	      b_if_match = b + SIZEOF_PUSH_SAVE_OPEN_PAREN_COMMAND;
	      goto match;
	  }

	  case make_type_byte(SAVE_CLOSE_PAREN_PUSH_JUMP):
	  {
	      unsigned count = b[TYPE_BYTE_SIZE];
	      unsigned start = b[TYPE_BYTE_SIZE+PAREN_COUNT_SIZE];

	      STATS(regex_stats.parens_saved++);
	      if (start < regexec_paren_info_size)
	      {
		  regexec_paren_info[start].match_end = line;
		  /* signed compare, so the -1 "none yet" value works */
		  if ((signed int)start > highest_paren_seen)
		      highest_paren_seen = start;

		  FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) {
		      outputf("highest_paren_seen now %d\n",
			     highest_paren_seen);
		      report_new_pareninfo("set1", start);})
	      }


	      push_state_with_paren_info(b+TYPE_BYTE_SIZE+2+SHORT_VAL_SIZE,
					 line, count, start);

	      b += read_short_val(b+TYPE_BYTE_SIZE +
				  PAREN_COUNT_SIZE + PAREN_SPEC_SIZE);
	      continue;
	  }

	  case make_type_byte(SAVE_CLOSE_PAREN):
	  {
	      unsigned level = b[TYPE_BYTE_SIZE];
	      STATS(regex_stats.parens_saved++);
	      if (level < regexec_paren_info_size)
	      {
		  regexec_paren_info[level].match_end = line;
		  /* signed compare, so the -1 "none yet" value works */
		  if ((signed int)level > highest_paren_seen)
		      highest_paren_seen = level;
		  FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG) {
		      outputf("highest_paren_seen now %d\n",
			     highest_paren_seen);
		      report_new_pareninfo("set2", level);})

	      }
	      b_if_match = b + TYPE_BYTE_SIZE + PAREN_SPEC_SIZE;
	      goto match;
	  }
      #endif /* NO_PAREN_INFO */
	}

	assert(0);
	
      match:
	STATS(regex_stats.matches++);
	FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG)
		       output("<match>\n");)
	b = b_if_match;
	continue;

      nomatch:
	STATS(regex_stats.failures++);
	/*
	 * If we failed, but have some states pushed onto the stack,
	 * pop one off and continue from there.
	 */
	if (state_count() > 0)
	{
            #ifndef NO_PAREN_INFO
	    signed int old_highest_paren_seen = highest_paren_seen;
	    #endif

	    pop_state(b, line);

            #ifndef NO_PAREN_INFO
	    /*
	     * Any paren entered after the state we just returned to is no
	     * longer valid: forget where it started.
	     */
	    if (old_highest_paren_seen != highest_paren_seen)
	    {
		while (old_highest_paren_seen > highest_paren_seen) {
		    FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG)
				   report_new_pareninfo("flushing",
					    old_highest_paren_seen));
		    regexec_paren_info[old_highest_paren_seen--].match_start=0;
		}
	    }
	    #endif

/*
 * Debug-only reporting.  It is guarded by a plain #ifndef instead of
 * being wrapped in FASTDEBUGSTUFF(...), presumably because of the nested
 * preprocessor conditionals among the arguments below.
 */
#ifndef FAST_REGEXEC
	       if (special_debug) {
		   outputf("popping state #%ld (total %ld): [%.*s|%s] regex %d.\n",
		          state_count(),
			 #ifdef NO_REGEX_STATS
			  -1L,
			 #else
			  regex_stats.states_popped,
			 #endif
			 /* "%.*s" wants an int precision, "%d" an int */
			 (int)(line - orig_line), orig_line,
			 line,
			 (int)(b - r->buf));
               }
	       else if (regexec_flags & REGEXEC_DEBUG)
		  outputf("++abort, popping state #%ld (total %ld popped)++: <%ld, %ld, p%d>\n",
		          state_count(),
			 #ifdef NO_REGEX_STATS
			  -1L,
			 #else
			  regex_stats.states_popped,
			 #endif
			  b - r->buf,
			  line - orig_line,
                         #ifndef NO_PAREN_INFO
			   highest_paren_seen
                         #else
			   -1
                         #endif
			 );
#endif
	}
	else
	{
            #ifndef NO_LLM_SUPPORT
	        if (llm && (longest_match_so_far != 0))
		    return 1; /* we had a match last time */
	    #endif

	    /*
	     * Bump past the currently first character, then see whether
	     * we are too late.  If LINE_HEAD is already at the end of the
	     * line there is nothing left to bump over (and LINE_HEAD[0]
	     * must not be read), so that counts as too late as well.
	     */
	    if (eol(line_head) ||
		(line_head += CHAR_LENGTH(line_head[0])) > latest_start) {
		FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG)
              		      output("*** regex failed ***\n");)
		return 0;
	    }
	    STATS(regex_stats.states_pushed++); /* effectively*/
	    STATS(regex_stats.states_popped++); /* effectively*/

	    line = line_head; /* Start from the head of the line... */
	    b = r->buf;       /* ... and head of the regex */

	    FASTDEBUGSTUFF(if (regexec_flags & REGEXEC_DEBUG)
              outputf("++abort, moving along line: line=%lx, latest=%lx, end=%lx++\n",
		     (long)line, (long)latest_start, (long)end_of_line);)

            #ifndef NO_PAREN_INFO
	    highest_paren_seen = -1;
            #endif

            #ifndef NO_REGEXEC_MATCH_POINTS
	    regexec_match_start = line;
	    #endif
	}
	continue;
    }
    /* notreached */
}