1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
// regexcmp.h
//
// Copyright (C) 2002-2016, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for the class RegexCompile
//
// This class is internal to the regular expression implementation.
// For the public Regular Expression API, see the file <_foundation_unicode/regex.h>
//
#ifndef RBBISCAN_H
#define RBBISCAN_H
#include <_foundation_unicode/utypes.h>
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include <_foundation_unicode/parseerr.h>
#include <_foundation_unicode/uniset.h>
#include <_foundation_unicode/uobject.h>
#include <_foundation_unicode/utext.h>
#include "uhash.h"
#include "uvector.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
//--------------------------------------------------------------------------------
//
// class RegexCompile Contains the regular expression compiler.
//
//--------------------------------------------------------------------------------
class RegexPattern;
// RegexCompile holds the regular expression compiler. It is internal to the
// regular expression implementation and is not part of the public API.
// The declarations below fall into three groups: the public entry points
// (constructor, compile(), nextChar()), private helpers used while scanning
// the pattern and emitting compiled operations, and the scanner / code
// generation state.
class U_I18N_API RegexCompile : public UMemory {
public:
enum {
// The size of the state stack for pattern parsing (see fStack below).
// Corresponds roughly to the depth of parentheses nesting that is
// allowed in the rules.
kStackSize = 100
};
// A single pattern character as passed to nextChar() and kept in fC.
struct RegexPatternChar {
UChar32 fChar; // The code point.
UBool fQuoted; // NOTE(review): presumably set when the char was escaped or quoted - confirm in the implementation.
};
// Construct a compiler from a RegexPattern pointer and the caller's error code.
RegexCompile(RegexPattern *rp, UErrorCode &e);
// Compile a pattern supplied as a UnicodeString. pp and e are passed by
// reference; presumably they receive parse-error details and status - TODO confirm.
void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
// Same as above, for a pattern supplied as a UText.
void compile(UText *pat, UParseError &pp, UErrorCode &e);
virtual ~RegexCompile();
void nextChar(RegexPatternChar &c); // Get the next char from the input stream.
// Categories of parentheses in pattern.
// The category is saved in the compile-time parentheses stack frame, and
// determines the code to be generated when the matching close ) is encountered.
// All values are negative: in fParenStack a negative entry follows the
// fixup positions of a frame and indicates the kind of paren that opened it.
enum EParenClass {
plain = -1, // No special handling
capturing = -2,
atomic = -3,
lookAhead = -4,
negLookAhead = -5,
flags = -6,
lookBehind = -7,
lookBehindN = -8
};
private:
// NOTE(review): doParseActions, nextCharLL, peekCharLL, scanProp,
// scanPosixProp and handleCloseParen carry no description in this header.
// The names suggest parser-action dispatch, low-level char fetch / peek,
// property-set scanning and close-paren handling - confirm against the
// implementation before relying on that.
UBool doParseActions(int32_t a);
void error(UErrorCode e); // error reporting convenience function.
UChar32 nextCharLL();
UChar32 peekCharLL();
UnicodeSet *scanProp();
UnicodeSet *scanPosixProp();
void handleCloseParen();
// Locate a position in the compiled pattern at the top of the just
// completed block or operation, and optionally (reserve) ensure that
// there is space to add an opcode there.
int32_t blockTopLoc(UBool reserve);
// Generate the compiled pattern for a reference to a UnicodeSet.
void compileSet(UnicodeSet *theSet);
// Generate the code for a {min,max} quantifier.
void compileInterval(int32_t InitOp,
int32_t LoopOp);
UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier
void literalChar(UChar32 c); // Compile a literal char
void fixLiterals(UBool split=false); // Generate code for pending literal characters.
// Open up a slot for a new op in the generated code at the specified location.
void insertOp(int32_t where);
void appendOp(int32_t op); // Append a new op to the compiled pattern.
void appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern.
int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode instruction.
// Allocate space in the matcher data area.
// Return index of the newly allocated data.
int32_t allocateData(int32_t size);
// Allocate space in the match back-track stack frame.
// Return offset index in the frame.
int32_t allocateStackData(int32_t size);
// NOTE(review): the remaining helpers are undocumented in this header.
// minMatchLength / maxMatchLength take a (start, end) pair, presumably a
// range of positions in the compiled pattern - confirm in the implementation.
int32_t minMatchLength(int32_t start,
int32_t end);
int32_t maxMatchLength(int32_t start,
int32_t end);
void matchStartType();
void stripNOPs();
void setEval(int32_t op);
void setPushOp(int32_t op);
UChar32 scanNamedChar();
UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
public: // Public for testing only.
static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars);
private:
// NOTE(review): these three pointers are undocumented here; presumably they
// refer to the caller's error code, the RegexPattern passed to the
// constructor, and the caller's UParseError - confirm in the implementation.
UErrorCode *fStatus;
RegexPattern *fRXPat;
UParseError *fParseErr;
//
// Data associated with low level character scanning
//
// Index of current character being processed in the rule input string.
int64_t fScanIndex;
UBool fQuoteMode; // Scan is in a \Q...\E quoted region
UBool fInBackslashQuote; // Scan is between a '\' and the following char.
// When scan is just after '(?', inhibit #... to end of line comments,
// in favor of (?#...) comments.
UBool fEOLComments;
int64_t fLineNum; // Line number in input file.
int64_t fCharNum; // Char position within the line.
// Previous char, needed to count CR-LF as a single line, not two.
UChar32 fLastChar;
UChar32 fPeekChar; // Saved char, if we've scanned ahead.
// Current char for parse state machine processing.
RegexPatternChar fC;
// State stack (fStack, indexed via fStackPtr): holds state pushes and pops
// as specified in the state transition rules.
uint16_t fStack[kStackSize];
int32_t fStackPtr;
//
// Data associated with the generation of the pcode for the match engine
//
// Match Flags. (Case Insensitive, etc.)
// Always has high bit (31) set so that flag values on the paren stack are
// distinguished from relocatable pcode addresses.
int32_t fModeFlags;
// New flags, while compiling (?i, holds state until last flag is scanned.
int32_t fNewModeFlags;
UBool fSetModeFlag; // true for (?ismx, false for (?-ismx
// Literal chars or strings from the pattern are accumulated here.
// Once completed, meaning that some non-literal pattern construct is
// encountered, the appropriate opcodes to match the literal will be
// generated, and this string will be cleared.
UnicodeString fLiteralChars;
int64_t fPatternLength; // Length of the input pattern string.
// Parentheses stack. Each frame consists of the positions of compiled
// pattern operations needing fixup, followed by negative value. The first
// entry in each frame is the position of the spot reserved for use when a
// quantifier needs to add a SAVE at the start of a (block).
// The negative value (-1, -2,...) indicates the kind of paren that opened
// the frame (see EParenClass). Some need special handling on close.
UVector32 fParenStack;
// The position in the compiled pattern of the slot reserved for a state
// save at the start of the most recently processed parenthesized block.
// Updated when processing a close to the location for the corresponding open.
int32_t fMatchOpenParen;
// The position in the pattern of the first location after the most
// recently processed parenthesized block.
int32_t fMatchCloseParen;
// {lower, upper} interval quantifier values.
// Placed here temporarily, when pattern is initially scanned. Each new
// interval encountered overwrites these values.
// -1 for the upper interval value means none was specified
// (unlimited occurrences.)
int32_t fIntervalLow;
int32_t fIntervalUpper;
// Stack of UnicodeSets, used while evaluating (at compile time) set
// expressions within the pattern.
UStack fSetStack;
UStack fSetOpStack; // Stack of pending set operators (&&, --, union)
// The last single code point added to a set. Needed when "-y" is scanned,
// and we need to turn "x-y" into a range.
UChar32 fLastSetLiteral;
// Named Capture, the group name is built up in this string while being scanned.
UnicodeString *fCaptureName;
};
// Operator codes that are pushed onto fSetOpStack while [set expressions]
// are being scanned and evaluated. Every value packs two fields:
//   high 16 bits : the operator precedence
//   low 16 bits  : a code identifying the operation itself
enum SetOperations {
    setStart         = (0 << 16) | 1,
    setEnd           = (1 << 16) | 2,
    setNegation      = (2 << 16) | 3,
    setCaseClose     = (2 << 16) | 9,
    setDifference2   = (3 << 16) | 4,   // '--' set difference operator
    setIntersection2 = (3 << 16) | 5,   // '&&' set intersection operator
    setUnion         = (4 << 16) | 6,   // implicit union of adjacent items
    setDifference1   = (4 << 16) | 7,   // '-', single dash difference op,
                                        //   for compatibility with old UnicodeSet.
    setIntersection1 = (4 << 16) | 8    // '&', single amp intersection op,
                                        //   for compatibility with old UnicodeSet.
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
#endif // RBBISCAN_H
|