1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
|
/*------------------------------------------------------------------------
Copyright (C) 2002-2016 SIL International. All rights reserved.
Distributable under the terms of either the Common Public License or the
GNU Lesser General Public License, as specified in the LICENSING.txt file.
File: Compiler.h
Responsibility: Jonathan Kew
Last reviewed: Not yet.
Description:
Changes:
2008-01-23 jk revised endian-ness stuff to allow Universal build
2006-06-02 jk added support for extended string rules (>255 per initial char)
24-May-2005 change from Ulrik to work around MS VC++ 6 issues
21-May-2005 changes based on Ulrik Petersen's patch for MS VC++ 6
-------------------------------------------------------------------------*/
#ifndef __Compiler_H__
#define __Compiler_H__
#ifdef HAVE_CONFIG_H
# include "config.h" /* a Unix-ish setup where we have config.h available */
#endif
#if (defined(__dest_os) && (__dest_os == __win32_os)) || defined(WIN32) /* Windows target: little-endian */
# undef WORDS_BIGENDIAN
#endif
#ifdef __APPLE__
#include <TargetConditionals.h>
#endif
#if defined(TARGET_RT_BIG_ENDIAN) /* the CodeWarrior prefix files or Apple TargetConditionals.h sets this */
# if TARGET_RT_BIG_ENDIAN
# undef WORDS_BIGENDIAN
# define WORDS_BIGENDIAN 1
# else
# undef WORDS_BIGENDIAN
# endif
#endif
#include "TECkit_Format.h"
#include "TECkit_Compiler.h"
#include "TECkit_Engine.h"
#ifndef __MWERKS__
# include "ulong_chartraits.h"
#endif
#include <string>
#include <vector>
#include <map>
using namespace std;
/*
	Compiler

	Holds the parsing state, per-pass rule data and table-building helpers used
	to process a mapping description written in the TECkit language (see the
	keyword token types below) and to hand back a compiled table through
	GetCompiledTable().

	Only declarations are visible here; apart from the two inline members
	(ExpectToken(char, ...) and the appendToTable template) all behaviour is
	implemented elsewhere.
*/
class Compiler
{
public:
	// NOTE(review): presumably txt/len delimit the source text, inForm selects its
	// encoding form, cmp requests compression and genXML an XML rendering, and
	// errFunc/userData form the error callback -- confirm against the implementation.
					Compiler(const char* txt, UInt32 len, char inForm, bool cmp, bool genXML, TECkit_ErrorFn errFunc, void* userData);
					~Compiler();

	// Reports the table pointer and its length through the two reference parameters.
	void			GetCompiledTable(Byte*& table, UInt32& len) const;
	// NOTE(review): presumably releases ownership of compiledTable so that the
	// destructor does not free it -- confirm against the implementation.
	void			DetachCompiledTable();

	// Value given to Rule::offset by the Rule constructor.
	enum { kInvalidRuleOffset = 0xffffffffUL };

protected:
	// Token types start at 256; ExpectToken(char, ...) below casts a plain
	// character to tokenType, so values below 256 stand for single characters.
	typedef enum {
		// general token types recognized by the compiler
		tok_Newline = 256,
		tok_Map,
		tok_Ellipsis,
		tok_Number,
		tok_USV,
		tok_Identifier,
		tok_String,
		tok_Unknown,

		// then we have the TECkit language keywords:
		tok_Name,
		tok_Flags,
		tok_FlagValue,
		tok_Pass,
		tok_PassType,
		tok_Class,
		tok_Default,
		tok_Define
	} tokenType;

	// The compiled output and its size in bytes (see GetCompiledTable).
	Byte*			compiledTable;
	UInt32			compiledSize;

	// Error callback and the opaque pointer supplied alongside it.
	TECkit_ErrorFn	errorFunction;
	void*			errFuncUserData;

	// String of 32-bit code units.
	typedef basic_string<UInt32>	string32;

	// A single lexical token.
	struct Token {
		tokenType	type;
		UInt32		val;
		const char*	str;
		string32	strval;
	};

	// One entry of the keyword table: keyword text, the token it produces,
	// and an associated value.
	struct Keyword {
		const char	*keyword;
		tokenType	token;
		UInt32		refCon;
	};

	static Keyword	keywords[];	// defined outside this header

	const unsigned char*	textEnd;
	const unsigned char*	textPtr;

	char			idBuffer[256];

	// used by the front end parser
	UInt32			ungotten;
	Token			tok;
	const unsigned char*	tokStart;
	UInt32			errorCount;
	UInt32			lineNumber;
	char			inputForm;
	bool			errorState;
	bool			generateXML;
	bool			usedExtStringRules;

	// used in compiling passes
	enum {
		notInRule,
		inLHSString,
		inLHSPreContext,
		inLHSPostContext,
		inRHSString,
		inRHSPreContext,
		inRHSPostContext
	}				ruleState;

	char			ruleType;

	// One element of a match, context or replacement string.
	struct Item {
		UInt8		type;		// 0: literal; kMatchElem_Type_XXXX; 0xff: copy
		UInt8		negate;
		UInt8		repeatMin;
		UInt8		repeatMax;
		UInt32		val;		// class index or literal value
		UInt8		start;		// OR/EGroup: index of BGroup
		UInt8		next;		// BGroup/OR: index of next OR/EGroup
		UInt8		after;		// BGroup: index of EGroup + 1
		UInt8		index;		// Class/Copy: index of corresponding item in match
		string		tag;
	};

	// A complete rule: match string, both contexts, replacement and source line.
	struct Rule {
		// Copies the four item lists and the line number; offset starts out as
		// kInvalidRuleOffset and sortKey as 0.
		Rule(
			const vector<Item>&	mat,
			const vector<Item>&	pre,
			const vector<Item>&	post,
			const vector<Item>&	rep,
			UInt32				line
		)	: matchStr(mat)
			, preContext(pre)
			, postContext(post)
			, replaceStr(rep)
			, lineNumber(line)
			, offset(kInvalidRuleOffset)
			, sortKey(0)
			, reserved(0)	// Rules are copied by value inside vector<Rule>; never leave this indeterminate
		{ }

		vector<Item>	matchStr;
		vector<Item>	preContext;
		vector<Item>	postContext;
		vector<Item>	replaceStr;
		UInt32			lineNumber;
		UInt32			offset;		// offset of the packed form in the StringRuleData block
		UInt16			sortKey;
		UInt16			reserved;
	};

	// Item lists collected for the rule that is currently being parsed.
	struct CurrRule {
		void			clear();
		void			setLineNo(UInt32 lineNo);

		UInt32			startingLine;
		vector<Item>	lhsString;
		vector<Item>	lhsPreContext;
		vector<Item>	lhsPostContext;
		vector<Item>	rhsString;
		vector<Item>	rhsPreContext;
		vector<Item>	rhsPostContext;
	};

	CurrRule		currentRule;	// the current rule being parsed

	UInt32			classLine;

	typedef vector<UInt32>	Class;

	// Class reference used on the match side.
	struct MatClass {
		MatClass(UInt32 m)
			: membersClass(m)
		{ }

		UInt32		membersClass;
	};

	// Class reference used on the replacement side.
	struct RepClass {
		RepClass(UInt32 m, UInt32 s)
			: membersClass(m)
			, sortLikeClass(s)
		{ }

		UInt32		membersClass;
		UInt32		sortLikeClass;
	};

	// Everything gathered for one pass: rules, class definitions and defaults.
	struct Pass {
		void				clear();
		void				setLineNo(UInt32 lineNo);

		UInt32				startingLine;
		vector<Rule>		fwdRules;
		vector<Rule>		revRules;
		vector<string>		xmlRules;
		map<string,string>	xmlContexts;
		map<string,UInt32>	byteClassNames;		// map name to byteClassMembers index
		map<string,UInt32>	uniClassNames;
		vector<Class>		byteClassMembers;	// the actual members of each byte class
		vector<Class>		uniClassMembers;
		vector<UInt32>		byteClassLines;
		vector<UInt32>		uniClassLines;
		UInt32				passType;
		UInt32				uniDefault;
		UInt8				byteDefault;
		bool				supplementaryChars;
	};

	Pass			currentPass;	// the current pass being built

	struct BuildVars {
		void						clear();

		string						planeMap;
		vector<string>				pageMaps;
		vector< vector<UInt16> >	charMaps;
		UInt8						maxMatch;
		UInt8						maxPre;
		UInt8						maxPost;
		UInt8						maxOutput;
	};

	BuildVars		buildVars;		// variables used during pass compilation

	vector<string>	fwdTables;		// binary forms of compiled tables
	vector<string>	revTables;

	UInt32			lhsFlags;
	UInt32			rhsFlags;

	map<UInt16,string>	names;		// map name IDs to name texts (NB: utf8)

	typedef vector<Token>		tokListT;
	tokListT::const_iterator	defIter;
	tokListT::const_iterator	defEnd;
	map<string,tokListT>		defines;

	string			xmlRepresentation;

	// --- character and token input ---
	UInt32			getChar(void);
	void			ungetChar(UInt32 c);
	void			SkipSpaces(void);
	tokenType		IDlookup(const char* str, UInt32 len);
	bool			GetNextToken();
	bool			ExpectToken(tokenType type, const char* errMsg);
	// Convenience overload: treats the character itself as the expected token type.
	bool			ExpectToken(char c, const char* errMsg)
						{ return ExpectToken((tokenType)c, errMsg); }
	void			Error(const char* errMsg, const char* s = 0, UInt32 line = 0xffffffff);

	// --- rule and pass construction ---
	void			StartDefaultPass();
	void			AppendLiteral(UInt32 val, bool negate = false);
	void			AppendUSV(UInt32 val, bool negate = false);
	void			AppendSpecial(UInt8 type, bool negate = false);
	void			AppendClass(const string& className, bool negate = false);
	void			AppendToRule(const Item& item);
	bool			tagExists(bool rhs, const string& tag);
	void			AssignTag(const string& tag);
	void			SetMinMax(int repeatMin, int repeatMax);
	void			FinishPass();
	string			asUTF8(const string32 s);
	void			ReadNameString(UInt16 nameID);
	UInt32			charLimit();

	// --- rule analysis and table building ---
	static int		ruleKeyComp(const Rule& a, const Rule& b);
	int				findTag(const string& tag, const vector<Item>& str);
	void			associateItems(vector<Rule>& rules, bool fromUni, bool toUni);
	void			setGroupPointers(vector<Item>::iterator b, vector<Item>::iterator e, int startIndex, bool isReversed = false);
	void			setGroupPointers(vector<Rule>& rules);
	void			sortRules(vector<Rule>& rules);
	int				calcMaxLen(vector<Item>::iterator b, vector<Item>::iterator e);
	int				calcMaxOutLen(Rule& rule);
	bool			findInitialItems(const Rule& rule, vector<Item>::const_iterator b, vector<Item>::const_iterator e,
										vector<Item>& initialItems);
	void			findInitialItems(const Rule& rule, vector<Item>& initialItems);
	void			addToCharMap(UInt32 ch, UInt16 index);
	void			buildTable(vector<Rule>& rules, bool fromUni, bool toUni, string& table);
	long			classIndex(UInt32 charCode, const Class& classMembers);
	long			uniClassIndex(UInt32 charCode, UInt32 classIndex);
	long			byteClassIndex(UInt8 charCode, UInt32 classIndex);
	bool			isSingleCharRule(const Rule& rule);
	void			appendMatchElem(string& packedRule, Item& item, int index,
										vector<MatClass>& matchClasses);
	void			appendReplaceElem(string& packedRule, Item& item,
										vector<Item>& matchStr, vector<RepClass>& repClasses);
	void			appendToTable(string& s, const char* ptr, UInt32 len);

	// Appends the bytes of x to table, most significant byte first: on a
	// big-endian host (WORDS_BIGENDIAN) the object representation is appended
	// as-is; otherwise its bytes are appended in reverse memory order.
	template <class T>
		void		appendToTable(string& table, T x) {
#ifdef WORDS_BIGENDIAN
						const char*	xp = (const char*)&x;
						table.append(xp, sizeof(x));
#else
/* split into separate statements to work around VC++6 problems */
						const char*	xp = (const char*)&x;
						xp = xp + sizeof(T);	// one past the last byte of x
						for (unsigned int i = 0; i < sizeof(T); ++i) {
							xp = xp - 1;
							table.append(1, *xp);
						}
#endif
					}

	vector<Item>	reverseContext(const vector<Item>& ctx);
	void			align(string& table, int alignment);

	// --- XML output ---
	void			xmlOut(const char* s);
	void			xmlOut(const string& s);
	void			xmlOut(char c);
	string			xmlString(vector<Item>::const_iterator b, vector<Item>::const_iterator e, bool isUnicode);
	string			getContextID(const vector<Item>& ctx, bool isUnicode);
};
/* One entry of the character-name table: a code value (usv, presumably a
   Unicode scalar value) paired with a pointer to its name text. */
struct CharName {
	unsigned int	usv;
	const char*		name;
};

/* Array of CharName entries with C language linkage; only declared here,
   the definition lives outside this header. */
extern "C" CharName	gUnicodeNames[];
#endif /* __Compiler_H__ */
|