1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520
|
//
// trie.h - trie definition
// Created: 23 May 1996
// Author: Theppitak Karoonboonyanan
//
#ifndef TRIE_INC
#define TRIE_INC
#include "misc/typedefs.h"
#include "vmem/vmem.h"
#include "vmem/dataheap.h" // solely for DataPtr type
typedef int32 State; // index into double-array and TAIL
typedef DataPtr Index; // index into data associating to keys
const Index ErrorIndex = ~0U;
const State RootStateBase = 1;
enum EWalkResult {
INTERNAL, TERMINAL, CRASH
};
// Alphabet set
typedef unsigned char Char;
const int AlphabetSize = 256;
const Char Terminator = 255;
const int MaxKeyLen = 256;
class SymbolSet {
public:
// Constructors
SymbolSet();
SymbolSet(int nElements, ...);
// Symbol List
const Char* Symbols() const;
// Basic Operations
int NElements() const;
bool Contains(Char c) const;
void MakeNull();
// Binary Set Operators
const SymbolSet& operator+=(Char c);
const SymbolSet& operator-=(Char c);
/*
const SymbolSet& operator+=(const SymbolSet& aSet);
const SymbolSet& operator-=(const SymbolSet& aSet);
const SymbolSet& operator*=(const SymbolSet& aSet);
friend SymbolSet operator+(const SymbolSet& set1, Char c);
friend SymbolSet operator-(const SymbolSet& set1, Char c);
friend SymbolSet operator+(const SymbolSet& set1, const SymbolSet& set2);
friend SymbolSet operator-(const SymbolSet& set1, const SymbolSet& set2);
friend SymbolSet operator*(const SymbolSet& set1, const SymbolSet& set2);
*/
private: // private data
Char symbols[AlphabetSize+1];
int nSymbols;
};
inline const Char* SymbolSet::Symbols() const { return symbols; }
inline int SymbolSet::NElements() const { return nSymbols; }
inline void SymbolSet::MakeNull() { symbols[0] = 0; nSymbols = 0; }
///////////////////////////////////////////////////////////////////////////////
// PART 1 : DOUBLE-ARRAY STRUCTURE
///////////////////////////////////////////////////////////////////////////////
// Cells in Double-Array Structure
struct DACell {
State Base;
State Check;
};
const int BaseOffset = 0;
const int CheckOffset = BaseOffset + sizeof(State);
//
// Double-Array Pool (derived from VirtualMem class --See vmem.h)
// -----------------
// This class just shields up the more general VirtualMem class. It represents
// a virtual, dynamic-sized BASE and CHECK arrays.
//
class DoubleArray : public VirtualMem {
public: // public functions
DoubleArray(const char* fileName, ios_base::openmode openModes);
// BASE & CHECK referencings
void SetBase(State s, State b);
void SetCheck(State s, State c);
// BASE & CHECK Read-Only Access
State Base(State s);
State Check(State s);
};
inline DoubleArray::DoubleArray(const char* fileName, ios_base::openmode openModes)
: VirtualMem(fileName, openModes) {}
inline void DoubleArray::SetBase(State s, State b)
{ SetInt32(Pointer(s*sizeof(DACell) + BaseOffset), b); }
inline void DoubleArray::SetCheck(State s, State c)
{ SetInt32(Pointer(s*sizeof(DACell) + CheckOffset), c); }
inline State DoubleArray::Base(State s)
{ return Int32(Pointer(s*sizeof(DACell) + BaseOffset)); }
inline State DoubleArray::Check(State s)
{ return Int32(Pointer(s*sizeof(DACell) + CheckOffset)); }
//
// Branches
// --------
// A tree structure, represented in double-array structure, of the branching
// part of the trie. There are some improvements from Aoe's :
//
// 1. Terminal-nodes are allowed in the Branches structure. Terminal-nodes are
// all nodes with incoming edge labeled Terminator, that is, a node s of
// Branches is terminal if:
//
// BASE[CHECK[s]] + Terminator = s
//
// This can reduce some steps in retrievals: for words which are prefixes
// of some other words, their last branches will be labeled Terminator.
// Thus, pointers to TAIL blocks are no longer needed. We can immediately
// store the associating data in such nodes.
//
// 2. The free-space list (G-link) is adapted to circular doubly-linked list.
// This is accomplished by letting BASE of a free cell points to the
// previous free cell (while CHECK points to next). Doing so can reduce the
// time used to manipulate the list.
//
// The entry point to the list is stored in BASE[0] and CHECK[0], and the
// first cell available for use is BASE[1] and CHECK[1], which stores the
// root node.
//
class Branches {
public:
Branches(const char* fileName, int nIndices, ios_base::openmode openModes);
// File Status
bool IsSwapFileOpen();
bool IsSwapFileGood() const;
bool IsSwapFileBad() const;
bool IsSwapFileFail() const;
// Node info
State ParentOf(State s);
int OutDegree(State s);
SymbolSet OutputSet(State s);
bool IsUsedNode(State s);
bool IsSeparateNode(State s);
bool IsTerminal(State s);
State TailState(State s);
// Key's data access
Index GetKeyData(State aSepNode);
void SetKeyData(State aSepNode, Index data);
static State RootState(int indexNo);
// Walking & Retrieval
State Walk(State s, Char c);
EWalkResult WalkResult() const;
// Insertion & Deletion
State InsertBranch(State from, Char c);
void DeleteNode(State s);
// For Storage Enumeration
State TotalCells() const;
#ifdef TRIE_DEBUG
void AssertValid();
#endif // TRIE_DEBUG
private: // private const
enum {
unusedBase = 0
};
private: // private functions
// free-list operations
void extendDAPool();
void removeFreeCell(State s);
void insertFreeCell(State s);
bool isVacantFor(const SymbolSet& aSet, State aBase);
State findFreeBase(const SymbolSet& aSet); // Aoe's X_CHECK
void relocateBase(State s, State newBase, State* pInspectedState = 0);
private: // private data
// the double-array structure
DoubleArray da;
State totalCells;
// walk result
EWalkResult walkResult;
};
// File Status
inline bool Branches::IsSwapFileOpen() { return da.IsSwapFileOpen(); }
inline bool Branches::IsSwapFileGood() const { return da.IsSwapFileGood(); }
inline bool Branches::IsSwapFileBad() const { return da.IsSwapFileBad(); }
inline bool Branches::IsSwapFileFail() const { return da.IsSwapFileFail(); }
inline State Branches::ParentOf(State s) { return da.Check(s); }
inline bool Branches::IsUsedNode(State s) { return da.Check(s) > 0; }
inline bool Branches::IsSeparateNode(State s)
{ return IsUsedNode(s) && da.Base(s) < 0; }
inline bool Branches::IsTerminal(State s)
{ return IsUsedNode(s) && da.Base(ParentOf(s)) + Terminator == s; }
inline Index Branches::GetKeyData(State aSepNode)
{ return da.Base(aSepNode); }
inline void Branches::SetKeyData(State aSepNode, Index data)
{ da.SetBase(aSepNode, data); }
inline State Branches::RootState(int indexNo)
{ return RootStateBase + indexNo; }
inline EWalkResult Branches::WalkResult() const { return walkResult; }
inline State Branches::TotalCells() const { return totalCells; }
///////////////////////////////////////////////////////////////////////////////
// PART 2 : TAIL HEAP
///////////////////////////////////////////////////////////////////////////////
//
// TailBlock
// ---------
// A data structure to transfer each TAIL element as a whole block.
// The block is composed of :
// - suffix : string of remaining part of the keys in double-array structure
// - data : the data associating with each key
//
class TailBlock {
public: // public functions
// Constructors & Destructor
TailBlock();
TailBlock(const Char* suffix, Index data);
~TailBlock();
// Information Accessing
const Char* Suffix() const;
Index Data() const;
void SetSuffix(const Char* aSuffix);
void SetData(Index aData);
private: // private functions
// never make a copy
TailBlock(const TailBlock&);
const TailBlock& operator=(const TailBlock&);
private: // private data
Char* suffix;
Index data;
};
inline const Char* TailBlock::Suffix() const { return suffix; }
inline Index TailBlock::Data() const { return data; }
//
// Tails
// -----
// The suffix part of the keys distinguished by the Branches structure.
// Blocks, each of which contains suffix and associating data, can be allocated,
// but, at present, cannot be freed. It needs further storage compaction stage.
// (See the reason for this in my document.)
//
// The single-step and multiple-step walk on the suffixes are also provided.
// To do so, the TAIL's State is defined as follows:
//
// lower 8 bits - contains character offset in the suffix string
// upper 24 bits - contains pointer to the beginning of the block
//
// thus completely describes a certain position in a certain suffix.
//
class Tails {
public:
// Constructor
Tails(const char* fileName, ios_base::openmode openModes);
// Swap file status
bool OpenSwapFile(const char* swapFileName, ios_base::openmode iosModes);
bool CloseSwapFile();
const char* SwapFileName() const;
bool IsSwapFileOpen();
bool IsSwapFileGood() const;
bool IsSwapFileBad() const;
bool IsSwapFileFail() const;
// Key data access
Index GetKeyData(State t);
void SetKeyData(State t, Index data);
// TAIL Block Access
void GetBlock(State t, TailBlock* pBlock);
void SetBlock(State t, const TailBlock& aBlock);
// Walking & Retrieval
Char NextSymbol(State t);
State Walk(State t, Char c);
State Walk(State t, const Char* key);
EWalkResult WalkResult() const;
int NWalked() const;
bool IsTerminal(State t);
// Insertion & Deletion
State AllocString(const Char* str, Index data);
void ChangeString(State t, const Char* str, Index data);
void FreeBlock(State t);
private: // private constants
// TAIL state representation
enum {
offsetBits = 8,
offsetMask = ~((~0L) << offsetBits)
};
// TAIL block representation
enum {
dataOffset = 0,
suffixOffset = 4
};
private: // private functions
// State decoding
static Pointer blockBegin(State t);
static int charOffset(State t);
// State encoding
static State makeTailState(Pointer blockBegin, int offset);
// Block representation
static Pointer dataBegin(State t);
static Pointer suffixBegin(State t);
private: // private data
VirtualHeap dataHeap;
// walk result
EWalkResult walkResult; // set by both Walk() functions
int nWalked; // set by Walk(State, const char*)
};
inline bool Tails::OpenSwapFile(const char* swapFileName, ios_base::openmode iosModes)
{ return dataHeap.OpenSwapFile(swapFileName, iosModes); }
inline bool Tails::CloseSwapFile() { return dataHeap.CloseSwapFile(); }
inline const char* Tails::SwapFileName() const
{ return dataHeap.SwapFileName(); }
inline bool Tails::IsSwapFileOpen() { return dataHeap.IsSwapFileOpen(); }
inline bool Tails::IsSwapFileGood() const { return dataHeap.IsSwapFileGood(); }
inline bool Tails::IsSwapFileBad() const { return dataHeap.IsSwapFileBad(); }
inline bool Tails::IsSwapFileFail() const { return dataHeap.IsSwapFileFail(); }
inline Pointer Tails::blockBegin(State t) { return t >> offsetBits; }
inline int Tails::charOffset(State t) { return int(t & offsetMask); }
inline State Tails::makeTailState(Pointer blockBegin, int offset)
{ return (blockBegin << offsetBits) | (offset & offsetMask); }
inline Pointer Tails::dataBegin(State t) { return blockBegin(t) + dataOffset; }
inline Pointer Tails::suffixBegin(State t) { return blockBegin(t) + suffixOffset; }
// Key data access
inline Index Tails::GetKeyData(State t)
{ return dataHeap.UInt32(dataBegin(t)); }
inline void Tails::SetKeyData(State t, Index data)
{ dataHeap.SetUInt32(dataBegin(t), data); }
inline Char Tails::NextSymbol(State t)
{ return dataHeap.Byte(suffixBegin(t) + charOffset(t)); }
inline EWalkResult Tails::WalkResult() const { return walkResult; }
inline int Tails::NWalked() const { return nWalked; }
inline bool Tails::IsTerminal(State t)
{ return dataHeap.Byte(suffixBegin(t) + charOffset(t) - 1) == Terminator; }
///////////////////////////////////////////////////////////////////////////////
// PART 3 : THE TRIE (DOUBLE-ARRAY + TAIL HEAP)
///////////////////////////////////////////////////////////////////////////////
// helper functions
const Char* AppendTerminator(const Char* key);
const char* TruncateTerminator(const char* key);
//
// Trie (Multiple Index Trie)
// ----
// This class gives a general view on tries. The class users can insert keys,
// delete keys, check key inclusion, and retrieve the data associating
// with the key. Furthermore, the users also can walk along the trie with
// a key character by character. To do this, the State of the operation is
// defined so that the users can keep track of where they currently are.
//
// As named, the tries of this class can store more than one set of indices
// in the same storage. Each set is indicated by indexNo argument, of which
// value begins at zero.
//
// This class is implemented by using double-array structure with suffix
// compression. (See Aoe[1989] for the detail.) That is, the trie is composed
// of 2 structures:
// - Branches : the branching part which distinguishes keys (it's a tree)
// - Tails : the remaining part of keys after the distinguishing position
//
// And, to indicate in which structure a State is, we use its sign as follows:
// + : represents multi-nodes (i.e. nodes in the Branches structure)
// - : represents single-nodes (i.e. nodes in the Tails structure)
// 0 : undefined
//
// Terminology:
// - multi-node : internal node in the Branches structure, which
// contains a pointer to the beginning pos of the
// sparse table row containing their child nodes
// - single-node : every node in the Tails structure
// - separate-node : leaf node in the Branches structure, which contains
// a pointer to its remaining part in the Tails
// structure. (This can be distinguished from the pointer
// to table row by its sign.)
// - terminal-node : node with the incoming edge labeled Terminator
// A terminal-node can be in both Branches and Tails.
//
// There is an improvement from Aoe's: the deletion algorithm is more complete.
// Instead of just mark the separate node of a key as unused, the other
// relevant nodes are also deleted.
//
class Trie {
public:
typedef bool (*EnumFunc)(const Char* key, Index data);
enum ErrorCode {
NoError = 0,
CantOpenBranches,
CantOpenTails
};
public:
// constructor
Trie(
const char* branchesFileName,
const char* tailsFileName,
int nIndices,
ios_base::openmode openModes
);
// File Status
ErrorCode ErrorStatus();
// General Operations
bool InsertKey(int indexNo, const Char* key, Index dataIdx);
bool DeleteKey(int indexNo, const Char* key);
Index Retrieve(int indexNo, const Char* key);
// Walking & Retrieval
State StartState(int indexNo) const;
State Walk(State s, Char c);
State Terminate(State s); // walk with Terminator
State Walk(State s, const Char* key);
EWalkResult WalkResult() const;
int NWalked() const; // call after Walk(State, const char*) only
Index GetKeyData(State s);
// For Enumeration
int OutDegree(State s);
SymbolSet OutputSet(State s);
void Enumerate(int indexNo, EnumFunc fn);
// Storage Compaction
void Compact();
private: // private functions
// Aoe's A_INSERT & B_INSERT
void branchWithinBranches(State from, const Char* suffix, Index dataIdx);
void branchWithinTail(State sepNode, const Char* suffix, Index dataIdx);
bool enumKeys(State s, int level);
// storage compaction
void reallocTails();
private: // private data
// the trie structure
Branches branches;
Tails tails;
// walk result
EWalkResult walkResult; // set by both Walk() functions
int nWalked; // set by Walk(State, const Char*)
// Enumerate func
EnumFunc enumFunc;
};
// constructor
inline Trie::Trie(
const char* branchesFileName, const char* tailsFileName,
int nIndices, ios_base::openmode openModes
) :
branches(branchesFileName, nIndices, openModes),
tails(tailsFileName, openModes)
{ walkResult = INTERNAL; nWalked = 0; enumFunc = 0; }
inline State Trie::StartState(int indexNo) const
{
return Branches::RootState(indexNo);
}
inline State Trie::Terminate(State s) { return Walk(s, Terminator); }
inline EWalkResult Trie::WalkResult() const { return walkResult; }
inline int Trie::NWalked() const { return nWalked; }
#endif // TRIE_INC
|