1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationbuilder.h
*
* created on: 2013may06
* created by: Markus W. Scherer
*/
#ifndef __COLLATIONBUILDER_H__
#define __COLLATIONBUILDER_H__
#include <_foundation_unicode/utypes.h>
#if !UCONFIG_NO_COLLATION
#include <_foundation_unicode/uniset.h>
#include <_foundation_unicode/unistr.h>
#include "collationrootelements.h"
#include "collationruleparser.h"
#include "uvectr32.h"
#include "uvectr64.h"
struct UParseError;
U_NAMESPACE_BEGIN
struct CollationData;
struct CollationTailoring;
class CEFinalizer;
class CollationDataBuilder;
class Normalizer2;
class Normalizer2Impl;
class U_I18N_API CollationBuilder : public CollationRuleParser::Sink {
public:
CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode);
CollationBuilder(const CollationTailoring *base, UErrorCode &errorCode);
virtual ~CollationBuilder();
void disableFastLatin() { fastLatinEnabled = false; }
CollationTailoring *parseAndBuild(const UnicodeString &ruleString,
const UVersionInfo rulesVersion,
CollationRuleParser::Importer *importer,
UParseError *outParseError,
UErrorCode &errorCode);
const char *getErrorReason() const { return errorReason; }
private:
friend class CEFinalizer;
/** Implements CollationRuleParser::Sink. */
virtual void addReset(int32_t strength, const UnicodeString &str,
const char *&errorReason, UErrorCode &errorCode) override;
/**
* Returns the secondary or tertiary weight preceding the current node's weight.
* node=nodes[index].
*/
uint32_t getWeight16Before(int32_t index, int64_t node, int32_t level);
int64_t getSpecialResetPosition(const UnicodeString &str,
const char *&parserErrorReason, UErrorCode &errorCode);
/** Implements CollationRuleParser::Sink. */
virtual void addRelation(int32_t strength, const UnicodeString &prefix,
const UnicodeString &str, const UnicodeString &extension,
const char *&errorReason, UErrorCode &errorCode) override;
/**
* Picks one of the current CEs and finds or inserts a node in the graph
* for the CE + strength.
*/
int32_t findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason,
UErrorCode &errorCode);
int32_t findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode);
/** Finds or inserts the node for a root CE's primary weight. */
int32_t findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode);
/** Finds or inserts the node for a secondary or tertiary weight. */
int32_t findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level,
UErrorCode &errorCode);
/**
* Makes and inserts a new tailored node into the list, after the one at index.
* Skips over nodes of weaker strength to maintain collation order
* ("postpone insertion").
* @return the new node's index
*/
int32_t insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode);
/**
* Inserts a new node into the list, between list-adjacent items.
* The node's previous and next indexes must not be set yet.
* @return the new node's index
*/
int32_t insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node,
UErrorCode &errorCode);
/**
* Finds the node which implies or contains a common=05 weight of the given strength
* (secondary or tertiary), if the current node is stronger.
* Skips weaker nodes and tailored nodes if the current node is stronger
* and is followed by an explicit-common-weight node.
* Always returns the input index if that node is no stronger than the given strength.
*/
int32_t findCommonNode(int32_t index, int32_t strength) const;
void setCaseBits(const UnicodeString &nfdString,
const char *&parserErrorReason, UErrorCode &errorCode);
/** Implements CollationRuleParser::Sink. */
virtual void suppressContractions(const UnicodeSet &set, const char *&parserErrorReason,
UErrorCode &errorCode) override;
/** Implements CollationRuleParser::Sink. */
virtual void optimize(const UnicodeSet &set, const char *&parserErrorReason,
UErrorCode &errorCode) override;
/**
* Adds the mapping and its canonical closure.
* Takes ce32=dataBuilder->encodeCEs(...) so that the data builder
* need not re-encode the CEs multiple times.
*/
uint32_t addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
UErrorCode &errorCode);
uint32_t addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
UErrorCode &errorCode);
void addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
UErrorCode &errorCode);
UBool mergeCompositeIntoString(const UnicodeString &nfdString, int32_t indexAfterLastStarter,
UChar32 composite, const UnicodeString &decomp,
UnicodeString &newNFDString, UnicodeString &newString,
UErrorCode &errorCode) const;
UBool ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const;
UBool ignoreString(const UnicodeString &s, UErrorCode &errorCode) const;
UBool isFCD(const UnicodeString &s, UErrorCode &errorCode) const;
void closeOverComposites(UErrorCode &errorCode);
uint32_t addIfDifferent(const UnicodeString &prefix, const UnicodeString &str,
const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
UErrorCode &errorCode);
static UBool sameCEs(const int64_t ces1[], int32_t ces1Length,
const int64_t ces2[], int32_t ces2Length);
/**
* Walks the tailoring graph and overwrites tailored nodes with new CEs.
* After this, the graph is destroyed.
* The nodes array can then be used only as a source of tailored CEs.
*/
void makeTailoredCEs(UErrorCode &errorCode);
/**
* Counts the tailored nodes of the given strength up to the next node
* which is either stronger or has an explicit weight of this strength.
*/
static int32_t countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength);
/** Replaces temporary CEs with the final CEs they point to. */
void finalizeCEs(UErrorCode &errorCode);
/**
* Encodes "temporary CE" data into a CE that fits into the CE32 data structure,
* with 2-byte primary, 1-byte secondary and 6-bit tertiary,
* with valid CE byte values.
*
* The index must not exceed 20 bits (0xfffff).
* The strength must fit into 2 bits (UCOL_PRIMARY..UCOL_QUATERNARY).
*
* Temporary CEs are distinguished from real CEs by their use of
* secondary weights 06..45 which are otherwise reserved for compressed sort keys.
*
* The case bits are unused and available.
*/
static inline int64_t tempCEFromIndexAndStrength(int32_t index, int32_t strength) {
return
// CE byte offsets, to ensure valid CE bytes, and case bits 11
INT64_C(0x4040000006002000) +
// index bits 19..13 -> primary byte 1 = CE bits 63..56 (byte values 40..BF)
((int64_t)(index & 0xfe000) << 43) +
// index bits 12..6 -> primary byte 2 = CE bits 55..48 (byte values 40..BF)
((int64_t)(index & 0x1fc0) << 42) +
// index bits 5..0 -> secondary byte 1 = CE bits 31..24 (byte values 06..45)
((index & 0x3f) << 24) +
// strength bits 1..0 -> tertiary byte 1 = CE bits 13..8 (byte values 20..23)
(strength << 8);
}
static inline int32_t indexFromTempCE(int64_t tempCE) {
tempCE -= INT64_C(0x4040000006002000);
return
((int32_t)(tempCE >> 43) & 0xfe000) |
((int32_t)(tempCE >> 42) & 0x1fc0) |
((int32_t)(tempCE >> 24) & 0x3f);
}
static inline int32_t strengthFromTempCE(int64_t tempCE) {
return ((int32_t)tempCE >> 8) & 3;
}
static inline UBool isTempCE(int64_t ce) {
uint32_t sec = (uint32_t)ce >> 24;
return 6 <= sec && sec <= 0x45;
}
static inline int32_t indexFromTempCE32(uint32_t tempCE32) {
tempCE32 -= 0x40400620;
return
((int32_t)(tempCE32 >> 11) & 0xfe000) |
((int32_t)(tempCE32 >> 10) & 0x1fc0) |
((int32_t)(tempCE32 >> 8) & 0x3f);
}
static inline UBool isTempCE32(uint32_t ce32) {
return
(ce32 & 0xff) >= 2 && // not a long-primary/long-secondary CE32
6 <= ((ce32 >> 8) & 0xff) && ((ce32 >> 8) & 0xff) <= 0x45;
}
static int32_t ceStrength(int64_t ce);
/** At most 1M nodes, limited by the 20 bits in node bit fields. */
static const int32_t MAX_INDEX = 0xfffff;
/**
* Node bit 6 is set on a primary node if there are nodes
* with secondary values below the common secondary weight (05).
*/
static const int32_t HAS_BEFORE2 = 0x40;
/**
* Node bit 5 is set on a primary or secondary node if there are nodes
* with tertiary values below the common tertiary weight (05).
*/
static const int32_t HAS_BEFORE3 = 0x20;
/**
* Node bit 3 distinguishes a tailored node, which has no weight value,
* from a node with an explicit (root or default) weight.
*/
static const int32_t IS_TAILORED = 8;
static inline int64_t nodeFromWeight32(uint32_t weight32) {
return (int64_t)weight32 << 32;
}
static inline int64_t nodeFromWeight16(uint32_t weight16) {
return (int64_t)weight16 << 48;
}
static inline int64_t nodeFromPreviousIndex(int32_t previous) {
return (int64_t)previous << 28;
}
static inline int64_t nodeFromNextIndex(int32_t next) {
return next << 8;
}
static inline int64_t nodeFromStrength(int32_t strength) {
return strength;
}
static inline uint32_t weight32FromNode(int64_t node) {
return (uint32_t)(node >> 32);
}
static inline uint32_t weight16FromNode(int64_t node) {
return (uint32_t)(node >> 48) & 0xffff;
}
static inline int32_t previousIndexFromNode(int64_t node) {
return (int32_t)(node >> 28) & MAX_INDEX;
}
static inline int32_t nextIndexFromNode(int64_t node) {
return ((int32_t)node >> 8) & MAX_INDEX;
}
static inline int32_t strengthFromNode(int64_t node) {
return (int32_t)node & 3;
}
static inline UBool nodeHasBefore2(int64_t node) {
return (node & HAS_BEFORE2) != 0;
}
static inline UBool nodeHasBefore3(int64_t node) {
return (node & HAS_BEFORE3) != 0;
}
static inline UBool nodeHasAnyBefore(int64_t node) {
return (node & (HAS_BEFORE2 | HAS_BEFORE3)) != 0;
}
static inline UBool isTailoredNode(int64_t node) {
return (node & IS_TAILORED) != 0;
}
static inline int64_t changeNodePreviousIndex(int64_t node, int32_t previous) {
return (node & INT64_C(0xffff00000fffffff)) | nodeFromPreviousIndex(previous);
}
static inline int64_t changeNodeNextIndex(int64_t node, int32_t next) {
return (node & INT64_C(0xfffffffff00000ff)) | nodeFromNextIndex(next);
}
const Normalizer2 &nfd, &fcd;
const Normalizer2Impl &nfcImpl;
const CollationTailoring *base;
const CollationData *baseData;
const CollationRootElements rootElements;
uint32_t variableTop;
CollationDataBuilder *dataBuilder;
UBool fastLatinEnabled;
UBool icu4xMode;
UnicodeSet optimizeSet;
const char *errorReason;
int64_t ces[Collation::MAX_EXPANSION_LENGTH];
int32_t cesLength;
/**
* Indexes of nodes with root primary weights, sorted by primary.
* Compact form of a TreeMap from root primary to node index.
*
* This is a performance optimization for finding reset positions.
* Without this, we would have to search through the entire nodes list.
* It also allows storing root primary weights in list head nodes,
* without previous index, leaving room in root primary nodes for 32-bit primary weights.
*/
UVector32 rootPrimaryIndexes;
/**
* Data structure for assigning tailored weights and CEs.
* Doubly-linked lists of nodes in mostly collation order.
* Each list starts with a root primary node and ends with a nextIndex of 0.
*
* When there are any nodes in the list, then there is always a root primary node at index 0.
* This allows some code not to have to check explicitly for nextIndex==0.
*
* Root primary nodes have 32-bit weights but do not have previous indexes.
* All other nodes have at most 16-bit weights and do have previous indexes.
*
* Nodes with explicit weights store root collator weights,
* or default weak weights (e.g., secondary 05) for stronger nodes.
* "Tailored" nodes, with the IS_TAILORED bit set,
* do not store explicit weights but rather
* create a difference of a certain strength from the preceding node.
*
* A root node is followed by either
* - a root/default node of the same strength, or
* - a root/default node of the next-weaker strength, or
* - a tailored node of the same strength.
*
* A node of a given strength normally implies "common" weights on weaker levels.
*
* A node with HAS_BEFORE2 must be immediately followed by
* a secondary node with an explicit below-common weight, then a secondary tailored node,
* and later an explicit common-secondary node.
* The below-common weight can be a root weight,
* or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight
* or before the lowest root weight.
* (&[before 2] resets to an explicit secondary node so that
* the following addRelation(secondary) tailors right after that.
* If we did not have this node and instead were to reset on the primary node,
* then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.)
*
* If the flag is not set, then there are no explicit secondary nodes
* with the common or lower weights.
*
* Same for HAS_BEFORE3 for tertiary nodes and weights.
* A node must not have both flags set.
*
* Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs
* which point to stable indexes in this list,
* and temporary CEs stored in a CollationDataBuilder only point to tailored nodes.
*
* A temporary CE in the ces[] array may point to a non-tailored reset-before-position node,
* until the next relation is added.
*
* At the end, the tailored weights are allocated as necessary,
* then the tailored nodes are replaced with final CEs,
* and the CollationData is rewritten by replacing temporary CEs with final ones.
*
* We cannot simply insert new nodes in the middle of the array
* because that would invalidate the indexes stored in existing temporary CEs.
* We need to use a linked graph with stable indexes to existing nodes.
* A doubly-linked list seems easiest to maintain.
*
* Each node is stored as an int64_t, with its fields stored as bit fields.
*
* Root primary node:
* - primary weight: 32 bits 63..32
* - reserved/unused/zero: 4 bits 31..28
*
* Weaker root nodes & tailored nodes:
* - a weight: 16 bits 63..48
* + a root or default weight for a non-tailored node
* + unused/zero for a tailored node
* - index to the previous node: 20 bits 47..28
*
* All types of nodes:
* - index to the next node: 20 bits 27..8
* + nextIndex=0 in last node per root-primary list
* - reserved/unused/zero bits: bits 7, 4, 2
* - HAS_BEFORE2: bit 6
* - HAS_BEFORE3: bit 5
* - IS_TAILORED: bit 3
* - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0
*
* We could allocate structs with pointers, but we would have to store them
* in a pointer list so that they can be indexed from temporary CEs,
* and they would require more memory allocations.
*/
UVector64 nodes;
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_COLLATION
#endif // __COLLATIONBUILDER_H__
|