1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collation.h
*
* created on: 2010oct27
* created by: Markus W. Scherer
*/
#ifndef __COLLATION_H__
#define __COLLATION_H__
#include <_foundation_unicode/utypes.h>
#if !UCONFIG_NO_COLLATION
U_NAMESPACE_BEGIN
/**
* Collation v2 basic definitions and static helper functions.
*
* Data structures except for expansion tables store 32-bit CEs which are
* either specials (see tags below) or are compact forms of 64-bit CEs.
*/
class U_I18N_API Collation {
public:
// Special sort key bytes for all levels.
static const uint8_t TERMINATOR_BYTE = 0;
static const uint8_t LEVEL_SEPARATOR_BYTE = 1;
/** The secondary/tertiary lower limit for tailoring before any root elements. */
static const uint32_t BEFORE_WEIGHT16 = 0x0100;
/**
* Merge-sort-key separator.
* Same as the unique primary and identical-level weights of U+FFFE.
* Must not be used as primary compression low terminator.
* Otherwise usable.
*/
static const uint8_t MERGE_SEPARATOR_BYTE = 2;
static const uint32_t MERGE_SEPARATOR_PRIMARY = 0x02000000; // U+FFFE
static const uint32_t MERGE_SEPARATOR_CE32 = 0x02000505; // U+FFFE
/**
* Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE.
* Reserved value in primary second byte if the lead byte is compressible.
* Otherwise usable in all CE weight bytes.
*/
static const uint8_t PRIMARY_COMPRESSION_LOW_BYTE = 3;
/**
* Primary compression high terminator.
* Reserved value in primary second byte if the lead byte is compressible.
* Otherwise usable in all CE weight bytes.
*/
static const uint8_t PRIMARY_COMPRESSION_HIGH_BYTE = 0xff;
/** Default secondary/tertiary weight lead byte. */
static const uint8_t COMMON_BYTE = 5;
static const uint32_t COMMON_WEIGHT16 = 0x0500;
/** Middle 16 bits of a CE with a common secondary weight. */
static const uint32_t COMMON_SECONDARY_CE = 0x05000000;
/** Lower 16 bits of a CE with a common tertiary weight. */
static const uint32_t COMMON_TERTIARY_CE = 0x0500;
/** Lower 32 bits of a CE with common secondary and tertiary weights. */
static const uint32_t COMMON_SEC_AND_TER_CE = 0x05000500;
static const uint32_t SECONDARY_MASK = 0xffff0000;
static const uint32_t CASE_MASK = 0xc000;
static const uint32_t SECONDARY_AND_CASE_MASK = SECONDARY_MASK | CASE_MASK;
/** Only the 2*6 bits for the pure tertiary weight. */
static const uint32_t ONLY_TERTIARY_MASK = 0x3f3f;
/** Only the secondary & tertiary bits; no case, no quaternary. */
static const uint32_t ONLY_SEC_TER_MASK = SECONDARY_MASK | ONLY_TERTIARY_MASK;
/** Case bits and tertiary bits. */
static const uint32_t CASE_AND_TERTIARY_MASK = CASE_MASK | ONLY_TERTIARY_MASK;
static const uint32_t QUATERNARY_MASK = 0xc0;
/** Case bits and quaternary bits. */
static const uint32_t CASE_AND_QUATERNARY_MASK = CASE_MASK | QUATERNARY_MASK;
static const uint8_t UNASSIGNED_IMPLICIT_BYTE = 0xfe; // compressible
/**
* First unassigned: AlphabeticIndex overflow boundary.
* We want a 3-byte primary so that it fits into the root elements table.
*
* This 3-byte primary will not collide with
* any unassigned-implicit 4-byte primaries because
* the first few hundred Unicode code points all have real mappings.
*/
static const uint32_t FIRST_UNASSIGNED_PRIMARY = 0xfe040200;
static const uint8_t TRAIL_WEIGHT_BYTE = 0xff; // not compressible
static const uint32_t FIRST_TRAILING_PRIMARY = 0xff020200; // [first trailing]
static const uint32_t MAX_PRIMARY = 0xffff0000; // U+FFFF
static const uint32_t MAX_REGULAR_CE32 = 0xffff0505; // U+FFFF
// CE32 value for U+FFFD as well as illegal UTF-8 byte sequences (which behave like U+FFFD).
// We use the third-highest primary weight for U+FFFD (as in UCA 6.3+).
static const uint32_t FFFD_PRIMARY = MAX_PRIMARY - 0x20000;
static const uint32_t FFFD_CE32 = MAX_REGULAR_CE32 - 0x20000;
/**
* A CE32 is special if its low byte is this or greater.
* Impossible case bits 11 mark special CE32s.
* This value itself is used to indicate a fallback to the base collator.
*/
static const uint8_t SPECIAL_CE32_LOW_BYTE = 0xc0;
static const uint32_t FALLBACK_CE32 = SPECIAL_CE32_LOW_BYTE;
/**
* Low byte of a long-primary special CE32.
*/
static const uint8_t LONG_PRIMARY_CE32_LOW_BYTE = 0xc1; // SPECIAL_CE32_LOW_BYTE | LONG_PRIMARY_TAG
static const uint32_t UNASSIGNED_CE32 = 0xffffffff; // Compute an unassigned-implicit CE.
static const uint32_t NO_CE32 = 1;
/** No CE: End of input. Only used in runtime code, not stored in data. */
static const uint32_t NO_CE_PRIMARY = 1; // not a left-adjusted weight
static const uint32_t NO_CE_WEIGHT16 = 0x0100; // weight of LEVEL_SEPARATOR_BYTE
static const int64_t NO_CE = INT64_C(0x101000100); // NO_CE_PRIMARY, NO_CE_WEIGHT16, NO_CE_WEIGHT16
/** Sort key levels. */
enum Level {
/** Unspecified level. */
NO_LEVEL,
PRIMARY_LEVEL,
SECONDARY_LEVEL,
CASE_LEVEL,
TERTIARY_LEVEL,
QUATERNARY_LEVEL,
IDENTICAL_LEVEL,
/** Beyond sort key bytes. */
ZERO_LEVEL
};
/**
* Sort key level flags: xx_FLAG = 1 << xx_LEVEL.
* In Java, use enum Level with flag() getters, or use EnumSet rather than hand-made bit sets.
*/
static const uint32_t NO_LEVEL_FLAG = 1;
static const uint32_t PRIMARY_LEVEL_FLAG = 2;
static const uint32_t SECONDARY_LEVEL_FLAG = 4;
static const uint32_t CASE_LEVEL_FLAG = 8;
static const uint32_t TERTIARY_LEVEL_FLAG = 0x10;
static const uint32_t QUATERNARY_LEVEL_FLAG = 0x20;
static const uint32_t IDENTICAL_LEVEL_FLAG = 0x40;
static const uint32_t ZERO_LEVEL_FLAG = 0x80;
/**
* Special-CE32 tags, from bits 3..0 of a special 32-bit CE.
* Bits 31..8 are available for tag-specific data.
* Bits 5..4: Reserved. May be used in the future to indicate lccc!=0 and tccc!=0.
*/
enum {
/**
* Fall back to the base collator.
* This is the tag value in SPECIAL_CE32_LOW_BYTE and FALLBACK_CE32.
* Bits 31..8: Unused, 0.
*/
FALLBACK_TAG = 0,
/**
* Long-primary CE with COMMON_SEC_AND_TER_CE.
* Bits 31..8: Three-byte primary.
*/
LONG_PRIMARY_TAG = 1,
/**
* Long-secondary CE with zero primary.
* Bits 31..16: Secondary weight.
* Bits 15.. 8: Tertiary weight.
*/
LONG_SECONDARY_TAG = 2,
/**
* Unused.
* May be used in the future for single-byte secondary CEs (SHORT_SECONDARY_TAG),
* storing the secondary in bits 31..24, the ccc in bits 23..16,
* and the tertiary in bits 15..8.
*/
RESERVED_TAG_3 = 3,
/**
* Latin mini expansions of two simple CEs [pp, 05, tt] [00, ss, 05].
* Bits 31..24: Single-byte primary weight pp of the first CE.
* Bits 23..16: Tertiary weight tt of the first CE.
* Bits 15.. 8: Secondary weight ss of the second CE.
*/
LATIN_EXPANSION_TAG = 4,
/**
* Points to one or more simple/long-primary/long-secondary 32-bit CE32s.
* Bits 31..13: Index into uint32_t table.
* Bits 12.. 8: Length=1..31.
*/
EXPANSION32_TAG = 5,
/**
* Points to one or more 64-bit CEs.
* Bits 31..13: Index into CE table.
* Bits 12.. 8: Length=1..31.
*/
EXPANSION_TAG = 6,
/**
* Builder data, used only in the CollationDataBuilder, not in runtime data.
*
* If bit 8 is 0: Builder context, points to a list of context-sensitive mappings.
* Bits 31..13: Index to the builder's list of ConditionalCE32 for this character.
* Bits 12.. 9: Unused, 0.
*
* If bit 8 is 1 (IS_BUILDER_JAMO_CE32): Builder-only jamoCE32 value.
* The builder fetches the Jamo CE32 from the trie.
* Bits 31..13: Jamo code point.
* Bits 12.. 9: Unused, 0.
*/
BUILDER_DATA_TAG = 7,
/**
* Points to prefix trie.
* Bits 31..13: Index into prefix/contraction data.
* Bits 12.. 8: Unused, 0.
*/
PREFIX_TAG = 8,
/**
* Points to contraction data.
* Bits 31..13: Index into prefix/contraction data.
* Bit 12: Unused, 0.
* Bit 11: CONTRACT_HAS_STARTER flag. (Used by ICU4X only.)
* Bit 10: CONTRACT_TRAILING_CCC flag.
* Bit 9: CONTRACT_NEXT_CCC flag.
* Bit 8: CONTRACT_SINGLE_CP_NO_MATCH flag.
*/
CONTRACTION_TAG = 9,
/**
* Decimal digit.
* Bits 31..13: Index into uint32_t table for non-numeric-collation CE32.
* Bit 12: Unused, 0.
* Bits 11.. 8: Digit value 0..9.
*/
DIGIT_TAG = 10,
/**
* Tag for U+0000, for moving the NUL-termination handling
* from the regular fastpath into specials-handling code.
* Bits 31..8: Unused, 0.
*/
U0000_TAG = 11,
/**
* Tag for a Hangul syllable.
* Bits 31..9: Unused, 0.
* Bit 8: HANGUL_NO_SPECIAL_JAMO flag.
*/
HANGUL_TAG = 12,
/**
* Tag for a lead surrogate code unit.
* Optional optimization for UTF-16 string processing.
* Bits 31..10: Unused, 0.
* 9.. 8: =0: All associated supplementary code points are unassigned-implicit.
* =1: All associated supplementary code points fall back to the base data.
* else: (Normally 2) Look up the data for the supplementary code point.
*/
LEAD_SURROGATE_TAG = 13,
/**
* Tag for CEs with primary weights in code point order.
* Bits 31..13: Index into CE table, for one data "CE".
* Bits 12.. 8: Unused, 0.
*
* This data "CE" has the following bit fields:
* Bits 63..32: Three-byte primary pppppp00.
* 31.. 8: Start/base code point of the in-order range.
* 7: Flag isCompressible primary.
* 6.. 0: Per-code point primary-weight increment.
*/
OFFSET_TAG = 14,
/**
* Implicit CE tag. Compute an unassigned-implicit CE.
* All bits are set (UNASSIGNED_CE32=0xffffffff).
*/
IMPLICIT_TAG = 15
};
static UBool isAssignedCE32(uint32_t ce32) {
return ce32 != FALLBACK_CE32 && ce32 != UNASSIGNED_CE32;
}
/**
* We limit the number of CEs in an expansion
* so that we can use a small number of length bits in the data structure,
* and so that an implementation can copy CEs at runtime without growing a destination buffer.
*/
static const int32_t MAX_EXPANSION_LENGTH = 31;
static const int32_t MAX_INDEX = 0x7ffff;
/**
* Set if there is no match for the single (no-suffix) character itself.
* This is only possible if there is a prefix.
* In this case, discontiguous contraction matching cannot add combining marks
* starting from an empty suffix.
* The default CE32 is used anyway if there is no suffix match.
*/
static const uint32_t CONTRACT_SINGLE_CP_NO_MATCH = 0x100;
/** Set if the first character of every contraction suffix has lccc!=0. */
static const uint32_t CONTRACT_NEXT_CCC = 0x200;
/** Set if any contraction suffix ends with lccc!=0. */
static const uint32_t CONTRACT_TRAILING_CCC = 0x400;
/** Set if any contraction suffix contains a starter. (Used by ICU4X only.) */
static const uint32_t CONTRACT_HAS_STARTER = 0x800;
/** For HANGUL_TAG: None of its Jamo CE32s isSpecialCE32(). */
static const uint32_t HANGUL_NO_SPECIAL_JAMO = 0x100;
static const uint32_t LEAD_ALL_UNASSIGNED = 0;
static const uint32_t LEAD_ALL_FALLBACK = 0x100;
static const uint32_t LEAD_MIXED = 0x200;
static const uint32_t LEAD_TYPE_MASK = 0x300;
static uint32_t makeLongPrimaryCE32(uint32_t p) { return p | LONG_PRIMARY_CE32_LOW_BYTE; }
/** Turns the long-primary CE32 into a primary weight pppppp00. */
static inline uint32_t primaryFromLongPrimaryCE32(uint32_t ce32) {
return ce32 & 0xffffff00;
}
static inline int64_t ceFromLongPrimaryCE32(uint32_t ce32) {
return ((int64_t)(ce32 & 0xffffff00) << 32) | COMMON_SEC_AND_TER_CE;
}
static uint32_t makeLongSecondaryCE32(uint32_t lower32) {
return lower32 | SPECIAL_CE32_LOW_BYTE | LONG_SECONDARY_TAG;
}
static inline int64_t ceFromLongSecondaryCE32(uint32_t ce32) {
return ce32 & 0xffffff00;
}
/** Makes a special CE32 with tag, index and length. */
static uint32_t makeCE32FromTagIndexAndLength(int32_t tag, int32_t index, int32_t length) {
return (index << 13) | (length << 8) | SPECIAL_CE32_LOW_BYTE | tag;
}
/** Makes a special CE32 with only tag and index. */
static uint32_t makeCE32FromTagAndIndex(int32_t tag, int32_t index) {
return (index << 13) | SPECIAL_CE32_LOW_BYTE | tag;
}
static inline UBool isSpecialCE32(uint32_t ce32) {
return (ce32 & 0xff) >= SPECIAL_CE32_LOW_BYTE;
}
static inline int32_t tagFromCE32(uint32_t ce32) {
return (int32_t)(ce32 & 0xf);
}
static inline UBool hasCE32Tag(uint32_t ce32, int32_t tag) {
return isSpecialCE32(ce32) && tagFromCE32(ce32) == tag;
}
static inline UBool isLongPrimaryCE32(uint32_t ce32) {
return hasCE32Tag(ce32, LONG_PRIMARY_TAG);
}
static UBool isSimpleOrLongCE32(uint32_t ce32) {
return !isSpecialCE32(ce32) ||
tagFromCE32(ce32) == LONG_PRIMARY_TAG ||
tagFromCE32(ce32) == LONG_SECONDARY_TAG;
}
/**
* @return true if the ce32 yields one or more CEs without further data lookups
*/
static UBool isSelfContainedCE32(uint32_t ce32) {
return !isSpecialCE32(ce32) ||
tagFromCE32(ce32) == LONG_PRIMARY_TAG ||
tagFromCE32(ce32) == LONG_SECONDARY_TAG ||
tagFromCE32(ce32) == LATIN_EXPANSION_TAG;
}
static inline UBool isPrefixCE32(uint32_t ce32) {
return hasCE32Tag(ce32, PREFIX_TAG);
}
static inline UBool isContractionCE32(uint32_t ce32) {
return hasCE32Tag(ce32, CONTRACTION_TAG);
}
static inline UBool ce32HasContext(uint32_t ce32) {
return isSpecialCE32(ce32) &&
(tagFromCE32(ce32) == PREFIX_TAG ||
tagFromCE32(ce32) == CONTRACTION_TAG);
}
/**
* Get the first of the two Latin-expansion CEs encoded in ce32.
* @see LATIN_EXPANSION_TAG
*/
static inline int64_t latinCE0FromCE32(uint32_t ce32) {
return ((int64_t)(ce32 & 0xff000000) << 32) | COMMON_SECONDARY_CE | ((ce32 & 0xff0000) >> 8);
}
/**
* Get the second of the two Latin-expansion CEs encoded in ce32.
* @see LATIN_EXPANSION_TAG
*/
static inline int64_t latinCE1FromCE32(uint32_t ce32) {
return ((ce32 & 0xff00) << 16) | COMMON_TERTIARY_CE;
}
/**
* Returns the data index from a special CE32.
*/
static inline int32_t indexFromCE32(uint32_t ce32) {
return (int32_t)(ce32 >> 13);
}
/**
* Returns the data length from a ce32.
*/
static inline int32_t lengthFromCE32(uint32_t ce32) {
return (ce32 >> 8) & 31;
}
/**
* Returns the digit value from a DIGIT_TAG ce32.
*/
static inline char digitFromCE32(uint32_t ce32) {
return (char)((ce32 >> 8) & 0xf);
}
/** Returns a 64-bit CE from a simple CE32 (not special). */
static inline int64_t ceFromSimpleCE32(uint32_t ce32) {
// normal form ppppsstt -> pppp0000ss00tt00
// assert (ce32 & 0xff) < SPECIAL_CE32_LOW_BYTE
return ((int64_t)(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | ((ce32 & 0xff) << 8);
}
/** Returns a 64-bit CE from a simple/long-primary/long-secondary CE32. */
static inline int64_t ceFromCE32(uint32_t ce32) {
uint32_t tertiary = ce32 & 0xff;
if(tertiary < SPECIAL_CE32_LOW_BYTE) {
// normal form ppppsstt -> pppp0000ss00tt00
return ((int64_t)(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | (tertiary << 8);
} else {
ce32 -= tertiary;
if((tertiary & 0xf) == LONG_PRIMARY_TAG) {
// long-primary form ppppppC1 -> pppppp00050000500
return ((int64_t)ce32 << 32) | COMMON_SEC_AND_TER_CE;
} else {
// long-secondary form ssssttC2 -> 00000000sssstt00
// assert (tertiary & 0xf) == LONG_SECONDARY_TAG
return ce32;
}
}
}
/** Creates a CE from a primary weight. */
static inline int64_t makeCE(uint32_t p) {
return ((int64_t)p << 32) | COMMON_SEC_AND_TER_CE;
}
/**
* Creates a CE from a primary weight,
* 16-bit secondary/tertiary weights, and a 2-bit quaternary.
*/
static inline int64_t makeCE(uint32_t p, uint32_t s, uint32_t t, uint32_t q) {
return ((int64_t)p << 32) | (s << 16) | t | (q << 6);
}
/**
* Increments a 2-byte primary by a code point offset.
*/
static uint32_t incTwoBytePrimaryByOffset(uint32_t basePrimary, UBool isCompressible,
int32_t offset);
/**
* Increments a 3-byte primary by a code point offset.
*/
static uint32_t incThreeBytePrimaryByOffset(uint32_t basePrimary, UBool isCompressible,
int32_t offset);
/**
* Decrements a 2-byte primary by one range step (1..0x7f).
*/
static uint32_t decTwoBytePrimaryByOneStep(uint32_t basePrimary, UBool isCompressible, int32_t step);
/**
* Decrements a 3-byte primary by one range step (1..0x7f).
*/
static uint32_t decThreeBytePrimaryByOneStep(uint32_t basePrimary, UBool isCompressible, int32_t step);
/**
* Computes a 3-byte primary for c's OFFSET_TAG data "CE".
*/
static uint32_t getThreeBytePrimaryForOffsetData(UChar32 c, int64_t dataCE);
/**
* Returns the unassigned-character implicit primary weight for any valid code point c.
*/
static uint32_t unassignedPrimaryFromCodePoint(UChar32 c);
static inline int64_t unassignedCEFromCodePoint(UChar32 c) {
return makeCE(unassignedPrimaryFromCodePoint(c));
}
private:
Collation() = delete; // No instantiation.
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_COLLATION
#endif // __COLLATION_H__
|