1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationfastlatin.h
*
* created on: 2013aug09
* created by: Markus W. Scherer
*/
#ifndef __COLLATIONFASTLATIN_H__
#define __COLLATIONFASTLATIN_H__
#include <_foundation_unicode/utypes.h>
#if !UCONFIG_NO_COLLATION
U_NAMESPACE_BEGIN
struct CollationData;
struct CollationSettings;
class U_I18N_API CollationFastLatin /* all static */ {
public:
/**
* Fast Latin format version (one byte 1..FF).
* Must be incremented for any runtime-incompatible changes,
* in particular, for changes to any of the following constants.
*
* When the major version number of the main data format changes,
* we can reset this fast Latin version to 1.
*/
static const uint16_t VERSION = 2;
static const int32_t LATIN_MAX = 0x17f;
static const int32_t LATIN_LIMIT = LATIN_MAX + 1;
static const int32_t LATIN_MAX_UTF8_LEAD = 0xc5; // UTF-8 lead byte of LATIN_MAX
static const int32_t PUNCT_START = 0x2000;
static const int32_t PUNCT_LIMIT = 0x2040;
// excludes U+FFFE & U+FFFF
static const int32_t NUM_FAST_CHARS = LATIN_LIMIT + (PUNCT_LIMIT - PUNCT_START);
// Note on the supported weight ranges:
// Analysis of UCA 6.3 and CLDR 23 non-search tailorings shows that
// the CEs for characters in the above ranges, excluding expansions with length >2,
// excluding contractions of >2 characters, and other restrictions
// (see the builder's getCEsFromCE32()),
// use at most about 150 primary weights,
// where about 94 primary weights are possibly-variable (space/punct/symbol/currency),
// at most 4 secondary before-common weights,
// at most 4 secondary after-common weights,
// at most 16 secondary high weights (in secondary CEs), and
// at most 4 tertiary after-common weights.
// The following ranges are designed to support slightly more weights than that.
// (en_US_POSIX is unusual: It creates about 64 variable + 116 Latin primaries.)
// Digits may use long primaries (preserving more short ones)
// or short primaries (faster) without changing this data structure.
// (If we supported numeric collation, then digits would have to have long primaries
// so that special handling does not affect the fast path.)
static const uint32_t SHORT_PRIMARY_MASK = 0xfc00; // bits 15..10
static const uint32_t INDEX_MASK = 0x3ff; // bits 9..0 for expansions & contractions
static const uint32_t SECONDARY_MASK = 0x3e0; // bits 9..5
static const uint32_t CASE_MASK = 0x18; // bits 4..3
static const uint32_t LONG_PRIMARY_MASK = 0xfff8; // bits 15..3
static const uint32_t TERTIARY_MASK = 7; // bits 2..0
static const uint32_t CASE_AND_TERTIARY_MASK = CASE_MASK | TERTIARY_MASK;
static const uint32_t TWO_SHORT_PRIMARIES_MASK =
(SHORT_PRIMARY_MASK << 16) | SHORT_PRIMARY_MASK; // 0xfc00fc00
static const uint32_t TWO_LONG_PRIMARIES_MASK =
(LONG_PRIMARY_MASK << 16) | LONG_PRIMARY_MASK; // 0xfff8fff8
static const uint32_t TWO_SECONDARIES_MASK =
(SECONDARY_MASK << 16) | SECONDARY_MASK; // 0x3e003e0
static const uint32_t TWO_CASES_MASK =
(CASE_MASK << 16) | CASE_MASK; // 0x180018
static const uint32_t TWO_TERTIARIES_MASK =
(TERTIARY_MASK << 16) | TERTIARY_MASK; // 0x70007
/**
* Contraction with one fast Latin character.
* Use INDEX_MASK to find the start of the contraction list after the fixed table.
* The first entry contains the default mapping.
* Otherwise use CONTR_CHAR_MASK for the contraction character index
* (in ascending order).
* Use CONTR_LENGTH_SHIFT for the length of the entry
* (1=BAIL_OUT, 2=one CE, 3=two CEs).
*
* Also, U+0000 maps to a contraction entry, so that the fast path need not
* check for NUL termination.
* It usually maps to a contraction list with only the completely ignorable default value.
*/
static const uint32_t CONTRACTION = 0x400;
/**
* An expansion encodes two CEs.
* Use INDEX_MASK to find the pair of CEs after the fixed table.
*
* The higher a mini CE value, the easier it is to process.
* For expansions and higher, no context needs to be considered.
*/
static const uint32_t EXPANSION = 0x800;
/**
* Encodes one CE with a long/low mini primary (there are 128).
* All potentially-variable primaries must be in this range,
* to make the short-primary path as fast as possible.
*/
static const uint32_t MIN_LONG = 0xc00;
static const uint32_t LONG_INC = 8;
static const uint32_t MAX_LONG = 0xff8;
/**
* Encodes one CE with a short/high primary (there are 60),
* plus a secondary CE if the secondary weight is high.
* Fast handling: At least all letter primaries should be in this range.
*/
static const uint32_t MIN_SHORT = 0x1000;
static const uint32_t SHORT_INC = 0x400;
/** The highest primary weight is reserved for U+FFFF. */
static const uint32_t MAX_SHORT = SHORT_PRIMARY_MASK;
static const uint32_t MIN_SEC_BEFORE = 0; // must add SEC_OFFSET
static const uint32_t SEC_INC = 0x20;
static const uint32_t MAX_SEC_BEFORE = MIN_SEC_BEFORE + 4 * SEC_INC; // 5 before common
static const uint32_t COMMON_SEC = MAX_SEC_BEFORE + SEC_INC;
static const uint32_t MIN_SEC_AFTER = COMMON_SEC + SEC_INC;
static const uint32_t MAX_SEC_AFTER = MIN_SEC_AFTER + 5 * SEC_INC; // 6 after common
static const uint32_t MIN_SEC_HIGH = MAX_SEC_AFTER + SEC_INC; // 20 high secondaries
static const uint32_t MAX_SEC_HIGH = SECONDARY_MASK;
/**
* Lookup: Add this offset to secondary weights, except for completely ignorable CEs.
* Must be greater than any special value, e.g., MERGE_WEIGHT.
* The exact value is not relevant for the format version.
*/
static const uint32_t SEC_OFFSET = SEC_INC;
static const uint32_t COMMON_SEC_PLUS_OFFSET = COMMON_SEC + SEC_OFFSET;
static const uint32_t TWO_SEC_OFFSETS =
(SEC_OFFSET << 16) | SEC_OFFSET; // 0x200020
static const uint32_t TWO_COMMON_SEC_PLUS_OFFSET =
(COMMON_SEC_PLUS_OFFSET << 16) | COMMON_SEC_PLUS_OFFSET;
static const uint32_t LOWER_CASE = 8; // case bits include this offset
static const uint32_t TWO_LOWER_CASES = (LOWER_CASE << 16) | LOWER_CASE; // 0x80008
static const uint32_t COMMON_TER = 0; // must add TER_OFFSET
static const uint32_t MAX_TER_AFTER = 7; // 7 after common
/**
* Lookup: Add this offset to tertiary weights, except for completely ignorable CEs.
* Must be greater than any special value, e.g., MERGE_WEIGHT.
* Must be greater than case bits as well, so that with combined case+tertiary weights
* plus the offset the tertiary bits does not spill over into the case bits.
* The exact value is not relevant for the format version.
*/
static const uint32_t TER_OFFSET = SEC_OFFSET;
static const uint32_t COMMON_TER_PLUS_OFFSET = COMMON_TER + TER_OFFSET;
static const uint32_t TWO_TER_OFFSETS = (TER_OFFSET << 16) | TER_OFFSET;
static const uint32_t TWO_COMMON_TER_PLUS_OFFSET =
(COMMON_TER_PLUS_OFFSET << 16) | COMMON_TER_PLUS_OFFSET;
static const uint32_t MERGE_WEIGHT = 3;
static const uint32_t EOS = 2; // end of string
static const uint32_t BAIL_OUT = 1;
/**
* Contraction result first word bits 8..0 contain the
* second contraction character, as a char index 0..NUM_FAST_CHARS-1.
* Each contraction list is terminated with a word containing CONTR_CHAR_MASK.
*/
static const uint32_t CONTR_CHAR_MASK = 0x1ff;
/**
* Contraction result first word bits 10..9 contain the result length:
* 1=bail out, 2=one mini CE, 3=two mini CEs
*/
static const uint32_t CONTR_LENGTH_SHIFT = 9;
/**
* Comparison return value when the regular comparison must be used.
* The exact value is not relevant for the format version.
*/
static const int32_t BAIL_OUT_RESULT = -2;
static inline int32_t getCharIndex(char16_t c) {
if(c <= LATIN_MAX) {
return c;
} else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
return c - (PUNCT_START - LATIN_LIMIT);
} else {
// Not a fast Latin character.
// Note: U+FFFE & U+FFFF are forbidden in tailorings
// and thus do not occur in any contractions.
return -1;
}
}
/**
* Computes the options value for the compare functions
* and writes the precomputed primary weights.
* Returns -1 if the Latin fastpath is not supported for the data and settings.
* The capacity must be LATIN_LIMIT.
*/
static int32_t getOptions(const CollationData *data, const CollationSettings &settings,
uint16_t *primaries, int32_t capacity);
static int32_t compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
const char16_t *left, int32_t leftLength,
const char16_t *right, int32_t rightLength);
static int32_t compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
const uint8_t *left, int32_t leftLength,
const uint8_t *right, int32_t rightLength);
private:
static uint32_t lookup(const uint16_t *table, UChar32 c);
static uint32_t lookupUTF8(const uint16_t *table, UChar32 c,
const uint8_t *s8, int32_t &sIndex, int32_t sLength);
static uint32_t lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
const uint8_t *s8, int32_t &sIndex);
static uint32_t nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
const char16_t *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength);
static inline uint32_t getPrimaries(uint32_t variableTop, uint32_t pair) {
uint32_t ce = pair & 0xffff;
if(ce >= MIN_SHORT) { return pair & TWO_SHORT_PRIMARIES_MASK; }
if(ce > variableTop) { return pair & TWO_LONG_PRIMARIES_MASK; }
if(ce >= MIN_LONG) { return 0; } // variable
return pair; // special mini CE
}
static inline uint32_t getSecondariesFromOneShortCE(uint32_t ce) {
ce &= SECONDARY_MASK;
if(ce < MIN_SEC_HIGH) {
return ce + SEC_OFFSET;
} else {
return ((ce + SEC_OFFSET) << 16) | COMMON_SEC_PLUS_OFFSET;
}
}
static uint32_t getSecondaries(uint32_t variableTop, uint32_t pair);
static uint32_t getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair);
static uint32_t getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair);
static uint32_t getQuaternaries(uint32_t variableTop, uint32_t pair);
private:
CollationFastLatin() = delete; // no constructor
};
/*
* Format of the CollationFastLatin data table.
* CollationFastLatin::VERSION = 2.
*
* This table contains data for a Latin-text collation fastpath.
* The data is stored as an array of uint16_t which contains the following parts.
*
* uint16_t -- version & header length
* Bits 15..8: version, must match the VERSION
* 7..0: length of the header
*
* uint16_t varTops[header length - 1]
* Version 2:
* varTops[m] is the highest CollationFastLatin long-primary weight
* of supported maxVariable group m
* (special reorder group space, punct, symbol, currency).
*
* Version 1:
* Each of these values maps the variable top lead byte of a supported maxVariable group
* to the highest CollationFastLatin long-primary weight.
* The values are stored in ascending order.
* Bits 15..7: max fast-Latin long-primary weight (bits 11..3 shifted left by 4 bits)
* 6..0: regular primary lead byte
*
* uint16_t miniCEs[0x1c0]
* A mini collation element for each character U+0000..U+017F and U+2000..U+203F.
* Each value encodes one or two mini CEs (two are possible if the first one
* has a short mini primary and the second one is a secondary CE, i.e., primary == 0),
* or points to an expansion or to a contraction table.
* U+0000 always has a contraction entry,
* so that NUL-termination need not be tested in the fastpath.
* If the collation elements for a character or contraction cannot be encoded in this format,
* then the BAIL_OUT value is stored.
* For details see the comments for the class constants.
*
* uint16_t expansions[variable length];
* Expansion mini CEs contain an offset relative to just after the miniCEs table.
* An expansions contains exactly 2 mini CEs.
*
* uint16_t contractions[variable length];
* Contraction mini CEs contain an offset relative to just after the miniCEs table.
* It points to a list of tuples which map from a contraction suffix character to a result.
* First uint16_t of each tuple:
* Bits 10..9: Length of the result (1..3), see comments on CONTR_LENGTH_SHIFT.
* Bits 8..0: Contraction character, see comments on CONTR_CHAR_MASK.
* This is followed by 0, 1, or 2 uint16_t according to the length.
* Each list is terminated by an entry with CONTR_CHAR_MASK.
* Each list starts with such an entry which also contains the default result
* for when there is no contraction match.
*
* -----------------
* Changes for version 2 (ICU 55)
*
* Special reorder groups do not necessarily start on whole primary lead bytes any more.
* Therefore, the varTops data has a new format:
* Version 1 stored the lead bytes of the highest root primaries for
* the maxVariable-supported special reorder groups.
* Now the top 16 bits would need to be stored,
* and it is simpler to store only the fast-Latin weights.
*/
U_NAMESPACE_END
#endif // !UCONFIG_NO_COLLATION
#endif // __COLLATIONFASTLATIN_H__
|