1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationdata.h
*
* created on: 2010oct27
* created by: Markus W. Scherer
*/
#ifndef __COLLATIONDATA_H__
#define __COLLATIONDATA_H__
#include <_foundation_unicode/utypes.h>
#if !UCONFIG_NO_COLLATION
#include <_foundation_unicode/ucol.h>
#include <_foundation_unicode/uniset.h>
#include "collation.h"
#include "normalizer2impl.h"
#include "utrie2.h"
struct UDataMemory;
U_NAMESPACE_BEGIN
class UVector32;
/**
* Collation data container.
* Immutable data created by a CollationDataBuilder, or loaded from a file,
* or deserialized from API-provided binary data.
*
* Includes data for the collation base (root/default), aliased if this is not the base.
*/
struct U_I18N_API CollationData : public UMemory {
// Note: The ucadata.icu loader could discover the reserved ranges by setting an array
// parallel with the ranges, and resetting ranges that are indexed.
// The reordering builder code could clone the resulting template array.
static constexpr int32_t REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14;
static constexpr int32_t REORDER_RESERVED_AFTER_LATIN = REORDER_RESERVED_BEFORE_LATIN + 1;
static constexpr int32_t MAX_NUM_SPECIAL_REORDER_CODES = 8;
/** C++ only, data reader check scriptStartsLength. */
static constexpr int32_t MAX_NUM_SCRIPT_RANGES = 256;
CollationData(const Normalizer2Impl &nfc)
: trie(nullptr),
ce32s(nullptr), ces(nullptr), contexts(nullptr), base(nullptr),
jamoCE32s(nullptr),
nfcImpl(nfc),
numericPrimary(0x12000000),
ce32sLength(0), cesLength(0), contextsLength(0),
compressibleBytes(nullptr),
unsafeBackwardSet(nullptr),
fastLatinTable(nullptr), fastLatinTableLength(0),
numScripts(0), scriptsIndex(nullptr), scriptStarts(nullptr), scriptStartsLength(0),
rootElements(nullptr), rootElementsLength(0) {}
uint32_t getCE32(UChar32 c) const {
return UTRIE2_GET32(trie, c);
}
uint32_t getCE32FromSupplementary(UChar32 c) const {
return UTRIE2_GET32_FROM_SUPP(trie, c);
}
UBool isDigit(UChar32 c) const {
return c < 0x660 ? c <= 0x39 && 0x30 <= c :
Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG);
}
UBool isUnsafeBackward(UChar32 c, UBool numeric) const {
return unsafeBackwardSet->contains(c) || (numeric && isDigit(c));
}
UBool isCompressibleLeadByte(uint32_t b) const {
return compressibleBytes[b];
}
inline UBool isCompressiblePrimary(uint32_t p) const {
return isCompressibleLeadByte(p >> 24);
}
/**
* Returns the CE32 from two contexts words.
* Access to the defaultCE32 for contraction and prefix matching.
*/
static uint32_t readCE32(const char16_t *p) {
return ((uint32_t)p[0] << 16) | p[1];
}
/**
* Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
* Requires that ce32 is special.
*/
uint32_t getIndirectCE32(uint32_t ce32) const;
/**
* Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
* if ce32 is special.
*/
uint32_t getFinalCE32(uint32_t ce32) const;
/**
* Computes a CE from c's ce32 which has the OFFSET_TAG.
*/
int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const {
int64_t dataCE = ces[Collation::indexFromCE32(ce32)];
return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE));
}
/**
* Returns the single CE that c maps to.
* Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.
*/
int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
/**
* Returns the FCD16 value for code point c. c must be >= 0.
*/
uint16_t getFCD16(UChar32 c) const {
return nfcImpl.getFCD16(c);
}
/**
* Returns the first primary for the script's reordering group.
* @return the primary with only the first primary lead byte of the group
* (not necessarily an actual root collator primary weight),
* or 0 if the script is unknown
*/
uint32_t getFirstPrimaryForGroup(int32_t script) const;
/**
* Returns the last primary for the script's reordering group.
* @return the last primary of the group
* (not an actual root collator primary weight),
* or 0 if the script is unknown
*/
uint32_t getLastPrimaryForGroup(int32_t script) const;
/**
* Finds the reordering group which contains the primary weight.
* @return the first script of the group, or -1 if the weight is beyond the last group
*/
int32_t getGroupForPrimary(uint32_t p) const;
int32_t getEquivalentScripts(int32_t script,
int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
/**
* Writes the permutation of primary-weight ranges
* for the given reordering of scripts and groups.
* The caller checks for illegal arguments and
* takes care of [DEFAULT] and memory allocation.
*
* Each list element will be a (limit, offset) pair as described
* for the CollationSettings::reorderRanges.
* The list will be empty if no ranges are reordered.
*/
void makeReorderRanges(const int32_t *reorder, int32_t length,
UVector32 &ranges, UErrorCode &errorCode) const;
/** @see jamoCE32s */
static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
/** Main lookup trie. */
const UTrie2 *trie;
/**
* Array of CE32 values.
* At index 0 there must be CE32(U+0000)
* to support U+0000's special-tag for NUL-termination handling.
*/
const uint32_t *ce32s;
/** Array of CE values for expansions and OFFSET_TAG. */
const int64_t *ces;
/** Array of prefix and contraction-suffix matching data. */
const char16_t *contexts;
/** Base collation data, or nullptr if this data itself is a base. */
const CollationData *base;
/**
* Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
* They are normally simple CE32s, rarely expansions.
* For fast handling of HANGUL_TAG.
*/
const uint32_t *jamoCE32s;
const Normalizer2Impl &nfcImpl;
/** The single-byte primary weight (xx000000) for numeric collation. */
uint32_t numericPrimary;
int32_t ce32sLength;
int32_t cesLength;
int32_t contextsLength;
/** 256 flags for which primary-weight lead bytes are compressible. */
const UBool *compressibleBytes;
/**
* Set of code points that are unsafe for starting string comparison after an identical prefix,
* or in backwards CE iteration.
*/
const UnicodeSet *unsafeBackwardSet;
/**
* Fast Latin table for common-Latin-text string comparisons.
* Data structure see class CollationFastLatin.
*/
const uint16_t *fastLatinTable;
int32_t fastLatinTableLength;
/**
* Data for scripts and reordering groups.
* Uses include building a reordering permutation table and
* providing script boundaries to AlphabeticIndex.
*/
int32_t numScripts;
/**
* The length of scriptsIndex is numScripts+16.
* It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
* 16 special reorder codes (not all used) are mapped starting at numScripts.
* Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
* There are special codes at the end for reorder-reserved primary ranges.
*
* Multiple scripts may share a range and index, for example Hira & Kana.
*/
const uint16_t *scriptsIndex;
/**
* Start primary weight (top 16 bits only) for a group/script/reserved range
* indexed by scriptsIndex.
* The first range (separators & terminators) and the last range (trailing weights)
* are not reorderable, and no scriptsIndex entry points to them.
*/
const uint16_t *scriptStarts;
int32_t scriptStartsLength;
/**
* Collation elements in the root collator.
* Used by the CollationRootElements class. The data structure is described there.
* nullptr in a tailoring.
*/
const uint32_t *rootElements;
int32_t rootElementsLength;
private:
int32_t getScriptIndex(int32_t script) const;
void makeReorderRanges(const int32_t *reorder, int32_t length,
UBool latinMustMove,
UVector32 &ranges, UErrorCode &errorCode) const;
int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const;
int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const;
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_COLLATION
#endif // __COLLATIONDATA_H__
|