1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationfcd.h
*
* created on: 2012aug18
* created by: Markus W. Scherer
*/
#ifndef __COLLATIONFCD_H__
#define __COLLATIONFCD_H__
#include <_foundation_unicode/utypes.h>
#if !UCONFIG_NO_COLLATION
#include <_foundation_unicode/utf16.h>
U_NAMESPACE_BEGIN
/**
* Data and functions for the FCD check fast path.
*
* The fast path looks at a pair of 16-bit code units and checks
* whether there is an FCD boundary between them;
* there is if the first unit has a trailing ccc=0 (!hasTccc(first))
* or the second unit has a leading ccc=0 (!hasLccc(second)),
* or both.
* When the fast path finds a possible non-boundary,
* then the FCD check slow path looks at the actual sequence of FCD values.
*
* This is a pure optimization.
* The fast path must at least find all possible non-boundaries.
* If the fast path is too pessimistic, it costs performance.
*
* For a pair of BMP characters, the fast path tests are precise (1 bit per character).
*
* For a supplementary code point, the two units are its lead and trail surrogates.
* We set hasTccc(lead)=true if any of its 1024 associated supplementary code points
* has lccc!=0 or tccc!=0.
* We set hasLccc(trail)=true for all trail surrogates.
* As a result, we leave the fast path if the lead surrogate might start a
* supplementary code point that is not FCD-inert.
* (So the fast path need not detect that there is a surrogate pair,
* nor look ahead to the next full code point.)
*
* hasLccc(lead)=true if any of its 1024 associated supplementary code points
* has lccc!=0, for fast boundary checking between BMP & supplementary.
*
* hasTccc(trail)=false:
* It should only be tested for unpaired trail surrogates which are FCD-inert.
*/
class U_I18N_API CollationFCD {
public:
static inline UBool hasLccc(UChar32 c) {
// assert c <= 0xffff
// c can be negative, e.g., U_SENTINEL from UCharIterator;
// that is handled in the first test.
int32_t i;
return
// U+0300 is the first character with lccc!=0.
c >= 0x300 &&
(i = lcccIndex[c >> 5]) != 0 &&
(lcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;
}
static inline UBool hasTccc(UChar32 c) {
// assert c <= 0xffff
// c can be negative, e.g., U_SENTINEL from UCharIterator;
// that is handled in the first test.
int32_t i;
return
// U+00C0 is the first character with tccc!=0.
c >= 0xc0 &&
(i = tcccIndex[c >> 5]) != 0 &&
(tcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;
}
static inline UBool mayHaveLccc(UChar32 c) {
// Handles all of Unicode 0..10FFFF.
// c can be negative, e.g., U_SENTINEL.
// U+0300 is the first character with lccc!=0.
if(c < 0x300) { return false; }
if(c > 0xffff) { c = U16_LEAD(c); }
int32_t i;
return
(i = lcccIndex[c >> 5]) != 0 &&
(lcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;
}
/**
* Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)
* must be decomposed before reaching the core collation code,
* or else some sequences including them, even ones passing the FCD check,
* do not yield canonically equivalent results.
*
* This is a fast and imprecise test.
*
* @param c a code point
* @return true if c is U+0F73, U+0F75 or U+0F81 or one of several other Tibetan characters
*/
static inline UBool maybeTibetanCompositeVowel(UChar32 c) {
return (c & 0x1fff01) == 0xf01;
}
/**
* Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)
* must be decomposed before reaching the core collation code,
* or else some sequences including them, even ones passing the FCD check,
* do not yield canonically equivalent results.
*
* They have distinct lccc/tccc combinations: 129/130 or 129/132.
*
* @param fcd16 the FCD value (lccc/tccc combination) of a code point
* @return true if fcd16 is from U+0F73, U+0F75 or U+0F81
*/
static inline UBool isFCD16OfTibetanCompositeVowel(uint16_t fcd16) {
return fcd16 == 0x8182 || fcd16 == 0x8184;
}
private:
CollationFCD() = delete; // No instantiation.
static const uint8_t lcccIndex[2048];
static const uint8_t tcccIndex[2048];
static const uint32_t lcccBits[];
static const uint32_t tcccBits[];
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_COLLATION
#endif // __COLLATIONFCD_H__
|