1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2001-2015 IBM and others. All rights reserved.
**********************************************************************
* Date Name Description
* 08/13/2001 synwee Creation.
**********************************************************************
*/
#ifndef USRCHIMP_H
#define USRCHIMP_H
#include <_foundation_unicode/utypes.h>
#if !UCONFIG_NO_COLLATION
#include <_foundation_unicode/normalizer2.h>
#include <_foundation_unicode/ucol.h>
#include <_foundation_unicode/ucoleitr.h>
#include <_foundation_unicode/ubrk.h>
/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8
#define UCOL_IGNORABLE 0
/* get weights from a CE */
#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
#define UCOL_CONTINUATION_MARKER 0xC0
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)
/**
* This indicates an error has occurred during processing or there are no more CEs
* to be returned.
* The value is U_INT64_MAX cast to int64_t, matching the int64_t return type of
* UCollationPCE::nextProcessed() and UCollationPCE::previousProcessed().
*/
#if APPLE_ICU_CHANGES
// rdar://
// With APPLE_ICU_CHANGES the macro is only defined when no earlier definition
// is in effect; the value is the same as in the #else branch.
#ifndef UCOL_PROCESSED_NULLORDER
#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
#endif
#else
// Without APPLE_ICU_CHANGES the macro is defined unconditionally.
#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
#endif // APPLE_ICU_CHANGES
U_NAMESPACE_BEGIN
// Forward declarations: this header only uses these types through pointers
// and references, so the full class definitions are not needed here.
class CollationElementIterator;
class Collator;
/**
* One entry stored in a PCEBuffer: a 64-bit CE value plus a low/high index pair.
* NOTE(review): the fields presumably correspond to the ce / ixLow / ixHigh
* arguments of PCEBuffer::put() - confirm in the implementation.
*/
struct PCEI
{
uint64_t ce;   // 64-bit collation element value
int32_t low;   // lower index associated with the CE
int32_t high;  // upper index associated with the CE
};
/**
* Buffer of PCEI entries with inline storage for 16 entries (defaultBuffer).
* Only the interface is declared here; the member functions are defined elsewhere.
*/
struct PCEBuffer
{
// Inline storage for the first 16 entries.
PCEI defaultBuffer[16];
// Storage currently in use.
// NOTE(review): presumably points at defaultBuffer unless more room was needed - confirm.
PCEI *buffer;
// NOTE(review): presumably the number of entries currently held - confirm.
int32_t bufferIndex;
// NOTE(review): presumably the capacity of `buffer` in entries - confirm.
int32_t bufferSize;
PCEBuffer();
~PCEBuffer();
void reset();
UBool isEmpty() const;
// Adds one entry; takes an ICU error code by reference for reporting failures.
void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
// Returns a pointer to one stored entry.
// NOTE(review): retrieval order and the empty-buffer result are not visible here - confirm.
const PCEI *get();
};
/**
* Produces 64-bit "processed" collation elements for a text, in either
* direction (see nextProcessed() and previousProcessed()).
* It can be set up from the C handle (UCollationElements) or from the C++
* CollationElementIterator.
*/
class UCollationPCE : public UMemory {
private:
// Buffer of PCEI entries.
// NOTE(review): presumably used while iterating backwards, since previousProcessed()
// documents a U_BUFFER_OVERFLOW_ERROR for an exhausted internal buffer - confirm.
PCEBuffer pceBuffer;
// NOTE(review): presumably the iterator that supplies the unprocessed CEs - confirm.
CollationElementIterator *cei;
// NOTE(review): strength, toShift, isShifted and variableTop are presumably captured
// from the collator by init(const Collator &) and consulted by processCE() - confirm.
UCollationStrength strength;
UBool toShift;
UBool isShifted;
uint32_t variableTop;
public:
UCollationPCE(UCollationElements *elems);
UCollationPCE(CollationElementIterator *iter);
~UCollationPCE();
void init(UCollationElements *elems);
void init(CollationElementIterator *iter);
/**
* Get the processed ordering priority of the next collation element in the text.
* A single character may contain more than one collation element.
*
* @param ixLow A pointer to an int32_t to receive the iterator index before fetching the CE.
* @param ixHigh A pointer to an int32_t to receive the iterator index after fetching the CE.
* @param status A pointer to an UErrorCode to receive any errors.
* @return The next collation element's ordering, otherwise returns UCOL_PROCESSED_NULLORDER
* if an error has occurred or if the end of string has been reached
*/
int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
/**
* Get the processed ordering priority of the previous collation element in the text.
* A single character may contain more than one collation element.
*
* @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
* @param ixHigh A pointer to an int32_t to receive the iterator index before fetching the CE
* @param status A pointer to an UErrorCode to receive any errors. Notably
* a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
* buffer has been exhausted.
* @return The previous collation element's ordering, otherwise returns
* UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
* string has been reached.
*/
int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
private:
// Shared setup taking the collator itself.
// NOTE(review): presumably called by both public init() overloads - confirm.
void init(const Collator &coll);
// Maps a 32-bit CE to its 64-bit processed form.
uint64_t processCE(uint32_t ce);
};
U_NAMESPACE_END
/* Element count of the fixed-size cesBuffer and pcesBuffer arrays embedded in UPattern. */
#define INITIAL_ARRAY_SIZE_ 256
/**
* Search state: the text being searched, the search options, the break
* iterators in use, and the position/length of the most recent match.
*/
struct USearch {
// required since collation element iterator does not have a getText API
const UChar *text;
int32_t textLength; // exact length
// Search options.
// NOTE(review): presumably set through the usearch attribute API - confirm.
UBool isOverlap;
UBool isCanonicalMatch;
int16_t elementComparisonType;
UBreakIterator *internalBreakIter; // internal character breakiterator, lazily created.
UBreakIterator *breakIter; // caller provided character breakiterator
// value USEARCH_DONE is the default value
// if we are not at the start of the text or the end of the text,
// depending on the iteration direction and matchedIndex is USEARCH_DONE
// it means that we can't find any more matches in that particular direction
int32_t matchedIndex;
// Length of the most recent match.
// NOTE(review): value when there is no match is not visible here - confirm.
int32_t matchedLength;
// Direction flag for the iteration direction mentioned above.
UBool isForwardSearching;
// NOTE(review): presumably marks that the search was reset and has not yet
// been advanced - confirm in the implementation.
UBool reset;
};
/**
* The search pattern: its text plus two collation-element arrays, one of
* 32-bit values (ces) and one of 64-bit values (pces).
* Each array has a pointer, a length and an inline buffer of
* INITIAL_ARRAY_SIZE_ elements.
* NOTE(review): ces/pces presumably point at the inline buffers unless the
* pattern needs more elements than fit - confirm in the implementation.
*/
struct UPattern {
const UChar *text;
int32_t textLength; // exact length
// length required for backwards ce comparison
int32_t cesLength;
int32_t *ces;
int32_t cesBuffer[INITIAL_ARRAY_SIZE_];
// 64-bit counterpart of cesLength / ces / cesBuffer.
// NOTE(review): presumably holds processed CEs as produced by UCollationPCE - confirm.
int32_t pcesLength;
int64_t *pces;
int64_t pcesBuffer[INITIAL_ARRAY_SIZE_];
// NOTE(review): presumably record whether the pattern starts / ends with
// accent characters - confirm where they are set.
UBool hasPrefixAccents;
UBool hasSuffixAccents;
};
/**
* Top-level string search object passed to the usearch_handle* functions
* declared in this header. It ties together the search state, the pattern,
* the collator and the collation element iterators.
* It uses icu:: C++ types, so this header can only be included from C++.
*/
struct UStringSearch {
struct USearch *search;
// The pattern is embedded by value, including its inline CE buffers.
struct UPattern pattern;
const UCollator *collator;
// NOTE(review): presumably the NFD normalizer instance used for canonical matching - confirm.
const icu::Normalizer2 *nfd;
// positions within the collation element iterator is used to determine
// if we are at the start of the text.
UCollationElements *textIter;
// Processed-CE iterator for the text.
// NOTE(review): creation time and ownership are not visible here - confirm.
icu::UCollationPCE *textProcessedIter;
// utility collation element, used throughout program for temporary
// iteration.
UCollationElements *utilIter;
// NOTE(review): presumably true when this object must close `collator` itself - confirm.
UBool ownCollator;
// NOTE(review): strength, ceMask, variableTop and toShift are presumably cached
// from the collator's attributes - confirm in the implementation.
UCollationStrength strength;
uint32_t ceMask;
uint32_t variableTop;
UBool toShift;
};
/**
* Exact matches without checking the ends for extra accents.
* The match after the position within the collation element iterator is to be
* found.
* After a match is found the offset in the collation element iterator will be
* shifted to the start of the match.
* Implementation note:
* For tertiary we can't use the collator->tertiaryMask; that is a
* preprocessed mask that takes into account case options. Since we are only
* concerned with exact matches, we don't need that.
* Alternate handling - since only the 16 most significant bits are used,
* we can safely do a compare without masking. If the ce is a variable, we mask
* and get only the primary values; no shifting to quaternary is required since
* all primary values less than variableTop will need to be masked off anyway.
* If the end character is composite and the pattern ce does not match the text
* ce, we skip it until we find a match in the end composite character or when
* it has passed the character. This is so that we can match pattern "a" with
* the text "\u00e6".
* @param strsrch string search data
* @param status error status if any
* @return true if an exact match is found, false otherwise
*/
U_CFUNC
UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
/**
* Canonical matches; the "next" counterpart of usearch_handlePreviousCanonical.
* According to the definition, matches found here will include the whole span
* of beginning and ending accents if it overlaps that region.
* @param strsrch string search data
* @param status error status if any
* @return true if a canonical match is found, false otherwise
*/
U_CFUNC
UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
/**
* Gets the previous match.
* The comments on usearch_handleNextExact apply here as well.
* @param strsrch string search data
* @param status error status if any
* @return true if an exact match is found, false otherwise.
*/
U_CFUNC
UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
/**
* Canonical matches; the "previous" counterpart of usearch_handleNextCanonical.
* According to the definition, matches found here will include the whole span
* of beginning and ending accents if it overlaps that region.
* @param strsrch string search data
* @param status error status if any
* @return true if a canonical match is found, false otherwise
*/
U_CFUNC
UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
UErrorCode *status);
#endif /* #if !UCONFIG_NO_COLLATION */
#endif
|