// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* utf8collationiterator.h
*
* created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
* created by: Markus W. Scherer
*/
#ifndef __UTF8COLLATIONITERATOR_H__
#define __UTF8COLLATIONITERATOR_H__
#include <_foundation_unicode/utypes.h>
#if !UCONFIG_NO_COLLATION
#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "collationiterator.h"
#include "normalizer2impl.h"
U_NAMESPACE_BEGIN
/**
 * Collation element and code point iterator over UTF-8 text.
 * Normalized UTF-8 input is processed inline; the text is either given with
 * an explicit length or is NUL-terminated.
 * Text that is not normalized is handled by a subclass.
 */
class U_I18N_API UTF8CollationIterator : public CollationIterator {
public:
    /**
     * @param d       collation data, passed on to the CollationIterator base
     * @param numeric numeric flag, passed on to the CollationIterator base
     * @param s       the UTF-8 text; only the pointer is stored
     * @param p       initial value of the current text index
     * @param len     length of the text, or <0 if the text is NUL-terminated
     */
    UTF8CollationIterator(const CollationData *d, UBool numeric,
                          const uint8_t *s, int32_t p, int32_t len)
            : CollationIterator(d, numeric),
              u8(s), pos(p), length(len) {}

    virtual ~UTF8CollationIterator();

    virtual void resetToOffset(int32_t newOffset) override;

    virtual int32_t getOffset() const override;

    virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;

    virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;

protected:
    /**
     * For byte sequences that are illegal in UTF-8, an error value may be returned
     * together with a bogus code point. The caller will ignore that code point.
     *
     * Surrogate code points are illegal in UTF-8 as well; special values may be
     * returned for them, but the caller will treat them like U+FFFD
     * because forbidSurrogateCodePoints() returns true.
     *
     * Valid lead surrogates are returned from inside a normalized text segment,
     * where handleGetTrailSurrogate() will return the matching trail surrogate.
     */
    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;

    virtual UBool foundNULTerminator() override;

    virtual UBool forbidSurrogateCodePoints() const override;

    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;

    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;

    /** The UTF-8 text as passed to the constructor. */
    const uint8_t *u8;
    /** Current text index. */
    int32_t pos;
    /** Text length; <0 for NUL-terminated strings. */
    int32_t length;
};
/**
 * Incrementally checks the input text for FCD and normalizes where necessary.
 */
class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
public:
    /**
     * Starts in the CHECK_FWD state with the checked segment beginning at p.
     *
     * @param data    collation data; must not be null because data->nfcImpl
     *                is bound here as the normalizer implementation
     * @param numeric numeric flag, passed on to the base class
     * @param s       the UTF-8 text, passed on to the base class
     * @param p       initial text index; also the initial segment start
     * @param len     text length, or <0 if the text is NUL-terminated
     */
    FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
                             const uint8_t *s, int32_t p, int32_t len)
            : UTF8CollationIterator(data, numeric, s, p, len),
              // limit carries no meaning in CHECK_FWD, but it is initialized anyway
              // so that the object never holds an indeterminate value.
              // Initializers are listed in member declaration order.
              state(CHECK_FWD), start(p), limit(p),
              nfcImpl(data->nfcImpl) {}

    virtual ~FCDUTF8CollationIterator();

    virtual void resetToOffset(int32_t newOffset) override;

    virtual int32_t getOffset() const override;

    virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;

    virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;

protected:
    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;

    virtual char16_t handleGetTrailSurrogate() override;

    virtual UBool foundNULTerminator() override;

    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;

    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;

private:
    // NOTE(review): the two predicates below are defined in the implementation file;
    // judging by their names they test lead/trail combining-class data next to pos -- confirm there.
    UBool nextHasLccc() const;
    UBool previousHasTccc() const;

    /**
     * Switches to forward checking if possible.
     */
    void switchToForward();

    /**
     * Extends the FCD text segment forward or normalizes around pos.
     * @return true if success
     */
    UBool nextSegment(UErrorCode &errorCode);

    /**
     * Switches to backward checking.
     */
    void switchToBackward();

    /**
     * Extends the FCD text segment backward or normalizes around pos.
     * @return true if success
     */
    UBool previousSegment(UErrorCode &errorCode);

    UBool normalize(const UnicodeString &s, UErrorCode &errorCode);

    enum State {
        /**
         * The input text [start..pos[ passes the FCD check.
         * Moving forward checks incrementally.
         * limit is undefined (its value carries no meaning in this state).
         */
        CHECK_FWD,
        /**
         * The input text [pos..limit[ passes the FCD check.
         * Moving backward checks incrementally.
         * start is undefined (its value carries no meaning in this state).
         */
        CHECK_BWD,
        /**
         * The input text [start..limit[ passes the FCD check.
         * pos tracks the current text index.
         */
        IN_FCD_SEGMENT,
        /**
         * The input text [start..limit[ failed the FCD check and was normalized.
         * pos tracks the current index in the normalized string.
         */
        IN_NORMALIZED
    };

    /** Current checking/normalization state; see State. */
    State state;
    /** Start of the current segment; meaningful as described per State. */
    int32_t start;
    /** Limit of the current segment; meaningful as described per State. */
    int32_t limit;

    /** Normalizer implementation taken from the collation data. */
    const Normalizer2Impl &nfcImpl;
    /** Normalized string that pos indexes while in the IN_NORMALIZED state. */
    UnicodeString normalized;
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_COLLATION
#endif // __UTF8COLLATIONITERATOR_H__