1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
* Copyright (C) 1996-2012, International Business Machines *
* Corporation and others. All Rights Reserved. *
******************************************************************************
*/
/**
* \file
* \brief Originally added as a C++ API for the Collation data used to compute minLengthInChars.
* \internal
*/
/*
* Note: This module was included in ICU 4.0.1 as @internal technology preview for supporting
* Boyer-Moore string search API. For now, only SSearchTest depends on this module.
* I temporarily moved the module from i18n directory to intltest, because we have no plan to
* publish this as public API. (2012-12-18 yoshito)
*/
#ifndef COLL_DATA_H
#define COLL_DATA_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
#include "unicode/ucol.h"
#include "unicode/unistr.h"
/**
* The size of the internal CE buffer in a <code>CEList</code> object.
* It is used as the element count of the <code>ceBuffer</code> array
* member of <code>CEList</code>.
*/
#define CELIST_BUFFER_SIZE 4
/**
* \def INSTRUMENT_CELIST
* Define this to enable the <code>CEList</code> objects to collect
* statistics.
*
* NOTE(review): this header only documents the macro; it does not define
* it, and nothing in this header tests it. Presumably it is checked in the
* implementation file - TODO confirm.
*/
/**
* The size of the initial list in a <code>StringList</code> object.
*/
#define STRING_LIST_BUFFER_SIZE 16
/*
* ICU namespace macro. NOTE(review): presumably expands to a using-directive
* for the ICU namespace so that names such as UnicodeString can be used
* unqualified below; because this is a header, that directive would also
* apply to every file that includes it - confirm against uversion.h.
*/
U_NAMESPACE_USE
/**
* This object holds a list of CEs generated from a particular
* <code>UnicodeString</code>
*
* NOTE(review): the class declares a destructor and holds a raw pointer
* member (<code>ces</code>), but declares no copy constructor or copy
* assignment operator. Confirm that instances are never copied.
*/
class CEList
{
public:
/**
* Construct a <code>CEList</code> object.
*
* @param coll - the Collator used to collect the CEs.
* @param string - the string for which to collect the CEs.
* @param status - will be set if any errors occur.
*
* Note: if on return, status is set to an error code,
* the only safe thing to do with this object is to call
* the destructor.
*/
CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status);
/**
* The destructor.
*/
~CEList();
/**
* Return the number of CEs in the list.
*
* @return the number of CEs in the list.
*/
int32_t size() const;
/**
* Get a particular CE from the list.
*
* @param index - the index of the CE to return
*
* @return the CE, or <code>0</code> if <code>index</code> is out of range
*/
uint32_t get(int32_t index) const;
/**
* Check if the CEs in another <code>CEList</code> match the
* suffix of this list starting at a given offset.
*
* @param offset - the offset of the suffix
* @param other - the other <code>CEList</code>
*
* @return <code>true</code> if the CEs match, <code>false</code> otherwise.
*/
UBool matchesAt(int32_t offset, const CEList *other) const;
/**
* The index operator.
*
* NOTE(review): this is a <code>const</code> member that returns a
* non-const reference, so a CE can be modified through a const
* <code>CEList</code>. Unlike <code>get</code>, no out-of-range behavior
* is documented for this operator - TODO confirm in the implementation.
*
* @param index - the index
*
* @return a reference to the given CE in the list
*/
uint32_t &operator[](int32_t index) const;
private:
/**
* Add a CE to the list. The implementation is not part of this header;
* presumably it appends <code>ce</code> and grows the storage when
* needed - TODO confirm.
*
* @param ce - the CE to add
* @param status - will be set if any errors occur.
*/
void add(uint32_t ce, UErrorCode &status);
/** Fixed-size internal CE buffer of CELIST_BUFFER_SIZE elements. */
uint32_t ceBuffer[CELIST_BUFFER_SIZE];
/** Pointer to the CE storage; presumably either ceBuffer or a larger allocation - TODO confirm. */
uint32_t *ces;
/** Presumably the current capacity of the storage, in CEs - TODO confirm. */
int32_t listMax;
/** Presumably the number of CEs currently stored (the value reported by size()) - TODO confirm. */
int32_t listSize;
};
/**
* StringList
*
* This object holds a list of <code>UnicodeString</code> objects.
*
* NOTE(review): the class declares a destructor and holds a raw pointer
* member (<code>strings</code>), but declares no copy constructor or copy
* assignment operator. Confirm that instances are never copied.
*/
class StringList
{
public:
/**
* Construct an empty <code>StringList</code>
*
* NOTE(review): this one-argument constructor is not declared
* <code>explicit</code>, so it also acts as an implicit conversion
* from <code>UErrorCode &amp;</code>.
*
* @param status - will be set if any errors occur.
*
* Note: if on return, status is set to an error code,
* the only safe thing to do with this object is to call
* the destructor.
*/
StringList(UErrorCode &status);
/**
* The destructor.
*/
~StringList();
/**
* Add a string to the list.
*
* @param string - the string to add
* @param status - will be set if any errors occur.
*/
void add(const UnicodeString *string, UErrorCode &status);
/**
* Add the string formed by an array of <code>char16_t</code> code units
* to the list. Each array element is one 16-bit code unit, not
* necessarily a whole code point.
*
* @param chars - the address of the array of code units
* @param count - the number of <code>char16_t</code> code units in the array
* @param status - will be set if any errors occur.
*/
void add(const char16_t *chars, int32_t count, UErrorCode &status);
/**
* Get a particular string from the list.
*
* @param index - the index of the string
*
* @return a pointer to the <code>UnicodeString</code> or <code>nullptr</code>
* if <code>index</code> is out of bounds.
*/
const UnicodeString *get(int32_t index) const;
/**
* Get the number of strings in the list.
*
* @return the number of strings in the list.
*/
int32_t size() const;
private:
/** Pointer to the string storage; presumably an array of listMax elements - TODO confirm. */
UnicodeString *strings;
/** Presumably the current capacity of the list, initially STRING_LIST_BUFFER_SIZE - TODO confirm. */
int32_t listMax;
/** Presumably the number of strings currently stored (the value reported by size()) - TODO confirm. */
int32_t listSize;
};
/*
* Forward references to internal classes.
*
* Only pointers to these types appear in this header, so their full
* definitions are not needed here. CEToStringsMap is the type of the
* CollData member ceToCharsStartingWith; StringToCEsMap is not otherwise
* referenced in this header.
*/
class StringToCEsMap;
class CEToStringsMap;
/**
* CollData
*
* This class holds the Collator-specific data needed to
* compute the length of the shortest string that can
* generate a particular list of CEs.
*
* <code>CollData</code> objects are quite expensive to compute. Because
* of this, they are cached. When you call <code>CollData::open</code> it
* returns a reference counted cached object. When you call <code>CollData::close</code>
* the reference count on the object is decremented but the object is not deleted.
*
* If you do not need to reuse any unreferenced objects in the cache, you can call
* <code>CollData::flushCollDataCache</code>. If you no longer need any <code>CollData</code>
* objects, you can call <code>CollData::freeCollDataCache</code>
*
* NOTE(review): none of <code>open</code>, <code>close</code>,
* <code>flushCollDataCache</code> or <code>freeCollDataCache</code> is
* declared in this class; it only exposes a public constructor and
* destructor. The caching description above therefore looks stale -
* confirm and update.
*/
class CollData
{
public:
/**
* Construct a <code>CollData</code> object.
*
* @param collator - the collator
* @param status - will be set if any errors occur.
*/
CollData(UCollator *collator, UErrorCode &status);
/**
* The destructor.
*/
~CollData();
/**
* Get the <code>UCollator</code> object used to create this object.
* The object returned may not be the exact object that was used to
* create this object, but it will have the same behavior.
*
* @return the <code>UCollator</code> object.
*/
UCollator *getCollator() const;
/**
* Get a list of all the strings which generate a list
* of CEs starting with a given CE.
*
* NOTE(review): <code>ce</code> is declared as <code>int32_t</code> here,
* while <code>CEList</code> exposes CEs as <code>uint32_t</code>.
*
* @param ce - the CE
*
* @return a <code>StringList</code> object containing all
* the strings, or <code>nullptr</code> if there are
* no such strings.
*/
const StringList *getStringList(int32_t ce) const;
/**
* Get a list of the CEs generated by a particular string.
*
* @param string - the string
*
* @return a <code>CEList</code> object containing the CEs. You
* must call <code>freeCEList</code> when you are finished
* using the <code>CEList</code>.
*/
const CEList *getCEList(const UnicodeString *string) const;
/**
* Release a <code>CEList</code> returned by <code>getCEList</code>.
*
* NOTE(review): unlike <code>getCEList</code>, this member is not
* declared <code>const</code>, so it cannot be called through a
* const <code>CollData</code>.
*
* @param list - the <code>CEList</code> to free.
*/
void freeCEList(const CEList *list);
/**
* Return the length of the shortest string that will generate
* the given list of CEs.
*
* @param ces - the CEs
* @param offset - the offset of the first CE in the list to use.
*
* @return the length of the shortest string.
*/
int32_t minLengthInChars(const CEList *ces, int32_t offset) const;
/**
* Return the length of the shortest string that will generate
* the given list of CEs.
*
* Note: the algorithm used to do this computation is recursive. To
* limit the amount of recursion, a "history" list is used to record
* the best answer starting at a particular offset in the list of CEs.
* If the same offset is visited again during the recursion, the answer
* in the history list is used.
*
* @param ces - the CEs
* @param offset - the offset of the first CE in the list to use.
* @param history - the history list. Must be at least as long as
* the number of CEs in the <code>CEList</code>
*
* @return the length of the shortest string.
*/
int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const;
private:
/** The collator associated with this object; presumably what getCollator() returns - TODO confirm. */
UCollator *coll;
/** Presumably the map from a CE to the strings whose CE lists start with it (see getStringList) - TODO confirm. */
CEToStringsMap *ceToCharsStartingWith;
/** Presumably the lower bound of a range of Han CEs - TODO confirm meaning and inclusiveness. */
uint32_t minHan;
/** Presumably the upper bound of a range of Han CEs - TODO confirm meaning and inclusiveness. */
uint32_t maxHan;
/** Four limit values, presumably related to Hangul Jamo CEs - TODO confirm what each slot holds. */
uint32_t jamoLimits[4];
};
#endif // #if !UCONFIG_NO_COLLATION
#endif // #ifndef COLL_DATA_H
|