1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
|
/*
******************************************************************************
* Copyright (C) 2006-2008, 2017-2018 Apple Inc. All Rights Reserved.
*
* originally added per rdar://4768966&4768976 C tokenizer API with binary rule endian swapping
* updated per
* rdar://5378823 Add urbtok_openBinaryRulesNoCopy().
* rdar://35946337 Rewrite urbtok_tokenize & other urbtok_ interfaces to work with new RBBI but be fast enough
* rdar://37249396 Add ICU 57 version of RBBI classes, urbtok57 interfaces for access via RBT, and better tests
******************************************************************************
*/
#ifndef URBTOK_H
#define URBTOK_H
#include <_foundation_unicode/utypes.h>
#if !UCONFIG_NO_BREAK_ITERATION
#include <_foundation_unicode/ubrk.h>
#include <_foundation_unicode/parseerr.h>
/**
* The interfaces here are meant to extend the functionality of the standard
* ubrk_* interfaces in ubrk.h to allow for faster batch tokenization. This
* was primarily intended for Spotlight and related processes. There are two
* versions of these:
*
* The versions prefixed urbtok_ extend the standard ICU RuleBasedBreakIterator
* class. These are intended to fully support all of the current rule syntax used
* by that class, and urbtok_tokenize should give results equivalent to a loop using a
* combination of the standard functions ubrk_next to get the next break (determining
* the length of the previous token) and ubrk_getRuleStatusVec to get a flag value
* formed as the bitwise OR of all of the values in the returned vector, skipping all
* tokens whose flag value is -1. urbtok_tokenize is faster than such a loop since it
* assumes only one pass over the text in the forward direction, and thus skips caching
* of break positions and makes other simplifying assumptions. However, it may not be
* fast enough for Spotlight.
*
* Thus we also include the versions prefixed by urbtok57_, which use a legacy ICU 57
* version of RuleBasedBreakIterator and an Apple subclass RuleBasedTokenizer. These
* versions do not support any RuleBasedBreakIterator rule syntax enhancements from
* later than ICU 57.
*
* The two different sets of functions should not be mixed; urbtok57_getBinaryRules
* should only be used with a UBreakIterator created using urbtok57_openRules;
* urbtok57_tokenize should only be used with a UBreakIterator created using
* urbtok57_openRules or urbtok57_openBinaryRules[NoCopy], etc. Similarly, the
* urbtok_ functions should only be used with other urbtok_ functions.
*/
/**
* Struct for returning token results. urbtok_tokenize and urbtok57_tokenize
* fill in an array of these (their outTokens parameter), one entry per token.
*/
typedef struct RuleBasedTokenRange {
/* Presumably the offset of the start of the token within the text being tokenized -- TODO confirm units. */
signed long location;
/* Length of the token, presumably in the same units as location -- TODO confirm. */
signed long length;
} RuleBasedTokenRange;
/**
* Open a new UBreakIterator for locating text boundaries for a specified locale.
* A UBreakIterator may be used for detecting character, line, word,
* and sentence breaks in text.
*
* This belongs to the urbtok_ family of functions, which should not be mixed
* with the urbtok57_ family.
*
* @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
* UBRK_LINE, UBRK_SENTENCE
* @param locale The locale specifying the text-breaking conventions. Note that
* locale keys such as "lb" and "ss" may be used to modify text break behavior,
* see general discussion of BreakIterator C API.
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified type and locale.
* @see ubrk_open
* @internal
*/
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok_open(UBreakIteratorType type,
const char *locale,
UErrorCode *status);
/**
* Open a new UBreakIterator for tokenizing text using specified breaking rules.
* The urbtok_ functions are intended to support all of the current rule syntax
* of the standard ICU RuleBasedBreakIterator class; the rule syntax itself is
* not described in this header.
*
* This belongs to the urbtok_ family of functions, which should not be mixed
* with the urbtok57_ family.
*
* @param rules A set of rules specifying the text breaking conventions.
* @param rulesLength The number of characters in rules, or -1 if null-terminated.
* @param parseErr Receives position and context information for any syntax errors
* detected while parsing the rules.
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified rules.
* @see ubrk_open
* @internal
*/
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok_openRules(const UChar *rules,
int32_t rulesLength,
UParseError *parseErr,
UErrorCode *status);
/**
* Open a new UBreakIterator for tokenizing text using specified binary breaking rules.
*
* This belongs to the urbtok_ family of functions, which should not be mixed
* with the urbtok57_ family.
*
* @param rules A set of binary rules specifying the text breaking conventions
* (presumably in the form produced by urbtok_getBinaryRules -- TODO confirm).
* The binary rules must be at least 32-bit aligned. Note: This version makes
* a copy of the rules, so after calling this function the caller can close or
* release the rules that were passed to this function. The copy created by
* this call will be freed when ubrk_close() is called on the UBreakIterator*.
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified rules.
* @see ubrk_open
* @see urbtok_openBinaryRulesNoCopy
* @internal
*/
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok_openBinaryRules(const uint8_t *rules,
UErrorCode *status);
/**
* Open a new UBreakIterator for tokenizing text using specified binary breaking
* rules, without copying the rules.
*
* This belongs to the urbtok_ family of functions, which should not be mixed
* with the urbtok57_ family.
*
* @param rules A set of binary rules specifying the text breaking conventions.
* The binary rules must be at least 32-bit aligned. Note: This version does
* NOT make a copy of the rules, so the caller must keep the rules valid
* (must not close or release them) until it is finished with this
* UBreakIterator* and with any others created using the same rules, and
* has called ubrk_close() on all of them.
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified rules.
* @see ubrk_open
* @see urbtok_openBinaryRules
* @internal
*/
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok_openBinaryRulesNoCopy(const uint8_t *rules,
UErrorCode *status);
/**
* Get the (native-endian) binary break rules for this tokenizer.
*
* This belongs to the urbtok_ family of functions, which should not be mixed
* with the urbtok57_ family.
*
* @param bi The tokenizer to use.
* @param buffer The output buffer for the rules. You can pass NULL to get the required size.
* @param buffSize The size of the output buffer (presumably in bytes, since buffer
* is a uint8_t array -- TODO confirm).
* @param status A UErrorCode to receive any errors.
* @return The actual size of the binary rules, whether they fit the buffer or not.
* @internal
*/
U_INTERNAL uint32_t U_EXPORT2
urbtok_getBinaryRules(UBreakIterator *bi,
uint8_t *buffer,
uint32_t buffSize,
UErrorCode *status);
/**
* Tokenize text using a rule-based tokenizer.
* This is primarily intended for speedy batch tokenization using very simple rules.
* It does not currently implement support for all of the features of ICU break rules
* (adding that would reduce performance). If you need support for all of the ICU rule
* features, please use the standard ubrk_* interfaces; instead of urbtok_tokenize,
* use a loop with ubrk_next and ubrk_getRuleStatus.
*
* NOTE(review): the overview comment at the top of this header instead describes
* urbtok_tokenize as intended to give results equivalent to a loop using ubrk_next
* and ubrk_getRuleStatusVec; the paragraph above may be out of date -- confirm
* against the implementation.
*
* This belongs to the urbtok_ family of functions, which should not be mixed
* with the urbtok57_ family.
*
* @param bi The tokenizer to use.
* @param maxTokens The maximum number of tokens to return (presumably the capacity
* of outTokens, and of outTokenFlags if provided -- TODO confirm).
* @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
* @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
* @return The number of tokens returned, 0 if done.
* @internal
*/
U_INTERNAL int32_t U_EXPORT2
urbtok_tokenize(UBreakIterator *bi,
int32_t maxTokens,
RuleBasedTokenRange *outTokens,
unsigned long *outTokenFlags);
/**
* Swap the endianness of a set of binary break rules, writing the result to a
* separate output buffer.
* @param rules A set of binary rules which need swapping.
* @param buffer The output buffer for the swapped rules, which must be the same
* size as the input rules buffer.
* @param inIsBigEndian UBool indicating whether the input is big-endian
* @param outIsBigEndian UBool indicating whether the output should be big-endian
* @param status A UErrorCode to receive any errors.
* @internal
*/
U_INTERNAL void U_EXPORT2
urbtok_swapBinaryRules(const uint8_t *rules,
uint8_t *buffer,
UBool inIsBigEndian,
UBool outIsBigEndian,
UErrorCode *status);
/**
* Open a new UBreakIterator for tokenizing text using specified breaking rules.
* The urbtok57_ functions use a legacy ICU 57 version of RuleBasedBreakIterator,
* so the rules must not rely on rule syntax enhancements from later than ICU 57;
* the rule syntax itself is not described in this header.
*
* This belongs to the urbtok57_ family of functions, which should not be mixed
* with the urbtok_ family.
*
* @param rules A set of rules specifying the text breaking conventions.
* @param rulesLength The number of characters in rules, or -1 if null-terminated.
* @param parseErr Receives position and context information for any syntax errors
* detected while parsing the rules.
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified rules.
* @see ubrk_open
* @internal
*/
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok57_openRules(const UChar *rules,
int32_t rulesLength,
UParseError *parseErr,
UErrorCode *status);
/**
* Open a new UBreakIterator for tokenizing text using specified binary breaking rules.
*
* This belongs to the urbtok57_ family of functions, which should not be mixed
* with the urbtok_ family.
*
* @param rules A set of binary rules specifying the text breaking conventions
* (presumably in the form produced by urbtok57_getBinaryRules -- TODO confirm).
* The binary rules must be at least 32-bit aligned. Note: This version makes
* a copy of the rules, so after calling this function the caller can close or
* release the rules that were passed to this function. The copy created by
* this call will be freed when ubrk_close() is called on the UBreakIterator*.
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified rules.
* @see ubrk_open
* @see urbtok57_openBinaryRulesNoCopy
* @internal
*/
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok57_openBinaryRules(const uint8_t *rules,
UErrorCode *status);
/**
* Open a new UBreakIterator for tokenizing text using specified binary breaking
* rules, without copying the rules.
*
* This belongs to the urbtok57_ family of functions, which should not be mixed
* with the urbtok_ family.
*
* @param rules A set of binary rules specifying the text breaking conventions.
* The binary rules must be at least 32-bit aligned. Note: This version does
* NOT make a copy of the rules, so the caller must keep the rules valid
* (must not close or release them) until it is finished with this
* UBreakIterator* and with any others created using the same rules, and
* has called ubrk_close() on all of them.
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified rules.
* @see ubrk_open
* @see urbtok57_openBinaryRules
* @internal
*/
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok57_openBinaryRulesNoCopy(const uint8_t *rules,
UErrorCode *status);
/**
* Get the (native-endian) binary break rules for this tokenizer.
*
* Per the overview at the top of this header, this should only be used with a
* UBreakIterator created using urbtok57_openRules.
*
* @param bi The tokenizer to use.
* @param buffer The output buffer for the rules. You can pass NULL to get the required size.
* @param buffSize The size of the output buffer (presumably in bytes, since buffer
* is a uint8_t array -- TODO confirm).
* @param status A UErrorCode to receive any errors.
* @return The actual size of the binary rules, whether they fit the buffer or not.
* @internal
*/
U_INTERNAL uint32_t U_EXPORT2
urbtok57_getBinaryRules(UBreakIterator *bi,
uint8_t *buffer,
uint32_t buffSize,
UErrorCode *status);
/**
* Tokenize text using a rule-based tokenizer.
* This is primarily intended for speedy batch tokenization using very simple rules.
* It does not currently implement support for all of the features of ICU break rules
* (adding that would reduce performance). If you need support for all of the ICU rule
* features, please use the standard Apple urbtok_tokenize, or a loop with standard
* ICU interfaces ubrk_next and ubrk_getRuleStatusVec.
*
* This belongs to the urbtok57_ family of functions, which should not be mixed
* with the urbtok_ family; bi should have been created by one of the urbtok57_
* open functions.
*
* @param bi The tokenizer to use.
* @param maxTokens The maximum number of tokens to return (presumably the capacity
* of outTokens, and of outTokenFlags if provided -- TODO confirm).
* @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
* @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
* @return The number of tokens returned, 0 if done.
* @internal
*/
U_INTERNAL int32_t U_EXPORT2
urbtok57_tokenize(UBreakIterator *bi,
int32_t maxTokens,
RuleBasedTokenRange *outTokens,
unsigned long *outTokenFlags);
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif
|