File: urbtok.h

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (276 lines) | stat: -rw-r--r-- 12,299 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
/*
******************************************************************************
* Copyright (C) 2006-2008, 2017-2018 Apple Inc. All Rights Reserved.
*
* originally added per rdar://4768966&4768976 C tokenizer API with binary rule endian swapping
* updated per
* rdar://5378823 Add urbtok_openBinaryRulesNoCopy().
* rdar://35946337 Rewrite urbtok_tokenize & other urbtok_ interfaces to work with new RBBI but be fast enough
* rdar://37249396 Add ICU 57 version of RBBI classes, urbtok57 interfaces for access via RBT, and better tests
******************************************************************************
*/

#ifndef URBTOK_H
#define URBTOK_H

#include <_foundation_unicode/utypes.h>

#if !UCONFIG_NO_BREAK_ITERATION

#include <_foundation_unicode/ubrk.h>
#include <_foundation_unicode/parseerr.h>

/**
 * The interfaces here are meant to extend the functionality of the standard
 * ubrk_* interfaces in ubrk.h to allow for faster batch tokenization. This
 * was primarily intended for Spotlight and related processes. There are two
 * versions of these:
 *
 * The versions prefixed urbtok_ extend the standard ICU RuleBasedBreakIterator
 * class. These are intended to fully support all of the current rule syntax used
 * by that class, and urbtok_tokenize should give results equivalent to a loop using a
 * combination of the standard functions ubrk_next to get the next break (determining
 * the length of the previous token) and ubrk_getRuleStatusVec to get a flag value
 * formed as the bitwise OR of all of the values in the returned vector, skipping all
 * tokens whose flag value is -1. urbtok_tokenize is faster than such a loop since it
 * assumes only one pass over the text in the forward direction, and thus skips caching
 * of break positions and makes other simplifying assumptions. However, it may not be
 * fast enough for Spotlight.
 *
 * Thus we also include the versions prefixed by urbtok57_, which use a legacy ICU 57
 * version of RuleBasedBreakIterator and an Apple subclass RuleBasedTokenizer. These
 * versions do not support any RuleBasedBreakIterator rule syntax enhancements from
 * later than ICU 57.
 *
 * The two different sets of functions should not be mixed; urbtok57_getBinaryRules
 * should only be used with a UBreakIterator created using urbtok57_openRules;
 * urbtok57_tokenize should only be used with a UBreakIterator created using
 * urbtok57_openRules or urbtok57_openBinaryRules[NoCopy], etc. Similarly, the
 * urbtok_ functions should only be used with other urbtok_ functions.
 */
 
/**
 * Result record for one token, as filled in by urbtok_tokenize and
 * urbtok57_tokenize (one entry of their outTokens array per token).
 *
 *   location - where the token starts
 *   length   - how long the token is
 *
 * NOTE(review): both values are presumably expressed in UChar (UTF-16) code
 * units relative to the start of the iterator's text - confirm.
 */
struct RuleBasedTokenRange {
    long location;
    long length;
};
typedef struct RuleBasedTokenRange RuleBasedTokenRange;

/**
 * Open a new UBreakIterator for locating text boundaries for a specified locale.
 * A UBreakIterator may be used for detecting character, line, word,
 * and sentence breaks in text.
 *
 * This belongs to the urbtok_ family: the resulting iterator should only be
 * used with other urbtok_ functions, never with the urbtok57_ ones.
 *
 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
 * UBRK_LINE, UBRK_SENTENCE
 * @param locale The locale specifying the text-breaking conventions. Note that
 * locale keys such as "lb" and "ss" may be used to modify text break behavior,
 * see general discussion of BreakIterator C API.
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified type and locale.
 *         NOTE(review): presumably released with ubrk_close(), like the
 *         iterators from the other open functions in this header - confirm.
 * @see ubrk_open
 * @internal
 */
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok_open(UBreakIteratorType type,
           const char *locale,
           UErrorCode *status);

/**
 * Open a new UBreakIterator for tokenizing text using specified breaking rules.
 * The urbtok_ functions extend the standard ICU RuleBasedBreakIterator and are
 * intended to fully support all of the current rule syntax used by that class.
 *
 * The resulting iterator should only be used with other urbtok_ functions,
 * never with the urbtok57_ ones.
 *
 * @param rules A set of rules specifying the text breaking conventions.
 * @param rulesLength The number of characters in rules, or -1 if null-terminated.
 * @param parseErr   Receives position and context information for any syntax errors
 *                   detected while parsing the rules.
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
 * @internal
 */
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok_openRules(const UChar     *rules,
               int32_t         rulesLength,
               UParseError     *parseErr,
               UErrorCode      *status);

/**
 * Open a new UBreakIterator for tokenizing text using specified binary
 * (precompiled) breaking rules. This is the copying variant; see
 * urbtok_openBinaryRulesNoCopy for the non-copying one.
 *
 * The resulting iterator should only be used with other urbtok_ functions,
 * never with the urbtok57_ ones.
 *
 * @param rules A set of rules specifying the text breaking conventions. The binary rules
 *              must be at least 32-bit aligned. Note: This version makes a copy of the
 *              rules, so after calling this function the caller can close or release
 *              the rules that were passed to this function. The copy created by this
 *              call will be freed when ubrk_close() is called on the UBreakIterator*.
 *              NOTE(review): no size is passed, so the length of the rules is
 *              presumably read from the binary data itself - confirm.
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
 * @see urbtok_openBinaryRulesNoCopy
 * @internal
 */
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok_openBinaryRules(const uint8_t *rules,
               UErrorCode      *status);

/**
 * Open a new UBreakIterator for tokenizing text using specified binary
 * (precompiled) breaking rules, WITHOUT copying them. The caller keeps
 * ownership of the rules and must keep them alive for as long as any
 * iterator created from them is open.
 *
 * The resulting iterator should only be used with other urbtok_ functions,
 * never with the urbtok57_ ones.
 *
 * @param rules A set of rules specifying the text breaking conventions. The binary rules
 *              must be at least 32-bit aligned. Note: This version does NOT make a copy
 *              of the rules, so after calling this function the caller must not close or
 *              release the rules passed to this function until after they are finished
 *              with this UBreakIterator* (and any others created using the same rules)
 *              and have called ubrk_close() to close the UBreakIterator* (and any others
 *              using the same rules).
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
 * @see urbtok_openBinaryRules
 * @internal
 */
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok_openBinaryRulesNoCopy(const uint8_t *rules,
               UErrorCode      *status);

/**
 * Get the (native-endian) binary break rules for this tokenizer.
 *
 * Typical use is two calls: first with a NULL buffer to learn the required
 * size, then with a buffer of at least that many bytes.
 *
 * @param bi The tokenizer to use. Should have been created by a urbtok_ open
 *           function (not a urbtok57_ one).
 * @param buffer The output buffer for the rules. You can pass 0 (NULL) to get the
 *               required size.
 * @param buffSize The size of the output buffer.
 *                 NOTE(review): presumably in bytes, since buffer is uint8_t* - confirm.
 * @param status A UErrorCode to receive any errors.
 * @return The actual size of the binary rules, whether they fit the buffer or not.
 * @internal
 */
U_INTERNAL uint32_t U_EXPORT2
urbtok_getBinaryRules(UBreakIterator      *bi,
                uint8_t             *buffer,
                uint32_t            buffSize,
                UErrorCode          *status);

/**
 * Tokenize text using a rule-based tokenizer.
 * This is primarily intended for speedy batch tokenization: it assumes a single
 * pass over the text in the forward direction and skips caching of break
 * positions. The urbtok_ functions are intended to fully support the current
 * RuleBasedBreakIterator rule syntax, and the results should be equivalent to a
 * loop using ubrk_next (to delimit each token) and ubrk_getRuleStatusVec (each
 * token's flag value being the bitwise OR of all values in the returned vector),
 * skipping all tokens whose flag value is -1.
 *
 * NOTE(review): this comment previously said that urbtok_tokenize does not
 * support all ICU break rule features and recommended a ubrk_next /
 * ubrk_getRuleStatus loop instead. That conflicts with the overview comment in
 * this header and with the urbtok57_tokenize comment (which points here for full
 * rule support), so it looked stale - confirm against the implementation.
 *
 * @param bi The tokenizer to use. Should have been created by a urbtok_ open
 *           function (not a urbtok57_ one).
 * @param maxTokens The maximum number of tokens to return.
 * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
 *                  NOTE(review): presumably must have room for maxTokens entries - confirm.
 * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
 *                  NOTE(review): "optional" presumably means NULL may be passed - confirm.
 * @return The number of tokens returned, 0 if done.
 * @internal
 */
U_INTERNAL int32_t U_EXPORT2
urbtok_tokenize(UBreakIterator      *bi,
               int32_t              maxTokens,
               RuleBasedTokenRange  *outTokens,
               unsigned long        *outTokenFlags);

/**
 * Swap the endianness of a set of binary break rules.
 *
 * @param rules A set of rules which need swapping.
 *              NOTE(review): no size is passed, so the length is presumably
 *              read from the binary data itself - confirm.
 * @param buffer The output buffer for the swapped rules, which must be the same
 *               size as the input rules buffer.
 *               NOTE(review): whether buffer may alias rules (in-place swap)
 *               is not stated here - confirm before relying on it.
 * @param inIsBigEndian UBool indicating whether the input is big-endian
 * @param outIsBigEndian UBool indicating whether the output should be big-endian
 * @param status A UErrorCode to receive any errors.
 * @internal
 */
U_INTERNAL void U_EXPORT2
urbtok_swapBinaryRules(const uint8_t *rules,
               uint8_t          *buffer,
               UBool            inIsBigEndian,
               UBool            outIsBigEndian,
               UErrorCode       *status);



/**
 * Open a new UBreakIterator for tokenizing text using specified breaking rules
 * (legacy ICU 57 variant).
 * The urbtok57_ functions use a legacy ICU 57 version of RuleBasedBreakIterator
 * (with the Apple subclass RuleBasedTokenizer), so the rules must not rely on any
 * RuleBasedBreakIterator rule syntax enhancements from later than ICU 57.
 *
 * The resulting iterator should only be used with other urbtok57_ functions,
 * never with the urbtok_ ones.
 *
 * @param rules A set of rules specifying the text breaking conventions.
 * @param rulesLength The number of characters in rules, or -1 if null-terminated.
 * @param parseErr   Receives position and context information for any syntax errors
 *                   detected while parsing the rules.
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
 * @internal
 */
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok57_openRules(const UChar     *rules,
               int32_t         rulesLength,
               UParseError     *parseErr,
               UErrorCode      *status);

/**
 * Open a new UBreakIterator for tokenizing text using specified binary
 * (precompiled) breaking rules (legacy ICU 57 variant). This is the copying
 * variant; see urbtok57_openBinaryRulesNoCopy for the non-copying one.
 *
 * The resulting iterator should only be used with other urbtok57_ functions,
 * never with the urbtok_ ones.
 *
 * @param rules A set of rules specifying the text breaking conventions. The binary rules
 *              must be at least 32-bit aligned. Note: This version makes a copy of the
 *              rules, so after calling this function the caller can close or release
 *              the rules that were passed to this function. The copy created by this
 *              call will be freed when ubrk_close() is called on the UBreakIterator*.
 *              NOTE(review): no size is passed, so the length of the rules is
 *              presumably read from the binary data itself - confirm.
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
 * @see urbtok57_openBinaryRulesNoCopy
 * @internal
 */
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok57_openBinaryRules(const uint8_t *rules,
               UErrorCode      *status);

/**
 * Open a new UBreakIterator for tokenizing text using specified binary
 * (precompiled) breaking rules, WITHOUT copying them (legacy ICU 57 variant).
 * The caller keeps ownership of the rules and must keep them alive for as long
 * as any iterator created from them is open.
 *
 * The resulting iterator should only be used with other urbtok57_ functions,
 * never with the urbtok_ ones.
 *
 * @param rules A set of rules specifying the text breaking conventions. The binary rules
 *              must be at least 32-bit aligned. Note: This version does NOT make a copy
 *              of the rules, so after calling this function the caller must not close or
 *              release the rules passed to this function until after they are finished
 *              with this UBreakIterator* (and any others created using the same rules)
 *              and have called ubrk_close() to close the UBreakIterator* (and any others
 *              using the same rules).
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
 * @see urbtok57_openBinaryRules
 * @internal
 */
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok57_openBinaryRulesNoCopy(const uint8_t *rules,
               UErrorCode      *status);

/**
 * Get the (native-endian) binary break rules for this tokenizer
 * (legacy ICU 57 variant).
 *
 * Typical use is two calls: first with a NULL buffer to learn the required
 * size, then with a buffer of at least that many bytes.
 *
 * @param bi The tokenizer to use. Should only be used with a UBreakIterator
 *           created using urbtok57_openRules.
 * @param buffer The output buffer for the rules. You can pass 0 (NULL) to get the
 *               required size.
 * @param buffSize The size of the output buffer.
 *                 NOTE(review): presumably in bytes, since buffer is uint8_t* - confirm.
 * @param status A UErrorCode to receive any errors.
 * @return The actual size of the binary rules, whether they fit the buffer or not.
 * @internal
 */
U_INTERNAL uint32_t U_EXPORT2
urbtok57_getBinaryRules(UBreakIterator      *bi,
                uint8_t             *buffer,
                uint32_t            buffSize,
                UErrorCode          *status);

/**
 * Tokenize text using a rule-based tokenizer (legacy ICU 57 variant).
 * This is primarily intended for speedy batch tokenization using very simple rules.
 * It does not currently implement support for all of the features of ICU break rules
 * (adding that would reduce performance). If you need support for all of the ICU rule
 * features, please use the standard Apple urbtok_tokenize, or a loop with standard
 * ICU interfaces ubrk_next and ubrk_getRuleStatusVec.
 *
 * @param bi The tokenizer to use. Should have been created by a urbtok57_ open
 *           function (not a urbtok_ one).
 * @param maxTokens The maximum number of tokens to return.
 * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens.
 *                  NOTE(review): presumably must have room for maxTokens entries - confirm.
 * @param outTokenFlags An (optional) array of unsigned long to fill in with token flags.
 *                  NOTE(review): "optional" presumably means NULL may be passed - confirm.
 * @return The number of tokens returned, 0 if done.
 * @internal
 */
U_INTERNAL int32_t U_EXPORT2
urbtok57_tokenize(UBreakIterator      *bi,
               int32_t              maxTokens,
               RuleBasedTokenRange  *outTokens,
               unsigned long        *outTokenFlags);

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

#endif