File: rbtok.cpp

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (298 lines) | stat: -rw-r--r-- 10,194 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
/*
***************************************************************************
* Copyright (C) 2006-2008,2017-2018 Apple Inc. All Rights Reserved.
*
* originally added per rdar://4755956&4769693 Rule-based tokenizer optimized implementation
* updated per rdar://5378823 Add urbtok_openBinaryRulesNoCopy().
***************************************************************************
*/

#include <_foundation_unicode/utypes.h>

#if !UCONFIG_NO_BREAK_ITERATION

#include <_foundation_unicode/ustring.h>
#include <_foundation_unicode/utext.h>
#include "rbbidata57.h"
#include "rbbi57.h"
#include "rbtok.h"
#include "uassert.h"

#ifdef RBBI_DEBUG
// The trace flag is now a file-static in rbbi.cpp, where it gets set dynamically.
// For now it is duplicated here so that this file builds; change the initializer
// to true if trace output from this file is desired.
static UBool fTrace = false;
#endif

U_NAMESPACE_BEGIN


// State-transition value that means "stop": the state machine halts when it gets here.
static const int16_t STOP_STATE  = 0;
// Number of the state in which every run of the state machine begins.
static const int16_t START_STATE = 1;

// Tokenizes the text in fText, starting at the text's current native index.
//
// Each pass of the outer loop runs the forward state table once and yields one
// candidate token covering [prev, result).  Candidates are appended to the output
// arrays until one of the following happens:
//   - maxTokens ranges have been written,
//   - the end of the text is reached,
//   - a token whose flags have bit 0x40000000 set has been emitted.
// A candidate whose flags value is -1 is consumed but not written to the output.
//
// @param maxTokens       Number of entries available in outTokenRanges (and in
//                        outTokenFlags when that is supplied).
// @param outTokenRanges  Receives one range per emitted token, initialized from
//                        the token start (prev) and its length (result - prev).
// @param outTokenFlags   May be NULL.  Otherwise receives, for each emitted token,
//                        the fStateFlags entry of the last accepting state
//                        (or 0 when the position had to be forced forward).
// @return                The number of ranges written to outTokenRanges.
int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
{
    RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;   // one past the last writable slot
    RuleBasedTokenRange *outTokenP = outTokenRanges;                   // next slot to write
    int32_t             state;
    uint16_t            category = 0;

    const RBBIStateTableRow57  *row;
    const RBBIStateTableRow57  *const startRow = fStartRow;

    // NOTE: lastAcceptingState is deliberately declared outside the token loop and is
    // not reset per token; a token that never reaches an accepting state (for example
    // one taken from the cached break positions) reuses the previous value.
    int32_t             lastAcceptingState = 0;
    UChar32             c = 0;
    signed long         prev;     // native index at which the current token starts
    signed long         result;   // native index at which the current token ends
    const char         *const tableData       = fData->fForwardTable->fTableData;
    const uint32_t            tableRowLen     = fData->fForwardTable->fRowLen;
    UText *text = fText;

    #ifdef RBBI_DEBUG
        if (fTrace) {
            RBBIDebugPuts("Handle Next   pos   char  state category");
        }
    #endif

    fLastStatusIndexValid = false;

    // The first token starts wherever the text iterator currently is.
    prev = (signed long)UTEXT_GETNATIVEINDEX(text);

    // Produce one token per iteration until the output buffer is full.
    // The loop is also left early via "goto exitTokenizer" (see below).
    //
    const UTrie *trie = &fData->fTrie;
    while (outTokenP < outTokenLimit) {
        result = prev; // fallback initialization
        c = UTEXT_NEXT32(text);
        if (c == U_SENTINEL)
        {
            // No more text: return what has been collected so far.
            goto exitTokenizer;
        }
        //  Set the initial state for the state machine
        state = START_STATE;
        row = startRow;

        // if we have cached break positions and we're still in the range
        // covered by them, just move one step forward in the cache.
        // Otherwise call reset() (defined elsewhere; presumably it discards the
        // cache - TODO confirm) and fall through to the state machine.
        if (fCachedBreakPositions != NULL) {
            if (fPositionInCache < fNumCachedBreakPositions - 1) {
                ++fPositionInCache;
                result = fCachedBreakPositions[fPositionInCache];
                goto emitToken;
            }
            else {
                reset();
            }
        }

        // Run the state machine over successive characters until it reaches
        // STOP_STATE or the text runs out.
        while (c != U_SENTINEL) {
            //
            // Get the char category.
            //
                // look up the current character's character category, which tells us
                // which column in the state table to look at.  Code points below 0x100
                // use the fLatin1Cat table built by init(); all others go through the trie.
                // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
                //        not the size of the character going in, which is a UChar32.
                //
                if (c < 0x100)
                    category = fLatin1Cat[c];
                else
                    UTRIE_GET16(trie, c, category);

                // Check the dictionary bit in the character's category.
                //    Counter is only used by dictionary based iterators (subclasses).
                //    Chars that need to be handled by a dictionary have a flag bit set
                //    in their category values.
                //
                if ((category & 0x4000) != 0)  {
                    fDictionaryCharCount++;
                    //  And off the dictionary flag bit.
                    category &= ~0x4000;
                }

            #ifdef RBBI_DEBUG
                if (fTrace) {
                    RBBIDebugPrintf("             %4lld   ", utext_getNativeIndex(fText));
                    if (0x20<=c && c<0x7f) {
                        RBBIDebugPrintf("\"%c\"  ", c);
                    } else {
                        RBBIDebugPrintf("%5x  ", c);
                    }
                    RBBIDebugPrintf("%3d  %3d\n", state, category);
                }
            #endif

            // State Transition - move machine to its next state
            //

            // Note: fNextState is defined as uint16_t[2], but we are casting
            // a generated RBBI table to RBBIStateTableRow57 and some tables
            // actually have more than 2 categories.
            U_ASSERT(category<fData->fHeader->fCatCount);
            state = row->fNextState[category];
            // Rows are tableRowLen bytes apart, so the row for a state is found by byte offset.
            row = (const RBBIStateTableRow57 *) (tableData + tableRowLen * state);

            if (row->fAccepting == -1) {
                // Match found, common case.
                // The text index is already past c, so the match ends after c.
                    result = (signed long)UTEXT_GETNATIVEINDEX(text);
                //fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
                //lastStatusRow = row;
                lastAcceptingState = state;
            }
    
            if (state == STOP_STATE) {
                // This is the normal exit from the lookup state machine.
                // We have advanced through the string until it is certain that no
                //   longer match is possible, no matter what characters follow.
                break;
            }

            // Advance to the next character.
                c = UTEXT_NEXT32(text);
        }

        // If any character seen so far carried the dictionary flag, let checkDictionary()
        // (defined elsewhere) decide the final end position for [prev, result).
        // NOTE(review): fDictionaryCharCount is not reset in this function; presumably
        // checkDictionary() or reset() clears it - confirm.
        if (fDictionaryCharCount > 0) {
            result = (signed long) checkDictionary(prev, (int32_t) result, false);
        }

emitToken:
        // The state machine is done (or a cached position was used).  Check whether it found a match...

        // If the iterator failed to advance in the match engine, force it ahead by one.
        //   (This really indicates a defect in the break rules.  They should always match
        //    at least one character.). Added in open-source ICU r13469
        // A token produced this way is reported with flags of 0.
        UBool setFlagsZero = false;
        if (result == prev) {
            UTEXT_SETNATIVEINDEX(text, prev);
            UTEXT_NEXT32(text);
            // NOTE(review): this cast is int32_t while result is signed long and every
            // other assignment to result casts to signed long - confirm whether the
            // narrowing is intended.
            result = (int32_t)UTEXT_GETNATIVEINDEX(text);
            setFlagsZero = true;
        }

        // Leave the iterator at our result position.
        UTEXT_SETNATIVEINDEX(text, result);

        RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
        int32_t flags = (!setFlagsZero)? fStateFlags[lastAcceptingState]: 0;

        // A flags value of -1 marks a token that is consumed but not reported.
        if (flags == -1) {
            goto skipToken;
        }

    #ifdef RBBI_DEBUG
        if (fTrace) {
            RBBIDebugPrintf("Emit location %3ld length %2ld flags %08X\n", range.location, range.length, flags);
        }
    #endif
        *outTokenP++ = range;
        if (outTokenFlags)
        {
            *outTokenFlags++ = (unsigned long) flags;
        }

        // Bit 0x40000000 in the flags ends tokenization right after this token is emitted.
        if (flags & 0x40000000) {
            goto exitTokenizer;
        }

skipToken:
        // The next token starts where this one ended.
        prev = result;
    }

exitTokenizer:
    // Number of ranges actually written.
    return (outTokenP - outTokenRanges);
}

// Builds the per-instance lookup tables that tokenize() relies on:
//   - fStartRow:   pointer to the START_STATE row of the forward state table.
//   - fLatin1Cat:  raw trie value for each code point 0x00..0xFF, so tokenize()
//                  can skip the trie lookup for those characters.
//   - fStateFlags: one flag word per state, derived from the rule status values
//                  of accepting states (0 for every other state).
// Also sets the break type to UBRK_WORD.
//
// The arrays allocated here are released by the destructor.  The visible callers
// are the constructors, which invoke this once after fStateFlags and fLatin1Cat
// have been initialized to NULL.
void
RuleBasedTokenizer::init()
{
    const RBBIStateTable57 *statetable = fData->fForwardTable;
    setBreakType(UBRK_WORD);
    fStartRow = (const RBBIStateTableRow57 *)
        (statetable->fTableData + (statetable->fRowLen * START_STATE));

    // Cache the trie value of every Latin-1 code point.
    const UTrie         *trie = &fData->fTrie;
    fLatin1Cat = new int16_t[256];
    for (int32_t ch = 0; ch < 256; ++ch)
    {
        fLatin1Cat[ch] = _UTRIE_GET_RAW(trie, index, 0, ch);
    }

    // Compute the flag word of every state.  The index is a full 32-bit value:
    // a 16-bit index (as used previously) would wrap around and never terminate
    // the loop if the table held 65536 or more states.
    const uint32_t numStates = statetable->fNumStates;
    fStateFlags = new int32_t[numStates];
    for (uint32_t stateIdx = 0; stateIdx < numStates; ++stateIdx)
    {
        const RBBIStateTableRow57 *row = (const RBBIStateTableRow57 *)
            (statetable->fTableData + (statetable->fRowLen * stateIdx));
        int32_t flags = 0;
        if (row->fAccepting == -1 && row->fTagIdx != 0)
        {
            // The status-table entry at fTagIdx is a count followed by that many values.
            const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
            const int32_t *valLimit = vals + 1;
            valLimit += *vals++;
            while (vals < valLimit)
            {
                int32_t val = *vals++;
                if (val == 0)
                {
                    // A zero value ends the list early.
                    break;
                }
                else if (val > 0)
                {
                    // Positive values are OR-ed together.
                    flags |= val;
                }
                else
                {
                    // A negative value replaces everything collected so far and ends the list.
                    flags = val;
                    break;
                }
            }
        }
        fStateFlags[stateIdx] = flags;
    }
}

// Constructs a tokenizer from rule source text.
// All three arguments are forwarded unchanged to the RuleBasedBreakIterator57
// constructor; the tokenizer's own lookup tables are built only if err reports
// success afterwards, otherwise they stay NULL.
RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules,
                                       UParseError &parseErr,
                                       UErrorCode &err)
    : RuleBasedBreakIterator57(rules, parseErr, err),
      fStateFlags(NULL),
      fLatin1Cat(NULL)
{
    if (U_FAILURE(err)) {
        return;     // base construction failed: leave the tables unallocated
    }
    init();
}

// Constructs a tokenizer from pre-compiled binary rule data.
// The buffer is reinterpreted as an RBBIDataHeader57 and handed to the
// RuleBasedBreakIterator57 constructor (this overload does not pass kDontAdopt;
// ownership handling is decided by the base class).  The tokenizer's lookup
// tables are built only if status reports success afterwards.
RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
    : RuleBasedBreakIterator57(reinterpret_cast<RBBIDataHeader57 *>(data), status),
      fStateFlags(NULL),
      fLatin1Cat(NULL)
{
    if (U_FAILURE(status)) {
        return;     // base construction failed: leave the tables unallocated
    }
    init();
}

// Constructs a tokenizer from pre-compiled binary rule data without adopting it.
// The buffer is reinterpreted as a const RBBIDataHeader57 and handed to the
// RuleBasedBreakIterator57 constructor together with kDontAdopt.  The unnamed
// EDontAdopt parameter only selects this overload.  The tokenizer's lookup
// tables are built only if status reports success afterwards.
RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status)
    : RuleBasedBreakIterator57(reinterpret_cast<const RBBIDataHeader57 *>(data),
                               RuleBasedBreakIterator57::kDontAdopt,
                               status),
      fStateFlags(NULL),
      fLatin1Cat(NULL)
{
    if (U_FAILURE(status)) {
        return;     // base construction failed: leave the tables unallocated
    }
    init();
}

// Frees the two lookup tables allocated by init().  Either pointer may still be
// NULL if construction failed before init() ran; delete[] on NULL is a no-op.
RuleBasedTokenizer::~RuleBasedTokenizer()
{
    delete [] fLatin1Cat;
    delete [] fStateFlags;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */