File: rbtok.h

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (136 lines) | stat: -rw-r--r-- 4,167 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/*
***************************************************************************
* Copyright (C) 2006-2008, 2018 Apple Inc. All Rights Reserved.  
*
* originally added per rdar://4755956 Add Apple tokenizer subclass of RBBI
* updated per
* rdar://35946337 Rewrite urbtok_tokenize & other urbtok_ interfaces to work with new RBBI but be fast enough
* rdar://37249396 Add ICU 57 version of RBBI classes, urbtok57 interfaces for access via RBT, and better tests
* rdar://5378823 Add urbtok_openBinaryRulesNoCopy().
***************************************************************************
*
***************************************************************************
* This uses the ICU 57 legacy version of RuleBasedBreakIterator for
* performance reasons, does not support the RuleBasedBreakIterator rule
* syntax updates from ICU 60 and later, and requires both forward and
* reverse rules (as in ICU 57).
***************************************************************************
*/

#ifndef RBTOK_H
#define RBTOK_H

#include <_foundation_unicode/utypes.h>

/**
 * \file
 * \brief C++ API: Rule Based Tokenizer
 */

#if !UCONFIG_NO_BREAK_ITERATION

#include <_foundation_unicode/urbtok.h>
#include <_foundation_unicode/parseerr.h>
#include "rbbidata57.h"
#include "rbbi57.h"


U_NAMESPACE_BEGIN

/**
 * Forward declarations of the ICU 57 RBBI data structures.
 * Within this header only RBBIStateTableRow57 is referenced directly
 * (as the pointee type of RuleBasedTokenizer::fStartRow).
 * @internal
 */
struct RBBIDataHeader57;
struct RBBIStateTableRow57;


/**
 *
 * A subclass of RuleBasedBreakIterator57 that adds tokenization functionality.
 *
 * <p>As noted in the file header, this is built on the ICU 57 legacy version of
 * RuleBasedBreakIterator: it does not support the rule syntax updates from
 * ICU 60 and later, and requires both forward and reverse rules.</p>
 *
 * <p>This class is for internal use only by Apple Inc.</p>
 *
 */
class U_COMMON_API RuleBasedTokenizer : public RuleBasedBreakIterator57 {

private:
    /**
     * The state table row corresponding to the start state.
     * @internal
     */
    const RBBIStateTableRow57 *fStartRow;

    /**
     * The merged flag results for accepting states.
     * NOTE(review): presumably allocated by init() and released by the
     * destructor -- confirm in the implementation file.
     * @internal
     */
    int32_t *fStateFlags;

    /**
     * Character categories for the Latin1 subset of Unicode.
     * NOTE(review): presumably allocated by init() and released by the
     * destructor -- confirm in the implementation file.
     * @internal
     */
    int16_t *fLatin1Cat;

public:
    /**
     * Construct a RuleBasedTokenizer from a set of rules supplied as a string.
     * @param rules    The break rules to be used.
     * @param parseErr In the event of a syntax error in the rules, provides the location
     *                 within the rules of the problem.
     * @param status   Information on any errors encountered.
     * @internal, used by urbtok57.cpp
     */
    RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &status);

    /**
     * Constructor from a flattened set of RBBI data in uprv_malloc'd memory.
     *             Rule-based break iterators built from a custom set of rules
     *             are created via this constructor; the rules are compiled
     *             into memory, then the break iterator is constructed here.
     *
     *             The break iterator adopts the memory, and will
     *             free it when done.
     * @param data   The flattened RBBI data; ownership passes to this object.
     * @param status Information on any errors encountered.
     * @internal, used by urbtok57.cpp
     */
    RuleBasedTokenizer(uint8_t *data, UErrorCode &status);

    /**
     * Tag type whose only purpose is to select the non-adopting constructor
     * overload declared immediately below.
     * @internal
     */
    enum EDontAdopt {
        kDontAdopt
    };
    /**
     * Constructor from a flattened set of RBBI data in memory which need not
     *             be malloced (e.g. it may be a memory-mapped file, etc.).
     *
     *             This version does not adopt the memory, and does not
     *             free it when done.
     * @param data      The flattened RBBI data; the caller keeps ownership.
     * @param dontAdopt Overload selector; pass kDontAdopt.
     * @param status    Information on any errors encountered.
     * @internal, used by urbtok57.cpp
     */
    RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt dontAdopt, UErrorCode &status);

    /**
     * Destructor
     *  @internal
     */
    virtual ~RuleBasedTokenizer();

    /**
     * Fetch the next set of tokens.
     * @param maxTokens The maximum number of tokens to return.
     * @param outTokenRanges Pointer to output array of token ranges.
     *                       NOTE(review): presumably must have room for
     *                       maxTokens entries -- confirm against the implementation.
     * @param outTokenFlags (optional) pointer to output array of token flags.
     * @return NOTE(review): presumably the number of tokens written to the
     *         output arrays -- confirm against the implementation.
     * @internal, used by urbtok57.cpp
     */
    int32_t tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags);

private:
    /**
      * Common initialization function, used by constructors.
      * @internal
      */
    void init();
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

#endif