File: strsrch.h

package info (click to toggle)
icu 2.1-2.1
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 38,556 kB
  • ctags: 18,435
  • sloc: cpp: 118,545; ansic: 98,775; makefile: 3,759; sh: 3,178; perl: 1,325; lisp: 3
file content (393 lines) | stat: -rw-r--r-- 15,190 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
/*
**********************************************************************
*   Copyright (C) 1999-2000 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  03/22/2000   helena      Creation.
**********************************************************************
*/
#ifndef STRSRCH_H
#define STRSRCH_H

#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/tblcoll.h"
#include "unicode/brkiter.h"
#include "srchiter.h"


class SearchIterator;
/**
 * <code>StringSearch</code> is a <code>SearchIterator</code> that provides
 * language-sensitive text searching based on the comparison rules defined
 * in a {@link RuleBasedCollator} object.
 * Instances of <code>StringSearch</code> function as iterators
 * maintain a current position and scan over text returning the index of
 * characters where the pattern occurs and the length of each match.
 * <p>
 * <code>StringSearch</code> uses a version of the fast Boyer-Moore search
 * algorithm that has been adapted to work with the large character set of
 * Unicode.  See "Efficient Text Searching in Java", to be published in
 * <i>Java Report</i> in February, 1999, for further information on the algorithm.
 * <p>
 * Consult the <code>SearchIterator</code> documentation for information on
 * and examples of how to use instances of this class to implement text
 * searching.  <code>SearchIterator</code> provides all of the necessary
 * API; this class only provides constructors and internal implementation
 * methods.
 *
 * @see SearchIterator
 * @see RuleBasedCollator
 *
 * @author Laura Werner
 * @version 1.0
 */

class StringSearch : public SearchIterator
{
public:
    /**
     * Construct a <code>StringSearch</code> object using a specific collator and set
     * of boundary-detection rules.
     * <p>
     * @param pat       The text for which this object will search.
     *
     * @param target    The text in which to search for the pattern.
     *
     * @param coll      A <code>RuleBasedCollator</code> object which defines the
     *                  language-sensitive comparison rules used to determine 
     *                  whether text in the pattern and target matches.
     *
     * @param breaker   A <code>BreakIterator</code> object used to constrain the matches
     *                  that are found.  Matches whose start and end indices
     *                  in the target text are not boundaries as determined
     *                  by the <code>BreakIterator</code> are ignored.  If this behavior
     *                  is not desired, <code>null</code> can be passed in instead.
     */
    StringSearch(const UnicodeString& pat, 
                        CharacterIterator* target,
                        RuleBasedCollator* coll, 
                        BreakIterator* breaker,
                        UErrorCode& status);

    /**
     * Construct a <code>StringSearch</code> object using a specific collator.
     * <p>
     * @param pattern   The text for which this object will search.
     *
     * @param target    The text in which to search for the pattern.
     *
     * @param collator  A <code>RuleBasedCollator</code> object which defines the
     *                  language-sensitive comparison rules used to determine 
     *                  whether text in the pattern and target matches.
     */
    StringSearch(const UnicodeString& pattern,
                 CharacterIterator* target,
                 RuleBasedCollator* collator,
                 UErrorCode& status);

    /**
     * copy constructor
     */
    StringSearch(const StringSearch& that);

    /**
     * Construct a <code>StringSearch</code> object using the collator and
     * character boundary detection rules for a given locale
     * <p>
     * @param pattern   The text for which this object will search.
     *
     * @param target    The text in which to search for the pattern.
     *
     * @param loc       The locale whose collation and break-detection rules
     *                  should be used.
     *
     * @exception       ClassCastException thrown if the collator for the specified
     *                  locale is not a RuleBasedCollator.
     */
    StringSearch(const UnicodeString& pattern, 
                 CharacterIterator* target, 
                 const Locale& loc,
                 UErrorCode& status);
    /**
     * Construct a <code>StringSearch</code> object using the collator for the default
     * locale
     * <p>
     * @param pattern   The text for which this object will search.
     *
     * @param target    The text in which to search for the pattern.
     *
     * @param collator  A <code>RuleBasedCollator</code> object which defines the
     *                  language-sensitive comparison rules used to determine 
     *                  whether text in the pattern and target matches.
     */
    StringSearch(const UnicodeString& pattern, 
                 const UnicodeString& target,
                 UErrorCode& status);

    virtual ~StringSearch(void);
    /**
     * Assignment operator.  Sets this iterator to have the same behavior,
     * and iterate over the same text, as the one passed in.
     */
    StringSearch& operator=(const StringSearch& that);

    /**
     * Equality operator.  Returns TRUE if both BreakIterators are of the
     * same class, have the same behavior, and iterate over the same text.
     */
    virtual UBool operator==(const SearchIterator& that) const;

    /**
     * Not-equal operator.  If operator== returns TRUE, this returns FALSE,
     * and vice versa.
     */
    UBool operator!=(const SearchIterator& that) const;

    /**
     * Returns a newly-constructed RuleBasedBreakIterator with the same
     * behavior, and iterating over the same text, as this one.
     */
    virtual SearchIterator* clone(void) const;

    //-------------------------------------------------------------------
    // Getters and Setters
    //-------------------------------------------------------------------
    
    /**
     * Sets this object's strength property. The strength determines the
     * minimum level of difference considered significant during a
     * search.  Generally, {@link Collator#TERTIARY} and 
     * {@link Collator#IDENTICAL} indicate that all differences are
     * considered significant, {@link Collator#SECONDARY} indicates
     * that upper/lower case distinctions should be ignored, and
     * {@link Collator#PRIMARY} indicates that both case and accents
     * should be ignored.  However, the exact meanings of these constants
     * are determined by individual Collator objects.
     * <p>
     * @see Collator#PRIMARY
     * @see Collator#SECONDARY
     * @see Collator#TERTIARY
     * @see Collator#IDENTICAL
     */
     void setStrength(Collator::ECollationStrength newStrength, UErrorCode& status);
    
    
    /**
     * Returns this object's strength property, which indicates what level
     * of differences are considered significant during a search.
     * <p>
     * @see #setStrength
     */
     Collator::ECollationStrength getStrength(void) const{ return strength; }
    
    /**
     * Set the collator to be used for this string search.  Also changes
     * the search strength to match that of the new collator.
     * <p>
     * This method causes internal data such as Boyer-Moore shift tables
     * to be recalculated, but the iterator's position is unchanged.
     * <p>
     * @see #getCollator
     */
     void setCollator(const RuleBasedCollator* coll, UErrorCode& status);
    
    /**
     * Return the RuleBasedCollator being used for this string search.
     */
    const RuleBasedCollator&     getCollator() const;
    
    /**
     * Set the pattern for which to search.  
     * This method causes internal data such as Boyer-Moore shift tables
     * to be recalculated, but the iterator's position is unchanged.
     */
    void setPattern(const UnicodeString& pat, UErrorCode& status);
    
    /**
     * Returns the pattern for which this object is searching.
     */
    const UnicodeString& getPattern() const;
    
    /**
     * Set the target text which should be searched and resets the
     * iterator's position to point before the start of the new text.
     * This method is useful if you want to re-use an iterator to
     * search for the same pattern within a different body of text.
     */
    virtual void setTarget(const UnicodeString& newText);    

    /**
     * Set the target text which should be searched and resets the
     * iterator's position to point before the start of the target text.
     * This method is useful if you want to re-use an iterator to
     * search for the same pattern within a different body of text.
     *
     * @see #getTarget
     */
    virtual void adoptTarget(CharacterIterator* iterator);

    /** Reset iterator
     */
    virtual void reset(void);
    /**
     * Returns a unique class ID POLYMORPHICALLY.  Pure virtual override.
     * This method is to implement a simple version of RTTI, since not all
     * C++ compilers support genuine RTTI.  Polymorphic operator==() and
     * clone() methods call this method.
     *
     * @return          The class ID for this object. All objects of a
     *                  given class have the same class ID.  Objects of
     *                  other classes have different class IDs.
     */
    inline virtual UClassID getDynamicClassID(void) const;

    /**
     * Returns the class ID for this class.  This is useful only for
     * comparing to a return value from getDynamicClassID().  For example:
     *
     *      Base* polymorphic_pointer = createPolymorphicObject();
     *      if (polymorphic_pointer->getDynamicClassID() ==
     *          Derived::getStaticClassID()) ...
     *
     * @return          The class ID for all objects of this class.
     */
    inline static UClassID getStaticClassID(void);

protected:
    //-------------------------------------------------------------------
    // Privates
    //-------------------------------------------------------------------

    /**
     * Search forward for matching text, starting at a given location.
     * Clients should not call this method directly; instead they should call
     * {@link SearchIterator#next}.
     * <p>
     * If a match is found, this method returns the index at which the match
     * starts and calls {@link SearchIterator#setMatchLength}
     * with the number of characters in the target
     * text that make up the match.  If no match is found, the method returns
     * <code>DONE</code> and does not call <tt>setMatchLength</tt>.
     * <p>
     * @param start The index in the target text at which the search starts.
     *
     * @return      The index at which the matched text in the target starts, or DONE
     *              if no match was found.
     * <p>
     * @see SearchIterator#next
     * @see SearchIterator#DONE
     */
    virtual int32_t handleNext(int32_t start, UErrorCode& status);
    /**
     * Search backward for matching text ,starting at a given location.
     * Clients should not call this method directly; instead they should call
     * <code>SearchIterator.previous()</code>, which this method overrides.
     * <p>
     * If a match is found, this method returns the index at which the match
     * starts and calls {@link SearchIterator#setMatchLength}
     * with the number of characters in the target
     * text that make up the match.  If no match is found, the method returns
     * <code>DONE</code> and does not call <tt>setMatchLength</tt>.
     * <p>
     * @param start The index in the target text at which the search starts.
     *
     * @return      The index at which the matched text in the target starts, or DONE
     *              if no match was found.
     * <p>
     * @see SearchIterator#previous
     * @see SearchIterator#DONE
     */
    virtual int32_t handlePrev(int32_t start, UErrorCode& status);
private:
    /**
     * Return a bitmask that will select only the portions of a collation 
     * element that are significant at the given strength level.
     */
    static int32_t getMask(Collator::ECollationStrength strength);
    

    void initialize(UErrorCode& status);
    /**
     * Method used by StringSearch to determine how far to the right to
     * shift the pattern during a Boyer-Moore search.  
     *
     * @param curValue  The current value in the target text
     * @param curIndex  The index in the pattern at which we failed to match
     *                  curValue in the target text.
     */
    int32_t getShift( int32_t curValue, int32_t curIndex ) const;

    /**
     * Method used by StringSearch to determine how far to the left to
     * shift the pattern during a reverse Boyer-Moore search.  
     *
     * @param curValue  The current value in the target text
     * @param curIndex  The index in the pattern at which we failed to match
     *                  curValue in the target text.
     */
    int32_t getBackShift( int32_t curValue, int32_t curIndex ) const;

    /**
     * Hash a collation element from its full size (32 bits) down into a
     * value that can be used as an index into the shift tables.  Right
     * now we do a modulus by the size of the hash table.
     *
     * TODO: At some point I should experiment to see whether a slightly
     * more complicated hash function gives us a better distribution
     * on multilingual text.  I doubt it will have much effect on
     * performance, though.
     */
    static int32_t hash(int32_t order);

    //------------------------------------------------------------------------
    // Private Data
    //
    CollationElementIterator      *iter;
    RuleBasedCollator             *collator;
    /* HSYS ? Why?  Changes to this will not affect collator.  no changes to the comparsion result */
    Collator::ECollationStrength  strength;
    
    //------------------------------------------------------------------------
    // Everything from here on down is the data used to represent the
    // Boyer-Moore shift tables and the code that generates and manipulates
    // them.
    //    
    int32_t         *valueList;
    int32_t         valueListLen;
    int32_t         shiftTable[256];
    int32_t         backShiftTable[256];

    UnicodeString   pattern;            // The pattern string
    int32_t         normLen;        // num. of collation elements in pattern.
    int32_t         minLen;         // Min of composed, decomposed versions
    int32_t         maxLen;         // Max
    CollationElementIterator *it;   // to be removed

private:
    /* to be removed */
    void dumpTables();
    /**
     * Class ID
     */
    static char fgClassID;
};

inline UBool ::StringSearch::operator!=(const SearchIterator& that) const 
{
    return !operator==(that);
}

inline UClassID ::StringSearch::getDynamicClassID(void) const 
{
    return ::StringSearch::getStaticClassID();
}

inline UClassID ::StringSearch::getStaticClassID(void) 
{
    return (UClassID)(&fgClassID);
}

#endif