File: strsrch.cpp

package info (click to toggle)
icu 2.1-2.1
links: PTS
area: main
in suites: sarge
size: 38,556 kB
ctags: 18,435
sloc: cpp: 118,545; ansic: 98,775; makefile: 3,759; sh: 3,178; perl: 1,325; lisp: 3
file content (757 lines) | stat: -rw-r--r-- 25,504 bytes
/*
**********************************************************************
*   Copyright (C) 1999-2000 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  03/22/2000   helena      Creation.
**********************************************************************
*/

#include  <memory.h>
#include "unicode/coleitr.h"
#include "unicode/schriter.h"
#include "strsrch.h"
/**
 * <code>StringSearch</code> is a <code>SearchIterator</code> that provides
 * language-sensitive text searching based on the comparison rules defined
 * in a {@link RuleBasedCollator} object.
 * Instances of <code>StringSearch</code> function as iterators
 * maintain a current position and scan over text returning the index of
 * characters where the pattern occurs and the length of each match.
 * <p>
 * <code>StringSearch</code> uses a version of the fast Boyer-Moore search
 * algorithm that has been adapted to work with the large character set of
 * Unicode.  See "Efficient Text Searching in Java", to be published in
 * <i>Java Report</i> in February, 1999, for further information on the algorithm.
 * <p>
 * Consult the <code>SearchIterator</code> documentation for information on
 * and examples of how to use instances of this class to implement text
 * searching.  <code>SearchIterator</code> provides all of the necessary
 * API; this class only provides constructors and internal implementation
 * methods.
 *
 * @see SearchIterator
 * @see RuleBasedCollator
 *
 * @author Laura Werner
 * @version 1.0
 */

char  ::StringSearch::fgClassID = 0; // Value is irrelevant       // class id
/* to be removed */
void ::StringSearch::dumpTables() {
    int i;
    for (i = 0; i < 256; i++) {
        if (shiftTable[i] != minLen) {
//            debug("shift[" + Integer.toString(i,16) + "] = " + shiftTable[i]);
        }
    }
    for (i = 0; i < 256; i++) {
        if (backShiftTable[i] != minLen) {
//            debug("backShift[" + Integer.toString(i,16) + "] = " + backShiftTable[i]);
        }
    }
}

::StringSearch::StringSearch(const UnicodeString& pat, 
                CharacterIterator* target,
                RuleBasedCollator* coll, 
                BreakIterator* breaker,
                UErrorCode& status) :
    SearchIterator(target, breaker),
    strength(coll->getStrength()),
    valueList(NULL),
    valueListLen(0),
    pattern(pat),
    normLen(0),        // num. of collation elements in pattern.
    minLen(0),         // Min of composed, decomposed versions
    maxLen(0),         // Max
    it(NULL)

{
    if (U_FAILURE(status)) return;
    collator = (RuleBasedCollator*)(coll->clone());
    iter = collator->createCollationElementIterator(*target);
    it = collator->createCollationElementIterator(pat);
     
    initialize(status);   // Initialize the Boyer-Moore tables
}

/**
 * Construct a <code>StringSearch</code> object using a specific collator.
 * <p>
 * @param pattern   The text for which this object will search.
 *
 * @param target    The text in which to search for the pattern.
 *
 * @param collator  A <code>RuleBasedCollator</code> object which defines the
 *                  language-sensitive comparison rules used to determine 
 *                  whether text in the pattern and target matches.
 */
::StringSearch::StringSearch(const UnicodeString& pat,
                 CharacterIterator* target,
                 RuleBasedCollator* collator,
                 UErrorCode& status) :
    SearchIterator(),
    strength(collator->getStrength()),
    valueList(NULL),
    valueListLen(0),
    pattern(pat),
    normLen(0),        // num. of collation elements in pattern.
    minLen(0),         // Min of composed, decomposed versions
    maxLen(0),          // Max
    it(NULL)
{
    if (U_FAILURE(status)) return;
    this->adoptTarget(target);
    this->collator = (RuleBasedCollator*)(collator->clone());
    this->iter = collator->createCollationElementIterator(*target);
    this->it = collator->createCollationElementIterator(pat);
    initialize(status);
}

/**
 * Construct a <code>StringSearch</code> object using the collator and
 * character boundary detection rules for a given locale
 * <p>
 * @param pattern   The text for which this object will search.
 *
 * @param target    The text in which to search for the pattern.
 *
 * @param loc       The locale whose collation and break-detection rules
 *                  should be used.
 *
 * @exception       ClassCastException thrown if the collator for the specified
 *                  locale is not a RuleBasedCollator.
 */
::StringSearch::StringSearch(const StringSearch& that) :
    SearchIterator(that),    
    iter(NULL),
    collator(that.collator),
    strength(that.strength),
    valueList(NULL),
    valueListLen(that.valueListLen),
    normLen(that.normLen),        // num. of collation elements in pattern.
    minLen(that.minLen),          // Min of composed, decomposed versions
    maxLen(that.maxLen),
    it(NULL)
{
    valueList = new int32_t[valueListLen];
    memcpy(valueList, that.valueList, valueListLen*sizeof(int32_t));    
    iter = that.collator->createCollationElementIterator(that.getTarget());
    it = that.collator->createCollationElementIterator(that.pattern);
}

::StringSearch::StringSearch(const UnicodeString& pat, 
                 CharacterIterator* target, 
                 const Locale& loc,
                 UErrorCode& status) :
    SearchIterator(),
    valueList(NULL),
    valueListLen(0),
    pattern(pat),
    normLen(0),        // num. of collation elements in pattern.
    minLen(0),         // Min of composed, decomposed versions
    maxLen(0)          // Max
{
    if (U_FAILURE(status)) return;
    this->adoptTarget(target);
    collator = (RuleBasedCollator*)Collator::createInstance(loc, status);
    iter = collator->createCollationElementIterator(*target);
    it = collator->createCollationElementIterator(pat);

    strength = collator->getStrength(); 

    initialize(status);
}

UBool
::StringSearch::operator==(const SearchIterator& that) const
{
    if (that.getDynamicClassID() != getDynamicClassID())
        return FALSE;
    if (!SearchIterator::operator==(that))
        return FALSE;
    const StringSearch& that2 = (const StringSearch&)that;
    if (*that2.iter != *iter) return FALSE;
    else if (*that2.collator != *collator) return FALSE;
    else if (that2.strength != strength) return FALSE;
    else if (that2.valueListLen != valueListLen) return FALSE;
    else if (memcmp(that2.valueList, valueList, valueListLen*sizeof(int32_t)) != 0) return FALSE;
    else if (that2.pattern != pattern) return FALSE;
    else if (that2.normLen != normLen) return FALSE;
    else if (that2.minLen != minLen) return FALSE;
    else if (that2.maxLen != maxLen) return FALSE;
    else return TRUE;
}

SearchIterator* 
::StringSearch::clone(void) const
{
    return new StringSearch(*this);
}

/**
 * Construct a <code>StringSearch</code> object using the collator for the default
 * locale
 * <p>
 * @param pattern   The text for which this object will search.
 *
 * @param target    The text in which to search for the pattern.
 *
 * @param collator  A <code>RuleBasedCollator</code> object which defines the
 *                  language-sensitive comparison rules used to determine 
 *                  whether text in the pattern and target matches.
 */
::StringSearch::StringSearch(const UnicodeString& pat, 
                 const UnicodeString& newText,
                 UErrorCode& status) :
    SearchIterator(),
    valueList(NULL),
    valueListLen(0),
    pattern(pat),
    normLen(0),        // num. of collation elements in pattern.
    minLen(0),         // Min of composed, decomposed versions
    maxLen(0)          // Max
{
    StringCharacterIterator *s = new StringCharacterIterator(newText);
    collator = (RuleBasedCollator*)Collator::createInstance(Locale::getDefault(), status);
    strength = collator->getStrength(); 
    iter = collator->createCollationElementIterator(newText);
    it = collator->createCollationElementIterator(pat);
    this->adoptTarget(s);
    initialize(status);
}

::StringSearch::~StringSearch(void)
{
    if (valueList != NULL) {
        delete [] valueList;
        valueList = 0;
    }
    if (iter != NULL) {
        delete iter;
        iter = 0;
    }
    if (collator != NULL) {
        delete collator;
        collator = 0;
    }
    if (it != NULL) {
        delete it;
        it = 0;
    }
}
//-------------------------------------------------------------------
// Getters and Setters
//-------------------------------------------------------------------

/**
 * Sets this object's strength property. The strength determines the
 * minimum level of difference considered significant during a
 * search.  Generally, {@link Collator#TERTIARY} and 
 * {@link Collator#IDENTICAL} indicate that all differences are
 * considered significant, {@link Collator#SECONDARY} indicates
 * that upper/lower case distinctions should be ignored, and
 * {@link Collator#PRIMARY} indicates that both case and accents
 * should be ignored.  However, the exact meanings of these constants
 * are determined by individual Collator objects.
 * <p>
 * @see Collator#PRIMARY
 * @see Collator#SECONDARY
 * @see Collator#TERTIARY
 * @see Collator#IDENTICAL
 */
void ::StringSearch::setStrength(Collator::ECollationStrength newStrength, UErrorCode& status) {
    if (U_FAILURE(status))
    {
        return;
    }
    strength = newStrength;
    
    // Due to a bug (?) in CollationElementIterator, we must set the
    // collator's strength as well, since the iterator is going to
    // mask out the portions of the collation element that are not
    // relevant for the collator's current strength setting
    // Note that this makes it impossible to share a Collator among
    // multiple StringSearch objects if you adjust Strength settings.
    collator->setStrength(strength);
    initialize(status);
}



/**
 * Set the collator to be used for this string search.  Also changes
 * the search strength to match that of the new collator.
 * <p>
 * This method causes internal data such as Boyer-Moore shift tables
 * to be recalculated, but the iterator's position is unchanged.
 * <p>
 * @see #getCollator
 */
void ::StringSearch::setCollator(const RuleBasedCollator *coll, UErrorCode& status) 
{
    delete iter;
    delete collator;
    collator = (RuleBasedCollator*)coll->clone();
    strength = collator->getStrength();
    // Also need to recompute the pattern and get a new target iterator
    iter = collator->createCollationElementIterator(getTarget());
    initialize(status);
}

/**
 * Return the RuleBasedCollator being used for this string search.
 */
const RuleBasedCollator& ::StringSearch::getCollator(void) const 
{
    return *collator;
}

/**
 * Set the pattern for which to search.  
 * This method causes internal data such as Boyer-Moore shift tables
 * to be recalculated, but the iterator's position is unchanged.
 */
void ::StringSearch::setPattern(const UnicodeString& pat, UErrorCode& status) 
{
    pattern = pat;
    initialize(status);
}

/**
 * Returns the pattern for which this object is searching.
 */
const UnicodeString& ::StringSearch::getPattern() const
{
    return pattern;
}

/**
 * Set the target text which should be searched and resets the
 * iterator's position to point before the start of the new text.
 * This method is useful if you want to re-use an iterator to
 * search for the same pattern within a different body of text.
 */
void ::StringSearch::adoptTarget(CharacterIterator* target) 
{
    UErrorCode status = U_ZERO_ERROR;
    SearchIterator::adoptTarget(target);
    
    // fix me: Skipped the error code
    // Since we're caching a CollationElementIterator, recreate it
    iter->setText(*target, status);
}
void ::StringSearch::setTarget(const UnicodeString& newText) 
{
    UErrorCode status = U_ZERO_ERROR;
    SearchIterator::setTarget(newText);
    // Since we're caching a CollationElementIterator, recreate it
    iter->setText(newText, status);
}

void ::StringSearch::reset(void)
{
    SearchIterator::reset();
    iter->reset();
}//-------------------------------------------------------------------
// Privates
//-------------------------------------------------------------------

/**
 * Search forward for matching text, starting at a given location.
 * Clients should not call this method directly; instead they should call
 * {@link SearchIterator#next}.
 * <p>
 * If a match is found, this method returns the index at which the match
 * starts and calls {@link SearchIterator#setMatchLength}
 * with the number of characters in the target
 * text that make up the match.  If no match is found, the method returns
 * <code>DONE</code> and does not call <tt>setMatchLength</tt>.
 * <p>
 * @param start The index in the target text at which the search starts.
 *
 * @return      The index at which the matched text in the target starts, or DONE
 *              if no match was found.
 * <p>
 * @see SearchIterator#next
 * @see SearchIterator#DONE
 */
int32_t ::StringSearch::handleNext(int32_t start, UErrorCode& status)
{
    if (U_FAILURE(status)) 
    { 
        return SearchIterator::DONE; 
    }
    const CharacterIterator& target = getTarget();
    
    int mask = getMask(strength);
    
#if 0
    int done = CollationElementIterator::NULLORDER & mask;
    if (DEBUG) {
        debug("-------------------------handleNext-----------------------------------");
        debug("");
        debug("strength=" + strength + ", mask=" + Integer.toString(mask,16)
            + ", done=" + Integer.toString(done,16));
        debug("decomp=" + collator.getDecomposition());
        
        debug("target.begin=" + getTarget().getBeginIndex());
        debug("target.end=" + getTarget().getEndIndex());
        debug("start = " + start);
    }
#endif
    int32_t index = start + minLen;
    int32_t matchEnd = 0;

    while (index <= target.endIndex())
    {
        int32_t patIndex = normLen;
        int32_t tval = 0, pval = 0;
        UBool getP = TRUE;

        iter->setOffset(index, status);
        matchEnd = index;
        
        //if (DEBUG) debug(" outer loop: patIndex=" + patIndex + ", index=" + index);
        
        while ((patIndex > 0 || getP == false) && iter->getOffset() > start)
        {
#if 0
            if (DEBUG) {
                debug("  inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset());
                debug("   getP=" + getP);
            }
#endif
            
            // Get the previous character in both the pattern and the target
            tval = iter->previous(status) & mask;
            if (U_FAILURE(status)) 
            {
                return SearchIterator::DONE;
            }
            
            if (getP) pval = valueList[--patIndex];
            getP = TRUE;
            
            // (DEBUG) debug("   pval=" + Integer.toString(pval,16) + ", tval=" + Integer.toString(tval,16));
            
            if (tval == 0) {       // skip tval, use same pval
                // (DEBUG) debug("   tval is ignorable");
                getP = FALSE;
            }
            else if (pval != tval) {    // Mismatch, skip ahead
                // (DEBUG) debug("   mismatch: skippping " + getShift(tval, patIndex));
                
                index += getShift(tval, patIndex);
                break;
            }
            else if (patIndex == 0) {
                // The values matched, and we're at the beginning of the pattern,
                // which means we matched the whole thing.
                start = iter->getOffset();
                setMatchLength(matchEnd - start);
                // if (DEBUG) debug("Found match at index "+ start );
                return start;
            }
        }
#if 0
        if (DEBUG) debug(" end of inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset());
        if (DEBUG) debug("   getP=" + getP);
#endif   
        if (iter->getOffset() <= start) {
            // We hit the beginning of the text being searched, which is
            // possible if it contains lots of ignorable characters.
            // Advance one character and try again.
            // if (DEBUG) debug("hit beginning of target; advance by one");
            index++;
        }
    }
    // if (DEBUG) debug("Fell off end of outer loop; returning DONE");
    return SearchIterator::DONE;
}

/**
 * Search backward for matching text ,starting at a given location.
 * Clients should not call this method directly; instead they should call
 * <code>SearchIterator.previous()</code>, which this method overrides.
 * <p>
 * If a match is found, this method returns the index at which the match
 * starts and calls {@link SearchIterator#setMatchLength}
 * with the number of characters in the target
 * text that make up the match.  If no match is found, the method returns
 * <code>DONE</code> and does not call <tt>setMatchLength</tt>.
 * <p>
 * @param start The index in the target text at which the search starts.
 *
 * @return      The index at which the matched text in the target starts, or DONE
 *              if no match was found.
 * <p>
 * @see SearchIterator#previous
 * @see SearchIterator#DONE
 */
int32_t ::StringSearch::handlePrev(int32_t start, UErrorCode& status)
{
    if (U_FAILURE(status))
    {
        return SearchIterator::DONE;
    }
    int patLen = normLen;
    int index = start - minLen;

    int mask = getMask(strength);
    int done = CollationElementIterator::NULLORDER & mask;
#if 0
    if (DEBUG) {
        debug("-------------------------handlePrev-----------------------------------");
        debug("");
        debug("strength=" + strength + ", mask=" + Integer.toString(mask,16)
            + ", done=" + Integer.toString(done,16));
        debug("decomp=" + collator.getDecomposition());
        
        debug("target.begin=" + getTarget().getBeginIndex());
        debug("target.end=" + getTarget().getEndIndex());
    }
#endif
    
    while (index >= 0) {
        int patIndex = 0;
        int tval = 0, pval = 0;
        UBool getP = TRUE;

        iter->setOffset(index, status);
        if (U_FAILURE(status))
        {
            return SearchIterator::DONE;
        }


        // if (DEBUG) debug(" outer loop: patIndex=" + patIndex + ", index=" + index);
        
        while ((patIndex < patLen || !getP) && iter->getOffset() < start)
        {
        /*    if (DEBUG) {
                debug("  inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset());
            }
            */
            tval = iter->next(status) & mask;
            if (U_FAILURE(status))
            {
                return SearchIterator::DONE;
            }
            if (getP) pval = valueList[patIndex++];
            getP = TRUE;

            //if (DEBUG) debug("   pval=" + Integer.toString(pval,16) + ", tval=" + Integer.toString(tval,16));

            if (tval == done) {
              //  if (DEBUG) debug("   end of target; no match");
                return DONE;
            }
            else if (tval == 0) {
                // if (DEBUG) debug("   tval is ignorable");
                getP = false;
            }
            else if (pval != tval) {
                // We didn't match this pattern.  Skip ahead
                // if (DEBUG) debug("   mismatch: skippping " + getBackShift(tval, patIndex));
                
                int shift = getBackShift(tval, patIndex);
                index -= shift;
                break;
            }
            else if (patIndex == patLen) {
                // The elements matched and we're at the end of the pattern,
                // which means we matched the whole thing.
                setMatchLength(iter->getOffset() - index);
                return index;
            }
        }
        if (iter->getOffset() >= start) {
            // We hit the end of the text being searched, which is
            // possible if it contains lots of ignorable characters.
            // Back up one character and try again.
            // if (DEBUG) debug("hit end of target; back by one");
            index--;
        }
    }
    return SearchIterator::DONE;
}

/**
 * Return a bitmask that will select only the portions of a collation 
 * element that are significant at the given strength level.
 */
int32_t ::StringSearch::getMask(Collator::ECollationStrength strength)  
{
    switch (strength) {
    case Collator::PRIMARY:
        return 0xFFFF0000;
    case Collator::SECONDARY:
        return 0xFFFFFF00;
    default:
        return 0xFFFFFFFF;
    }
}


void ::StringSearch::initialize(UErrorCode& status) {
    /*
    if (DEBUG)  {
        debug("-------------------------initialize-----------------------------------");
        debug("pattern=" + pattern);
    }
    */
    it->setText(pattern, status);
    if (U_FAILURE(status)) {
        delete it;
        return;
    }

    int mask = getMask(strength);

    // See how many non-ignorable collation keys are in the text
    normLen = 0;
    int32_t elem;
    while ((elem = it->next(status)) != CollationElementIterator::NULLORDER)
    {
        if (U_FAILURE(status)) {
            return;
        }
        if ((elem & mask) != 0) {
            normLen++;
        }
    }

    if (valueList != NULL) {
        delete [] valueList;
    }

    // Save them all
    valueList = new int32_t[normLen];
    int expandLen = 0;
    it->reset();
    
    int32_t i;
    for (i = 0; i < normLen; i++)
    {
        elem = it->next(status);
        if (U_FAILURE(status)) {
            return;
        }

        if ((elem & mask) != 0) {
            valueList[i] = elem & mask;
            
        }
        // Keep track of whether there are any expanding-character
        // sequences that can result in one of the characters that's in
        // the pattern.  If there are, we have to reduce the shift
        // distances calculated below to account for it.
        expandLen += it->getMaxExpansion(elem) - 1;
    }

    //
    // We need to remember the size of the composed and decomposed
    // versions of the string.  Standard Boyer-Moore shift calculations
    // can be wrong by an amount up to that difference, since a small
    // small number of characters in the pattern can map to a larger
    // number in the text being searched, or vice-versa.
    //
    int uniLen = pattern.length();
    maxLen = uprv_max(normLen, uniLen);
    minLen = uprv_min(normLen, uniLen) - expandLen; 


    /*
    if (DEBUG) debug("normLen=" + normLen + ", expandLen=" + expandLen
                    + ", maxLen=" + maxLen + ", minLen=" + minLen);
    */
    // Now initialize the shift tables
    //
    // NOTE: This is the most conservative way to build them.  If we had a way
    // of knowing that there were no expanding/contracting chars in the rules,
    // we could get rid of the "- 1" in the shiftTable calculations.
    // But all of the default collators have at least one expansion or
    // contraction, so it probably doesn't matter anyway.
    //
    for (i = 0; i < 256; i++) {
        shiftTable[i] = backShiftTable[i] = minLen;
    }

    for (i = 0; i < normLen-1; i++) {
        shiftTable[hash(valueList[i])] = uprv_max(minLen - i - 1, 1);
    }
    shiftTable[hash(valueList[normLen-1])] = 1;
    
    for (i = normLen - 1; i > 0; i--) {
        backShiftTable[hash(valueList[i])] = i;
    }
    backShiftTable[hash(valueList[0])] = 1;
    
    /* dumpTables(); */
}

/**
 * Method used by StringSearch to determine how far to the right to
 * shift the pattern during a Boyer-Moore search.  
 *
 * @param curValue  The current value in the target text
 * @param curIndex  The index in the pattern at which we failed to match
 *                  curValue in the target text.
 */
int32_t ::StringSearch::getShift( int32_t curValue, int32_t curIndex ) const
{
    int32_t shiftAmt = shiftTable[hash(curValue)];

    if (minLen != maxLen) {
        int adjust = normLen - curIndex;
        if (shiftAmt > adjust + 1) {
//            if (DEBUG) debug("getShift: adjusting by " + adjust);
            shiftAmt -= adjust;
        }
    }
    return shiftAmt;
}

/**
 * Method used by StringSearch to determine how far to the left to
 * shift the pattern during a reverse Boyer-Moore search.  
 *
 * @param curValue  The current value in the target text
 * @param curIndex  The index in the pattern at which we failed to match
 *                  curValue in the target text.
 */
int32_t ::StringSearch::getBackShift( int32_t curValue, int32_t curIndex ) const 
{
    int shiftAmt = backShiftTable[hash(curValue)];

    if (minLen != maxLen) {
        int adjust = normLen - (minLen - curIndex);
        if (shiftAmt > adjust + 1) {
            // if (DEBUG) debug("getBackShift: adjusting by " + adjust);
            shiftAmt -= adjust;
        }
    }
    return shiftAmt;
}

/**
 * Hash a collation element from its full size (32 bits) down into a
 * value that can be used as an index into the shift tables.  Right
 * now we do a modulus by the size of the hash table.
 *
 * TODO: At some point I should experiment to see whether a slightly
 * more complicated hash function gives us a better distribution
 * on multilingual text.  I doubt it will have much effect on
 * performance, though.
 */
int32_t ::StringSearch::hash(int32_t order) 
{
    return CollationElementIterator::primaryOrder(order) % 256;
}