File: TextBreakIterator.h

package info (click to toggle)
chromium-browser 57.0.2987.98-1~deb8u1
links: PTS, VCS
area: main
in suites: jessie
size: 2,637,852 kB
ctags: 2,544,394
sloc: cpp: 12,815,961; ansic: 3,676,222; python: 1,147,112; asm: 526,608; java: 523,212; xml: 286,794; perl: 92,654; sh: 86,408; objc: 73,271; makefile: 27,698; cs: 18,487; yacc: 13,031; tcl: 12,957; pascal: 4,875; ml: 4,716; lex: 3,904; sql: 3,862; ruby: 1,982; lisp: 1,508; php: 1,368; exp: 404; awk: 325; csh: 117; jsp: 39; sed: 37
file content (282 lines) | stat: -rw-r--r-- 9,735 bytes
/*
 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this library; see the file COPYING.LIB.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 */

#ifndef TextBreakIterator_h
#define TextBreakIterator_h

#include "platform/PlatformExport.h"
#include "wtf/text/AtomicString.h"
#include "wtf/text/Unicode.h"

#include <unicode/brkiter.h>

namespace blink {

// Blink-side name for the ICU break iterator type used by every declaration in
// this header.
using TextBreakIterator = icu::BreakIterator;

// Note: The returned iterator is good only until you get another iterator, with
// the exception of acquireLineBreakIterator.

// Break iterator for cursor movement; use it for insertion point and selection
// manipulations.
// In most cases it behaves like the character break iterator, but it is
// subject to platform UI conventions. One notable example where the two can
// differ is Thai prepend characters, see bug 24342.
// Arguments: a UChar buffer and its length (presumably counted in UChars -
// confirm against the definition, which is not in this header).
PLATFORM_EXPORT TextBreakIterator* cursorMovementIterator(const UChar*,
                                                          int length);

// Word break iterators. Two overloads are declared:
//  - one taking a String plus |start| and |length| (presumably selecting a
//    sub-range of the string - confirm against the definition);
//  - one taking a raw UChar buffer and its length.
// As the note above these declarations says, the returned iterator is good
// only until another iterator is obtained.
PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const String&,
                                                     int start,
                                                     int length);
PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const UChar*, int length);
// Line break iterators, with one overload for LChar text and one for UChar
// text. Besides the text buffer and its length, both take the locale and a
// prior-context buffer with its length; LazyLineBreakIterator::get() passes a
// null |priorContext| when |priorContextLength| is 0.
// Unlike the other iterators declared here, these are the stated exception to
// the "good only until you get another iterator" rule.
PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(
    const LChar*,
    int length,
    const AtomicString& locale,
    const UChar* priorContext,
    unsigned priorContextLength);
PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(
    const UChar*,
    int length,
    const AtomicString& locale,
    const UChar* priorContext,
    unsigned priorContextLength);
// Counterpart of acquireLineBreakIterator: LazyLineBreakIterator passes every
// iterator it acquired to this function when it drops it.
PLATFORM_EXPORT void releaseLineBreakIterator(TextBreakIterator*);
// Sentence break iterator over a UChar buffer and its length (presumably
// counted in UChars - confirm against the definition). Per the note above
// these declarations, the result is good only until another iterator is
// obtained.
PLATFORM_EXPORT TextBreakIterator* sentenceBreakIterator(const UChar*,
                                                         int length);

// Predicate on a break iterator. NOTE(review): judging by the name only, this
// reports whether the iterator's current break is a word-type boundary; the
// definition is not in this header - confirm there.
PLATFORM_EXPORT bool isWordTextBreak(TextBreakIterator*);

// Sentinel position meaning "no further break". It has the same numeric value
// (-1) as icu::BreakIterator::DONE.
constexpr int TextBreakDone = -1;

// Selects which breaking rule LazyLineBreakIterator::isBreakable() applies.
// The non-default values correspond to the word-break values named below.
enum class LineBreakType {
  Normal,
  BreakAll,  // word-break:break-all allows breaks between letters/numbers
  KeepAll,   // word-break:keep-all doesn't allow breaks between any kind of
             // letters/numbers, except for some Southeast Asian ones.
};

// Holds a string and locale and acquires the matching line break
// TextBreakIterator only when get() is first called. It also stores up to two
// UChars of "prior context" (a last and a second-to-last character) that are
// handed to acquireLineBreakIterator along with the string.
//
// NOTE(review): the destructor releases m_iterator, but no copy constructor or
// copy assignment is declared here. A copy made while an iterator is held
// would presumably release it twice - confirm that no caller copies instances.
class PLATFORM_EXPORT LazyLineBreakIterator final {
  STACK_ALLOCATED();

 public:
  // Default-constructed string and locale, no iterator, zeroed prior context.
  LazyLineBreakIterator()
      : m_iterator(nullptr),
        m_cachedPriorContext(nullptr),
        m_cachedPriorContextLength(0) {
    resetPriorContext();
  }

  // Remembers |string| and |locale|. No iterator is acquired here; that is
  // deferred to get().
  LazyLineBreakIterator(String string,
                        const AtomicString& locale = AtomicString())
      : m_string(string),
        m_locale(locale),
        m_iterator(nullptr),
        m_cachedPriorContext(nullptr),
        m_cachedPriorContextLength(0) {
    resetPriorContext();
  }

  // Gives a still-held iterator back through releaseLineBreakIterator().
  ~LazyLineBreakIterator() {
    if (m_iterator)
      releaseLineBreakIterator(m_iterator);
  }

  // The string this object was constructed or last reset with.
  String getString() const { return m_string; }

  // Prior-context slot 1: the most recent character (0 when unset).
  UChar lastCharacter() const {
    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                  "TextBreakIterator has unexpected prior context length");
    return m_priorContext[1];
  }

  // Prior-context slot 0: the character before lastCharacter() (0 when unset).
  UChar secondToLastCharacter() const {
    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                  "TextBreakIterator has unexpected prior context length");
    return m_priorContext[0];
  }

  // Overwrites both prior-context slots. This does not touch an iterator that
  // has already been acquired.
  void setPriorContext(UChar last, UChar secondToLast) {
    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                  "TextBreakIterator has unexpected prior context length");
    m_priorContext[1] = last;
    m_priorContext[0] = secondToLast;
  }

  // Shifts the current last character into the second-to-last slot, then
  // stores |last| as the new last character.
  void updatePriorContext(UChar last) {
    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                  "TextBreakIterator has unexpected prior context length");
    m_priorContext[0] = m_priorContext[1];
    m_priorContext[1] = last;
  }

  // Zeroes both prior-context slots.
  void resetPriorContext() {
    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                  "TextBreakIterator has unexpected prior context length");
    m_priorContext[0] = m_priorContext[1] = 0;
  }

  // Number of usable prior-context characters: 0 when the last character is
  // unset, 1 when only the last character is set, 2 when both are set. A set
  // second-to-last character is ignored if the last one is unset.
  unsigned priorContextLength() const {
    static_assert(WTF_ARRAY_LENGTH(m_priorContext) == 2,
                  "TextBreakIterator has unexpected prior context length");
    if (!m_priorContext[1])
      return 0;
    return m_priorContext[0] ? 2 : 1;
  }

  // Returns the line break iterator for the stored string and locale,
  // acquiring it on demand. |priorContextLength| (at most
  // priorContextCapacity) selects how many trailing entries of the stored
  // prior context are passed along; 0 passes a null context.
  //
  // An iterator acquired earlier is reused only if it was acquired with the
  // same context pointer and length; otherwise it is released and a new one is
  // acquired. The comparison is by pointer and length, not by the characters
  // stored in the prior context.
  TextBreakIterator* get(unsigned priorContextLength) {
    ASSERT(priorContextLength <= priorContextCapacity);
    const UChar* context = nullptr;
    if (priorContextLength)
      context = &m_priorContext[priorContextCapacity - priorContextLength];

    const bool cachedIteratorIsStale =
        m_iterator && (context != m_cachedPriorContext ||
                       priorContextLength != m_cachedPriorContextLength);
    if (cachedIteratorIsStale) {
      // Drops the iterator (string and locale are re-stored unchanged) so the
      // acquisition below runs again with the new context.
      resetStringAndReleaseIterator(m_string, m_locale);
    }

    if (!m_iterator) {
      if (m_string.is8Bit()) {
        m_iterator = acquireLineBreakIterator(m_string.characters8(),
                                              m_string.length(), m_locale,
                                              context, priorContextLength);
      } else {
        m_iterator = acquireLineBreakIterator(m_string.characters16(),
                                              m_string.length(), m_locale,
                                              context, priorContextLength);
      }
      m_cachedPriorContext = context;
      m_cachedPriorContextLength = priorContextLength;
    }
    return m_iterator;
  }

  // Releases any held iterator, stores the new |string| and |locale|, and
  // clears the cached context bookkeeping. The prior-context characters
  // themselves are left as they are.
  void resetStringAndReleaseIterator(String string,
                                     const AtomicString& locale) {
    if (m_iterator)
      releaseLineBreakIterator(m_iterator);

    m_string = string;
    m_locale = locale;
    m_iterator = nullptr;
    m_cachedPriorContext = nullptr;
    m_cachedPriorContextLength = 0;
  }

  // Returns whether |pos| equals |nextBreakable|. |nextBreakable| is an in/out
  // value owned by the caller: when |pos| has moved past it, it is first
  // recomputed by the nextBreakablePosition* helper matching |lineBreakType|.
  inline bool isBreakable(int pos,
                          int& nextBreakable,
                          LineBreakType lineBreakType = LineBreakType::Normal) {
    if (pos > nextBreakable) {
      if (lineBreakType == LineBreakType::BreakAll)
        nextBreakable = nextBreakablePositionBreakAll(pos);
      else if (lineBreakType == LineBreakType::KeepAll)
        nextBreakable = nextBreakablePositionKeepAll(pos);
      else
        nextBreakable = nextBreakablePositionIgnoringNBSP(pos);
    }
    return pos == nextBreakable;
  }

 private:
  // One helper per LineBreakType; defined outside this header.
  int nextBreakablePositionIgnoringNBSP(int pos);
  int nextBreakablePositionBreakAll(int pos);
  int nextBreakablePositionKeepAll(int pos);

  static const unsigned priorContextCapacity = 2;
  String m_string;
  AtomicString m_locale;
  // Null until get() acquires it; released in the destructor or on reset.
  TextBreakIterator* m_iterator;
  // [0] is the second-to-last character, [1] the last; 0 means "unset".
  UChar m_priorContext[priorContextCapacity];
  // Context pointer and length that m_iterator was acquired with.
  const UChar* m_cachedPriorContext;
  unsigned m_cachedPriorContextLength;
};

// Character break iterator over "extended grapheme clusters" as defined in
// UAX #29. Platform implementations may be less sophisticated - e.g. ICU prior
// to version 4.0 only supports "legacy grapheme clusters". Use this for
// general text processing, e.g. string truncation.
//
// 8 bit text is iterated by this class itself, treating CR LF as the only
// multi-character cluster; 16 bit text is delegated to a TextBreakIterator.
class PLATFORM_EXPORT NonSharedCharacterBreakIterator final {
  STACK_ALLOCATED();
  WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator);

 public:
  explicit NonSharedCharacterBreakIterator(const String&);
  NonSharedCharacterBreakIterator(const UChar*, unsigned length);
  ~NonSharedCharacterBreakIterator();

  int next();
  int current();

  bool isBreak(int offset) const;
  int preceding(int offset) const;
  int following(int offset) const;

  // True when the text is not 8 bit and no TextBreakIterator is held either.
  bool operator!() const { return !(m_is8Bit || m_iterator); }

 private:
  void createIteratorForBuffer(const UChar*, unsigned length);

  // 8 bit only. Length of the cluster that begins at |offset|: 2 for a CR
  // immediately followed by LF, otherwise 1.
  unsigned clusterLengthStartingAt(unsigned offset) const {
    ASSERT(m_is8Bit);
    // The only Latin-1 Extended Grapheme Cluster is CR LF
    if (isCRBeforeLF(offset))
      return 2;
    return 1;
  }

  // 8 bit only. True when the character at |offset| is CR and the next
  // character exists and is LF.
  // NOTE(review): m_charaters8[offset] is read before any comparison with
  // m_length, so callers are presumably expected to pass offset < m_length.
  bool isCRBeforeLF(unsigned offset) const {
    ASSERT(m_is8Bit);
    if (m_charaters8[offset] != '\r')
      return false;
    const unsigned following = offset + 1;
    if (following >= m_length)
      return false;
    return m_charaters8[following] == '\n';
  }

  // 8 bit only. True when the character at |offset| is LF and it is preceded
  // by CR.
  // NOTE(review): m_charaters8[offset] is read without a check against
  // m_length, so callers are presumably expected to pass offset < m_length.
  bool isLFAfterCR(unsigned offset) const {
    ASSERT(m_is8Bit);
    if (m_charaters8[offset] != '\n')
      return false;
    return offset >= 1 && m_charaters8[offset - 1] == '\r';
  }

  bool m_is8Bit;

  // For 8 bit strings, we implement the iterator ourselves.
  const LChar* m_charaters8;
  unsigned m_offset;
  unsigned m_length;

  // For 16 bit strings, we use a TextBreakIterator.
  TextBreakIterator* m_iterator;
};

// Counts the number of grapheme clusters in the given string. A surrogate
// pair, or a non-combining character together with the combining characters
// that follow it, is counted as 1 grapheme cluster.
// Declaration only; the definition is not part of this header.
PLATFORM_EXPORT unsigned numGraphemeClusters(const String&);

}  // namespace blink

#endif