File: tailored_word_break_iterator.h

package info (click to toggle)
chromium 138.0.7204.183-1~deb12u1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm-proposed-updates
  • size: 6,080,960 kB
  • sloc: cpp: 34,937,079; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,954; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,811; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (63 lines) | stat: -rw-r--r-- 2,509 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_OMNIBOX_BROWSER_TAILORED_WORD_BREAK_ITERATOR_H_
#define COMPONENTS_OMNIBOX_BROWSER_TAILORED_WORD_BREAK_ITERATOR_H_

#include <string>
#include <string_view>

#include "base/i18n/break_iterator.h"

// Word-break iterator that additionally breaks on underscores and numbers.
// Otherwise, it behaves like its parent class with
// `BreakIterator::BREAK_WORD`.
// E.g. 'Viktor Ambartsumian_is__anAwesome99_99Astrophysicist!!' is broken into:
// [Viktor, <space>, Ambartsumian, _, is, _, _, anAwesome, 99, _, 99,
//  Astrophysicist, !, !].
class TailoredWordBreakIterator : public base::i18n::BreakIterator {
 public:
  // Creates an iterator over `str`.
  // NOTE(review): `str` is a non-owning view; presumably the underlying
  // buffer must outlive this iterator -- confirm against the .cc file and the
  // parent class contract.
  explicit TailoredWordBreakIterator(std::u16string_view str);

  ~TailoredWordBreakIterator();

  // Not copyable.
  TailoredWordBreakIterator(const TailoredWordBreakIterator&) = delete;
  TailoredWordBreakIterator& operator=(const TailoredWordBreakIterator&) =
      delete;

  // Moves to the next token, taking the extra underscore/number breaks into
  // account.
  // NOTE(review): the return value presumably mirrors
  // `BreakIterator::Advance()` (see `AdvanceInSpecialWord()` below) -- confirm
  // in the .cc file.
  bool Advance();

  // Returns whether the current token counts as a word. Per the member notes
  // below, tokens made of `word_breaks_` chars (e.g. numbers) report true and
  // tokens made of `non_word_breaks_` chars (e.g. underscore) report false.
  bool IsWord() const;

  // Returns characters between `prev_` and `pos_` if `special_word_` is not
  // empty. Otherwise returns the normal `BreakIterator`-determined current
  // word.
  std::u16string_view GetString() const;

  // Boundaries of the current token.
  // NOTE(review): presumably the start (`prev()`) and end (`pos()`) offsets of
  // the token returned by `GetString()`; which string they are relative to is
  // not visible in this header -- confirm in the .cc file.
  size_t prev() const;
  size_t pos() const;

 private:
  // Returns true if processing a word with underscores or numbers (i.e.,
  // `pos_` points to a valid position in `special_word_`).
  bool HasSpecialWord() const;

  // Updates `prev_` and `pos_` considering underscores and numbers. Returns
  // true if it successfully advanced within `special_word_`; returns false if
  // it exhausts the word and should resume the main word traversal. This is
  // similar to the semantics of `BreakIterator::Advance()`.
  bool AdvanceInSpecialWord();

  // `prev_` and `pos_` are indices to `special_word_`. Declared one per line
  // and zero-initialized in-class so they are never left indeterminate; a
  // constructor mem-initializer, if present, still takes precedence.
  size_t prev_ = 0;
  size_t pos_ = 0;

  // Set if `BreakIterator::GetStringView()` contains '_' or numbers, otherwise
  // it's empty.
  std::u16string_view special_word_;

  // The additional chars to break on that aren't broken on by `BreakIterator`.
  // Subset of `all_breaks_` that returns true from `IsWord()` (e.g. numbers).
  const std::u16string word_breaks_;
  // Subset of `all_breaks_` that returns false from `IsWord()` (e.g.
  // underscore).
  const std::u16string non_word_breaks_;
  // Union of `word_breaks_` & `non_word_breaks_` (e.g. numbers & underscore).
  const std::u16string all_breaks_;
};

#endif  // COMPONENTS_OMNIBOX_BROWSER_TAILORED_WORD_BREAK_ITERATOR_H_