File: term_break_iterator.cc

package info (click to toggle)
chromium 138.0.7204.183-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 6,071,908 kB
  • sloc: cpp: 34,937,088; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,953; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,806; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (83 lines) | stat: -rw-r--r-- 2,432 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
// Copyright 2019 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/40285824): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

#include "chromeos/ash/components/string_matching/term_break_iterator.h"

#include <ostream>

#include "base/check.h"
#include "base/i18n/char_iterator.h"
#include "base/logging.h"
#include "base/notreached.h"
#include "base/strings/string_util.h"
#include "third_party/icu/source/common/unicode/uchar.h"

namespace ash::string_matching {

TermBreakIterator::TermBreakIterator(const std::u16string& word)
    : word_(word),
      prev_(npos),
      pos_(0),
      iter_(std::make_unique<base::i18n::UTF16CharIterator>(word)),
      state_(STATE_START) {}

// Out-of-line defaulted destructor. Presumably defined in this file so that
// the header does not need the complete type owned by `iter_` -- confirm
// against the header.
TermBreakIterator::~TermBreakIterator() = default;

// Moves the iterator past the next term of the word.
//
// On return, `prev_` holds the previous value of `pos_` and `pos_` holds the
// underlying character iterator's array position, so [prev_, pos_) delimits
// the term that GetCurrentTerm() returns.
//
// Returns true if the position moved or the underlying iterator has not yet
// reached its end; returns false once there is nothing left to consume.
bool TermBreakIterator::Advance() {
  // 2D matrix that defines term boundaries. Each row represents the current
  // state and each column the new state derived from the input char. A cell
  // holding true represents a term boundary.
  //
  // The table is immutable and identical for every call, so it is declared
  // `static constexpr` instead of being a local array that is re-initialized
  // on each invocation.
  //
  // NOTE(review): every cell of the START row is false, and the loop below
  // classifies the code unit at the position reached *after*
  // iter_->Advance(). If the iterator starts at index 0, the first code
  // unit's own class is never computed, so no boundary can be reported
  // directly after it (e.g. "aB" would stay a single term) -- confirm this
  // is intended.
  static constexpr bool kBoundary[][STATE_LAST] = {
      // START  NUMBER UPPER  LOWER  CHAR
      {false, false, false, false, false},  // START
      {false, false, true, true, true},     // NUMBER
      {false, true, false, false, true},    // UPPER
      {false, true, true, false, true},     // LOWER
      {false, true, true, true, false},     // CHAR
  };

  // Keep consuming until the class of the code unit at the new position
  // forms a boundary with the previous state, or the input is exhausted.
  while (iter_->Advance()) {
    const State new_state = GetNewState((*word_)[iter_->array_pos()]);
    const bool is_boundary = kBoundary[state_][new_state];
    // The state is updated even when a boundary is hit so that the next call
    // continues from the class of the code unit that started the new term.
    state_ = new_state;
    if (is_boundary) {
      break;
    }
  }

  prev_ = pos_;
  pos_ = iter_->array_pos();

  return prev_ != pos_ || !iter_->end();
}

// Returns a copy of the term delimited by the last Advance() call, i.e. the
// code units of the word in the half-open range [prev_, pos_).
//
// Debug builds check that neither bound is `npos`; `prev_` is `npos` until
// Advance() has been called at least once.
const std::u16string TermBreakIterator::GetCurrentTerm() const {
  DCHECK(prev_ != npos && pos_ != npos);
  const auto term_length = pos_ - prev_;
  return (*word_).substr(prev_, term_length);
}

// Classifies a single UTF-16 code unit into the state used by the boundary
// table in Advance():
//   - ASCII digits, '.' and ','       -> STATE_NUMBER
//   - ICU "Uppercase" property is set -> STATE_UPPER
//   - ICU "Lowercase" property is set -> STATE_LOWER
//   - anything else                   -> STATE_CHAR
//
// '.' and ',' are grouped with digits, presumably so that formatted numbers
// such as version strings stay within one term -- confirm with the tests.
TermBreakIterator::State TermBreakIterator::GetNewState(char16_t ch) {
  const bool number_like = base::IsAsciiDigit(ch) || ch == '.' || ch == ',';
  if (number_like) {
    return STATE_NUMBER;
  }

  const bool has_upper_property = u_isUUppercase(ch) != 0;
  const bool has_lower_property = u_isULowercase(ch) != 0;

  // A code unit is not expected to carry both case properties at once.
  if (has_upper_property && has_lower_property) {
    NOTREACHED() << "Invalid state for ch=" << std::u16string(1, ch);
  }

  if (has_upper_property) {
    return STATE_UPPER;
  }
  return has_lower_property ? STATE_LOWER : STATE_CHAR;
}

}  // namespace ash::string_matching