File: chrome/renderer/accessibility/phrase_segmentation/tokenizer_unittest.cc

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/accessibility/phrase_segmentation/tokenizer.h"

#include <string>
#include <utility>
#include <vector>

#include "testing/gtest/include/gtest/gtest.h"

namespace {

class PhraseSegmentationTokenizerTest : public testing::Test {
 protected:
  PhraseSegmentationTokenizerTest() = default;
};

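// Tokens are (start, end) offset pairs into the input string, with the end
// offset exclusive. In a plain sentence, each word is one token, and the
// hyphen and the trailing period come out as separate one-character tokens.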
TEST_F(PhraseSegmentationTokenizerTest, TokenizeSentence) {
  const std::u16string input_string =
      u"They were described by the neighbors as a quiet middle-aged couple.";
  //    0123456789012345678901234567890123456789012345678901234567890123456

  std::vector<std::pair<int, int>> tokens{
      std::make_pair(0, 4),   std::make_pair(5, 9),   std::make_pair(10, 19),
      std::make_pair(20, 22), std::make_pair(23, 26), std::make_pair(27, 36),
      std::make_pair(37, 39), std::make_pair(40, 41), std::make_pair(42, 47),
      std::make_pair(48, 54), std::make_pair(54, 55), std::make_pair(55, 59),
      std::make_pair(60, 66), std::make_pair(66, 67)};
  Tokenizer tokenizer;
  EXPECT_EQ(tokenizer.Tokenize(input_string), tokens);
}

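// An empty input produces an empty token list.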
TEST_F(PhraseSegmentationTokenizerTest, TokenizeEmpty) {
  const std::u16string input_string = u"";
  std::vector<std::pair<int, int>> tokens{};
  Tokenizer tokenizer;
  EXPECT_EQ(tokenizer.Tokenize(input_string), tokens);
}

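// Runs of consecutive spaces are skipped entirely; they only shift the
// offsets of the surrounding tokens.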
TEST_F(PhraseSegmentationTokenizerTest, TokenizeIrregularSpace) {
  const std::u16string input_string =
      u"They  were   described by the neighbors as a quiet middle-aged couple.";
  //    0123456789012345678901234567890123456789012345678901234567890123456789

  std::vector<std::pair<int, int>> tokens{
      std::make_pair(0, 4),   std::make_pair(6, 10),  std::make_pair(13, 22),
      std::make_pair(23, 25), std::make_pair(26, 29), std::make_pair(30, 39),
      std::make_pair(40, 42), std::make_pair(43, 44), std::make_pair(45, 50),
      std::make_pair(51, 57), std::make_pair(57, 58), std::make_pair(58, 62),
      std::make_pair(63, 69), std::make_pair(69, 70)};

  Tokenizer tokenizer;
  EXPECT_EQ(tokenizer.Tokenize(input_string), tokens);
}

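// Parentheses and commas become their own single-character tokens, even when
// directly attached to a word.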
TEST_F(PhraseSegmentationTokenizerTest, TokenizePunctuations) {
  const std::u16string input_string =
      u"They were described (by the neighbors) as a quiet, middle-aged couple.";
  //    0123456789012345678901234567890123456789012345678901234567890123456789
  std::vector<std::pair<int, int>> tokens{
      std::make_pair(0, 4),   std::make_pair(5, 9),   std::make_pair(10, 19),
      std::make_pair(20, 21), std::make_pair(21, 23), std::make_pair(24, 27),
      std::make_pair(28, 37), std::make_pair(37, 38), std::make_pair(39, 41),
      std::make_pair(42, 43), std::make_pair(44, 49), std::make_pair(49, 50),
      std::make_pair(51, 57), std::make_pair(57, 58), std::make_pair(58, 62),
      std::make_pair(63, 69), std::make_pair(69, 70)};

  Tokenizer tokenizer;
  EXPECT_EQ(tokenizer.Tokenize(input_string), tokens);
}

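// Word-internal apostrophes do not split a token: possessives ("David's")
// and contractions ("can't") stay whole, while adjacent punctuation is still
// separated.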
TEST_F(PhraseSegmentationTokenizerTest, TokenizeApostrophes) {
  const std::u16string input_string =
      u"David's father can't, won't, and didn't care.";
  //    0123456789012345678901234567890123456789012345
  std::vector<std::pair<int, int>> tokens{
      std::make_pair(0, 7),   std::make_pair(8, 14),  std::make_pair(15, 20),
      std::make_pair(20, 21), std::make_pair(22, 27), std::make_pair(27, 28),
      std::make_pair(29, 32), std::make_pair(33, 39), std::make_pair(40, 44),
      std::make_pair(44, 45)};

  Tokenizer tokenizer;
  EXPECT_EQ(tokenizer.Tokenize(input_string), tokens);
}

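// Accented characters are treated as ordinary word characters; offsets index
// UTF-16 code units of the std::u16string input.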
TEST_F(PhraseSegmentationTokenizerTest, TokenizeNonAscii) {
  const std::u16string input_string =
      u"Ce film est très intéressant : c'est un classique.";
  //    0123456789012345678901234567890123456789012345678901234567890123456789
  std::vector<std::pair<int, int>> tokens{
      std::make_pair(0, 2),   std::make_pair(3, 7),   std::make_pair(8, 11),
      std::make_pair(12, 16), std::make_pair(17, 28), std::make_pair(29, 30),
      std::make_pair(31, 36), std::make_pair(37, 39), std::make_pair(40, 49),
      std::make_pair(49, 50)};

  Tokenizer tokenizer;
  EXPECT_EQ(tokenizer.Tokenize(input_string), tokens);
}

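// The degree sign is split from the letter that follows it, but a digit
// embedded in a word ("H2O") does not break the token.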
TEST_F(PhraseSegmentationTokenizerTest, TokenizeNumbers) {
  const std::u16string input_string = u"2 °C is 35 °F for H2O.";
  //                                    01234567890123456789012
  std::vector<std::pair<int, int>> tokens{
      std::make_pair(0, 1),   std::make_pair(2, 3),   std::make_pair(3, 4),
      std::make_pair(5, 7),   std::make_pair(8, 10),  std::make_pair(11, 12),
      std::make_pair(12, 13), std::make_pair(14, 17), std::make_pair(18, 21),
      std::make_pair(21, 22)};

  Tokenizer tokenizer;
  EXPECT_EQ(tokenizer.Tokenize(input_string), tokens);
}

}  // namespace