File: phrase_segmenter.cc

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/accessibility/phrase_segmentation/phrase_segmenter.h"

#include <deque>
#include <tuple>
#include <vector>

#include "chrome/renderer/accessibility/phrase_segmentation/base_phrase_segmenter.h"
#include "chrome/renderer/accessibility/phrase_segmentation/token_boundaries.h"
#include "chrome/renderer/accessibility/phrase_segmentation/tokenized_sentence.h"

// Constants used in the phrase-splitting algorithm. These were chosen by
// performing a grid search over several candidate values and evaluating
// performance metrics on the GUM verified-1000 dataset.

// If all boundary weights of a phrase are less than this value, we
// don't break it further, even if the number of words/characters
// exceeds the threshold.
const float kMinThresholdForBreaking = 1.0f;

// Conversely, if any of the boundary weights of a phrase are more than
// this value, we break it further, even if the number of
// words/characters already meets the threshold.
const float kMaxThresholdForBreaking = 5.0f;

// Returns a vector of the given length whose values ramp up to a peak at the
// center and then ramp back down, forming a triangular window.
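// For example, Triangle(5) returns {0, 0.2, 0.4, 0.2, 0}: zero at the edges
// and a peak at the center.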
static std::vector<float> Triangle(int length) {
  std::vector<float> triangle(length);
  int index = 0;
  // Ascending half: values 0/length up to (length/2 - 1)/length.
  for (int i = 0; i < length / 2; i++) {
    triangle[index++] = static_cast<float>(i) / length;
  }
  // Descending half; for odd lengths this also fills the center element.
  for (int i = length - length / 2 - 1; i >= 0; i--) {
    triangle[index++] = static_cast<float>(i) / length;
  }
  return triangle;
}

// Checks whether a candidate phrase meets the phrase qualification criterion.
// Given a sentence fragment (indicated by start and end token indices in a
// tokenized sentence) and a qualification strategy (i.e., a maximum number of
// words or characters in the phrase), returns whether the fragment qualifies
// as a phrase.
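// For example, with Strategy::kWords and a threshold of 5, a fragment of four
// words qualifies while a fragment of six words does not. Under
// Strategy::kCharacters, a single-word fragment always qualifies, however
// long, since it cannot be split further.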
static bool QualifiesAsPhrase(const TokenizedSentence& tokenized_sentence,
                              int start,
                              int end,
                              Strategy strategy,
                              int threshold) {
  bool qualifies_as_phrase = false;
  int n_words = tokenized_sentence.WordsBetween(start, end);
  int n_characters = tokenized_sentence.CharactersBetween(start, end);
  switch (strategy) {
    case Strategy::kWords:
      qualifies_as_phrase = n_words <= threshold;
      break;
    case Strategy::kCharacters:
      // A single word always qualifies, since it cannot be split further.
      qualifies_as_phrase = (n_words == 1) || (n_characters <= threshold);
      break;
  }
  return qualifies_as_phrase;
}

std::vector<float> PhraseSegmenter::CalculatePhraseBoundariesVector(
    const TokenizedSentence& tokenized_sentence,
    const TokenBoundaries& token_boundaries,
    const Strategy strategy,
    const int threshold) {
  std::vector<float> break_likelihood(
      token_boundaries.boundary_weights().size());

  // Initialize candidates to a single phrase with the entire sentence.
  // The tuple consists of start, end, and recursion depth.
  std::deque<std::tuple<int, int, int>> candidate_phrases;
  candidate_phrases.emplace_back(0, token_boundaries.boundary_weights().size(),
                                 1);

  // This is a modified BFS that iteratively breaks longer phrases until all
  // phrases meet the qualification criteria defined by the strategy and
  // threshold.
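  // Each dequeued range is either accepted as a final phrase or split at its
  // strongest interior boundary, with the two resulting sub-ranges re-enqueued
  // for further consideration.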
  while (!candidate_phrases.empty()) {
    auto [start, end, depth] = candidate_phrases.front();
    candidate_phrases.pop_front();

    if (start == end) {
      // A single-word phrase cannot be split further.
      continue;
    }

    // Calculate the boundary weights in the range and track the index of the
    // maximum. weights[0] is left at zero, since breaking at `start` would
    // produce an empty left-hand phrase.
    std::vector<float> weights(end - start);
    unsigned int index_max = 0;
    for (int i = start + 1; i < end; i++) {
      int index = i - start;
      weights[index] = token_boundaries.boundary_weights()[i];
      if (weights[index] > weights[index_max]) {
        index_max = index;
      }
    }

    // Check if the phrase meets the condition set by strategy and thresholds.

    // Even if the phrase doesn't meet the qualification criteria, if the
    // weights are all low (so no obvious break point), don't break it further.
    // Note that we do this only for words-based breaking, based on an
    // assumption that character-based breaking might be stricter with the
    // threshold (for example, character-based breaking might be used to fit a
    // phrase within a limited amount of screen space).
    if ((weights[index_max] <= kMinThresholdForBreaking) &&
        (strategy == Strategy::kWords)) {
      continue;
    }

    // If the phrase meets the qualification criterion, don't break it further.
    // The exception is if there's an obvious breaking point (as indicated by a
    // really high weight), in which case a further break would be warranted; so
    // we also check if the maximum weight is less than the breaking threshold.
    if (QualifiesAsPhrase(tokenized_sentence, start, end, strategy,
                          threshold) &&
        (weights[index_max] < kMaxThresholdForBreaking)) {
      continue;
    }

    // Add a triangular bias to the weights to favor breaking near the center
    // (i.e., generating longer, more balanced phrases) in the case of ties.
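    // For example, if all raw weights in a range are equal, the triangular
    // bias makes the centermost boundary win, splitting the phrase roughly in
    // half rather than shaving a single word off one end.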
    auto tri = Triangle(end - start);
    for (int i = 0; i < end - start; i++) {
      weights[i] += tri[i];
    }

    // Insert a phrase break at the boundary with the greatest weight. The
    // maximum must be recomputed here because the triangular bias may have
    // changed which boundary wins.
    index_max = 0;
    for (unsigned int i = 1; i < weights.size(); i++) {
      if (weights[i] > weights[index_max]) {
        index_max = i;
      }
    }

    int break_location = start + index_max;
    // Set the break likelihood to 1.0. This was found to have the best
    // performance for the GUM verified-1000 dataset (English only). Other
    // options that were tried included setting the break likelihood by scaling
    // down the weight, followed by a dynamic programming search for the most
    // likely phrases. These approaches might work better for other languages.
    break_likelihood[break_location] = 1.0f;

    candidate_phrases.emplace_back(start, start + index_max, depth + 1);
    if (index_max < token_boundaries.boundary_weights().size() - 1) {
      candidate_phrases.emplace_back(start + index_max + 1, end, depth + 1);
    }
  }

  return break_likelihood;
}
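
// Illustrative usage (a sketch only; the TokenizedSentence and TokenBoundaries
// construction shown here is an assumption for illustration, not the actual
// API, which is defined in their respective headers):
//
//   TokenizedSentence sentence(u"The quick brown fox jumps over the lazy dog");
//   TokenBoundaries boundaries(/* per-boundary weights for `sentence` */);
//   PhraseSegmenter segmenter;
//   std::vector<float> breaks = segmenter.CalculatePhraseBoundariesVector(
//       sentence, boundaries, Strategy::kWords, /*threshold=*/5);
//   // breaks[i] == 1.0f marks a phrase break at token boundary i.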