1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
|
// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/live_caption/greedy_text_stabilizer.h"
#include <string>
#include <unordered_map>
#include <vector>
#include "base/strings/string_tokenizer.h"
namespace {
// Returns a copy of |input| with at most one trailing whitespace character
// removed. Strings that are empty or do not end in whitespace are returned
// unchanged.
std::string RemoveTrailingSpace(const std::string& input) {
  // std::isspace() has undefined behavior for negative arguments other than
  // EOF, so the (possibly signed) char must be widened through unsigned char.
  // This matters for non-ASCII bytes, e.g. the tail of a UTF-8 sequence.
  if (!input.empty() &&
      std::isspace(static_cast<unsigned char>(input.back()))) {
    return input.substr(0, input.length() - 1);
  }
  return input;
}
} // namespace
namespace captions {
// Stores the minimum per-position observation count a token needs before
// UpdateText() treats it as stable. Negative values are clamped to 0.
GreedyTextStabilizer::GreedyTextStabilizer(int min_token_frequency)
    : min_token_frequency_(min_token_frequency > 0 ? min_token_frequency : 0) {}
// Defaulted destructor: no hand-written teardown is performed here.
GreedyTextStabilizer::~GreedyTextStabilizer() = default;
// Feeds one recognition result into the stabilizer and returns the text that
// should be displayed.
//
// |input_text| is the full recognition result so far. When |is_final| is true
// the input is returned unchanged and all accumulated state is reset.
// Otherwise the returned string is the current stable text: the longest
// leading run of tokens that are each (a) seen at least
// |min_token_frequency_| times at their position and (b) a mode of the
// histogram for that position.
std::string GreedyTextStabilizer::UpdateText(const std::string& input_text,
                                             const bool is_final) {
  // Final results are emitted verbatim, unstable tokens included. Dropping
  // the state here prepares for the next series of partial results.
  if (is_final) {
    Reset();
    return input_text;
  }

  const std::vector<std::string> tokens = Tokenize(input_text);

  // A minimum frequency of 0 is defined as pass-through, so no histogram
  // bookkeeping is needed.
  if (min_token_frequency_ == 0) {
    stable_text_ = input_text;
    stable_token_count_ = tokens.size();
    return input_text;
  }

  // Record one observation of every token in the histogram that belongs to
  // its position within the sentence.
  for (unsigned long pos = 0; pos < tokens.size(); ++pos) {
    const std::string key = RemoveTrailingSpace(tokens[pos]);
    if (pos < tokens_histograms_.size()) {
      // Position already known: bump this token's count.
      tokens_histograms_[pos][key]++;
      continue;
    }
    // First time any input reaches this position: grow the vector with a
    // histogram that contains just this token.
    tokens_histograms_.push_back(
        std::unordered_map<std::string, int>{{key, 1}});
  }

  // Walk the tokens from the front and stop at the first one that is not
  // stable; everything before it forms the stable prefix.
  stable_token_count_ = 0;
  int stable_character_count = 0;
  for (unsigned long pos = 0; pos < tokens.size(); pos++) {
    const std::string key = RemoveTrailingSpace(tokens[pos]);
    const bool is_stable =
        pos < tokens_histograms_.size() &&
        tokens_histograms_[pos][key] >= min_token_frequency_ &&
        IsMode(key, tokens_histograms_[pos]);
    if (!is_stable) {
      break;
    }
    // Character accounting uses the unstripped token so that the prefix
    // length lines up with |input_text|.
    stable_character_count += tokens[pos].size();
    stable_token_count_++;
  }

  // Keep the previous stable text when the new stable prefix has fewer tokens
  // than the best seen so far; this prevents the displayed text from
  // shrinking.
  if (stable_token_count_ < max_stable_token_count_) {
    return stable_text_;
  }

  // The new prefix has at least as many tokens as the previous best: adopt it.
  max_stable_token_count_ = stable_token_count_;
  stable_text_ = stable_token_count_ <= 0
                     ? std::string()
                     : input_text.substr(0, stable_character_count);
  return stable_text_;
}
// Clears everything learned from earlier inputs: the per-position token
// histograms, the stable text, and both stable-token counters.
void GreedyTextStabilizer::Reset() {
  tokens_histograms_.clear();
  stable_text_.clear();
  stable_token_count_ = 0;
  max_stable_token_count_ = 0;
}
// Splits |input_text| on spaces and returns the resulting tokens in order.
//
// The tokenizer is configured with RETURN_DELIMS (presumably this makes the
// space delimiters come back as tokens as well; confirm against
// base::StringTokenizer). Any token whose last character is punctuation is
// split in two: everything before the punctuation, then the punctuation
// character itself. Note that a token consisting of a single punctuation
// character therefore yields an empty token followed by the punctuation.
std::vector<std::string> GreedyTextStabilizer::Tokenize(
    const std::string& input_text) {
  std::vector<std::string> tokens;
  base::StringTokenizer t(input_text, " ");
  t.set_options(base::StringTokenizer::RETURN_DELIMS);
  while (t.GetNext()) {
    // Fetch the token once instead of re-requesting it for every access.
    const std::string token = t.token();

    // Trailing punctuation is treated as a separate token so that flickering
    // punctuation can be handled appropriately. std::ispunct() has undefined
    // behavior for negative arguments other than EOF, so the char is widened
    // through unsigned char first (relevant for non-ASCII bytes).
    if (!token.empty() &&
        std::ispunct(static_cast<unsigned char>(token.back()))) {
      tokens.push_back(token.substr(0, token.size() - 1));
      tokens.push_back(token.substr(token.size() - 1, 1));
    } else {
      tokens.push_back(token);
    }
  }
  return tokens;
}
// Reports whether |token| is a mode of |token_histogram|, i.e. whether no
// other entry has a strictly larger count. Ties count as modes.
//
// |token| must be present in |token_histogram|; the lookup uses at().
bool GreedyTextStabilizer::IsMode(
    const std::string& token,
    const std::unordered_map<std::string, int>& token_histogram) {
  const int reference_count = token_histogram.at(token);

  // A histogram may have several modes. |token| only has to be one of them,
  // so a single entry with a strictly higher count is enough to rule it out.
  for (auto it = token_histogram.begin(); it != token_histogram.end(); ++it) {
    if (reference_count < it->second) {
      return false;
    }
  }
  return true;
}
} // namespace captions
|