File: translation_util.cc

package info (click to toggle)
chromium 139.0.7258.127-1
links: PTS, VCS
area: main
in suites:
size: 6,122,068 kB
sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (161 lines) | stat: -rw-r--r-- 5,758 bytes
parent folder | download | duplicates (6)
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/live_caption/translation_util.h"

#include <memory>
#include <utility>

#include "base/strings/strcat.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "third_party/icu/source/common/unicode/brkiter.h"
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/common/unicode/uscript.h"
#include "third_party/re2/src/re2/re2.h"
#include "ui/base/l10n/l10n_util.h"

namespace captions {
std::vector<std::string> SplitSentences(const std::string& text,
                                        const std::string& locale) {
  std::vector<std::string> sentences;
  UErrorCode status = U_ZERO_ERROR;

  // Use icu::BreakIterator instead of base::i18n::BreakIterator to avoid flakey
  // mid-string sentence breaks.
  icu::BreakIterator* iter =
      icu::BreakIterator::createSentenceInstance(locale.c_str(), status);

  DCHECK(U_SUCCESS(status))
      << "ICU could not open a break iterator: " << u_errorName(status) << " ("
      << status << ")";

  // Set the text to be analyzed.
  icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(text);
  iter->setText(unicode_text);

  // Iterate over the sentences.
  int32_t start = iter->first();
  int32_t end = iter->next();
  while (end != icu::BreakIterator::DONE) {
    icu::UnicodeString sentence;
    unicode_text.extractBetween(start, end, sentence);
    std::string sentence_string;
    sentence.toUTF8String(sentence_string);
    sentences.emplace_back(sentence_string);
    start = end;
    end = iter->next();
  }

  delete iter;

  return sentences;
}

// Returns true when `str` is non-empty and its final byte is an ASCII
// whitespace character. Only the last byte is inspected.
bool ContainsTrailingSpace(const std::string& str) {
  if (str.empty()) {
    return false;
  }

  const char last_char = str.back();
  return base::IsAsciiWhitespace(last_char);
}

// Returns a copy of `str` with at most one trailing ASCII whitespace character
// removed. Strings without trailing whitespace are returned unchanged.
std::string RemoveTrailingSpace(const std::string& str) {
  if (!ContainsTrailingSpace(str)) {
    return str;
  }

  // Copy everything except the final character.
  return std::string(str, 0, str.size() - 1);
}

// Normalizes `str` for use in cache keys: every match of the `[[:punct:]]`
// character class is deleted, then the result is lowercased with
// base::ToLowerASCII. `str` is taken by value so it can be edited in place.
std::string RemovePunctuationToLower(std::string str) {
  // Strip punctuation first; the number of replacements is not needed.
  re2::RE2::GlobalReplace(&str, "[[:punct:]]", "");

  std::string lowered = base::ToLowerASCII(str);
  return lowered;
}

// Builds the cache key for a sentence. The key has the form
//   <source_language><target_language>|<normalized transcription>
// where the transcription is normalized by RemovePunctuationToLower(). Note
// that the two language codes are joined without a separator.
std::string GetTranslationCacheKey(const std::string& source_language,
                                   const std::string& target_language,
                                   const std::string& transcription) {
  const std::string normalized_transcription =
      RemovePunctuationToLower(transcription);
  return base::StrCat(
      {source_language, target_language, "|", normalized_transcription});
}

bool IsIdeographicLocale(const std::string& locale) {
  // Retrieve the script codes used by the given language from ICU. When the
  // given language consists of two or more scripts, we just use the first
  // script. The size of returned script codes is always < 8. Therefore, we use
  // an array of size 8 so we can include all script codes without insufficient
  // buffer errors.
  UErrorCode error = U_ZERO_ERROR;
  UScriptCode script_code[8];
  int scripts = uscript_getCode(locale.c_str(), script_code,
                                std::size(script_code), &error);

  return U_SUCCESS(error) && scripts >= 1 &&
         (script_code[0] == USCRIPT_HAN || script_code[0] == USCRIPT_HIRAGANA ||
          script_code[0] == USCRIPT_YI || script_code[0] == USCRIPT_KATAKANA);
}

// Construction and destruction need no custom logic; the compiler-generated
// versions are used.
TranslationCache::TranslationCache() = default;
TranslationCache::~TranslationCache() = default;

std::pair<std::string, std::string>
TranslationCache::FindCachedTranslationOrRemaining(
    const std::string& transcript,
    const std::string& source_language,
    const std::string& target_language) const {
  std::vector<std::string> sentences =
      SplitSentences(transcript, source_language);

  std::string cached_translation;
  std::string string_to_translate;
  bool cached_translation_found = true;
  for (const std::string& sentence : sentences) {
    if (cached_translation_found) {
      std::string trailing_space =
          ContainsTrailingSpace(sentence)
              ? sentence.substr(sentence.length() - 1, sentence.length())
              : std::string();
      auto translation_cache_key = GetTranslationCacheKey(
          source_language, target_language,
          trailing_space.empty() ? sentence : RemoveTrailingSpace(sentence));
      auto iter = translation_cache_.find(translation_cache_key);
      if (iter != translation_cache_.end()) {
        cached_translation += iter->second;
        if (!trailing_space.empty()) {
          cached_translation += trailing_space;
        }

        continue;
      }
      cached_translation_found = false;
    }

    string_to_translate = base::StrCat({string_to_translate, sentence});
  }
  if (cached_translation_found) {
    return std::make_pair("", cached_translation);
  } else {
    return std::make_pair(string_to_translate, cached_translation);
  }
}

// Caches sentence-level translations from a completed translation request.
//
// `original_transcription` is split with `source_language` and `result` with
// `target_language`. Nothing is cached unless the original has more than one
// sentence and both sides split into the same number of sentences. When they
// do, sentence i of the original is mapped to sentence i of the result for
// every sentence except the last one.
void TranslationCache::InsertIntoCache(
    const std::string& original_transcription,
    const std::string& result,
    const std::string& source_language,
    const std::string& target_language) {
  const auto source_sentences =
      SplitSentences(original_transcription, source_language);
  const auto target_sentences = SplitSentences(result, target_language);

  const size_t sentence_count = source_sentences.size();
  if (sentence_count <= 1 || sentence_count != target_sentences.size()) {
    return;
  }

  // The final sentence is deliberately skipped.
  // NOTE(review): presumably because it may still be incomplete - confirm.
  const size_t cacheable_count = sentence_count - 1;
  for (size_t index = 0; index != cacheable_count; ++index) {
    // Sentences are always cached without the trailing space, on both the key
    // side and the value side.
    const std::string source_sentence =
        RemoveTrailingSpace(source_sentences[index]);
    translation_cache_.insert({GetTranslationCacheKey(
                                   source_language, target_language,
                                   source_sentence),
                               RemoveTrailingSpace(target_sentences[index])});
  }
}

// Removes every cached sentence translation.
void TranslationCache::Clear() {
  translation_cache_.clear();
}

}  // namespace captions