File: omnibox_pedal_provider.cc

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (312 lines) | stat: -rw-r--r-- 13,049 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/omnibox/browser/actions/omnibox_pedal_provider.h"

#include <numeric>
#include <string_view>
#include <unordered_map>

#include "base/i18n/case_conversion.h"
#include "base/i18n/char_iterator.h"
#include "base/i18n/rtl.h"
#include "base/metrics/field_trial_params.h"
#include "base/strings/string_tokenizer.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/trace_event/memory_usage_estimator.h"
#include "components/omnibox/browser/actions/omnibox_pedal.h"
#include "components/omnibox/browser/actions/omnibox_pedal_concepts.h"
#include "components/omnibox/browser/autocomplete_input.h"
#include "components/omnibox/browser/autocomplete_provider_client.h"
#include "components/omnibox/browser/omnibox_field_trial.h"
#include "components/omnibox/common/omnibox_features.h"
#include "components/omnibox/resources/grit/omnibox_pedal_synonyms.h"
#include "ui/base/l10n/l10n_util.h"

namespace {

// Tokenizer specialization for UTF-16 strings; `using` alias preferred over
// `typedef` per modern C++ / Chromium style.
using StringTokenizer16 =
    base::StringTokenizerT<std::u16string, std::u16string::const_iterator>;

// This is a hard upper bound on the number of tokens that will be processed.
// The value 61 was determined from the original body of translation data,
// but translators gradually increased the length of strings. Each time
// the limit is exceeded, the translation pipeline breaks, so 100 provides
// some cushion; but it shouldn't be unbounded as that would adversely
// affect performance. Pedals are checked on each keystroke in the omnibox.
constexpr size_t kMaxTokens = 100;

// All characters in this string get removed from text before processing.
// U+200F is a RTL marker punctuation character that seems to throw
// off some triggers in 'ar'. Null-terminated for use with base::RemoveChars.
constexpr char16_t kRemoveChars[] = {0x200F, 0};

}  // namespace

size_t EstimateMemoryUsage(scoped_refptr<OmniboxPedal> pedal) {
  // The ref-counted Pedals are accounted as part of the provider's own
  // memory usage for trace reporting.
  const size_t pedal_bytes = pedal->EstimateMemoryUsage();
  return pedal_bytes;
}

OmniboxPedalProvider::OmniboxPedalProvider(
    AutocompleteProviderClient& client,
    std::unordered_map<OmniboxPedalId, scoped_refptr<OmniboxPedal>> pedals)
    : client_(client),
      pedals_(std::move(pedals)),
      ignore_group_(false, false, 0),
      match_tokens_(kMaxTokens) {
  LoadPedalConcepts();

  // Cull Pedals with incomplete translation data. Such Pedals won't trigger
  // if not enabled, but there's no need to keep them in a collection that
  // is iterated on each keystroke.
  const auto has_incomplete_labels = [](const auto& entry) {
    const OmniboxPedal::LabelStrings& labels = entry.second->GetLabelStrings();
    return labels.hint.empty() || labels.suggestion_contents.empty() ||
           labels.accessibility_hint.empty() ||
           labels.accessibility_suffix.empty();
  };
  std::erase_if(pedals_, has_incomplete_labels);
}

// Defaulted: all members release their resources via their own destructors.
OmniboxPedalProvider::~OmniboxPedalProvider() = default;

size_t OmniboxPedalProvider::EstimateMemoryUsage() const {
  // Report the combined estimated footprint of every owned data structure.
  return base::trace_event::EstimateMemoryUsage(dictionary_) +
         base::trace_event::EstimateMemoryUsage(ignore_group_) +
         base::trace_event::EstimateMemoryUsage(pedals_) +
         base::trace_event::EstimateMemoryUsage(tokenize_characters_);
}

OmniboxPedal* OmniboxPedalProvider::FindPedalMatch(
    const std::u16string& match_text) {
  Tokenize(match_tokens_, match_text);
  if (match_tokens_.Size() == 0) {
    return nullptr;
  }

  // The ignore group is the only group that fully erases container elements
  // instead of merely consuming them. Full removal is required so that stop
  // words cannot break meaningful token sequences: for "make the most of
  // chrome features", "the" must be removed entirely, producing
  // "make most of" which matches the preprocessed sequence "make the most of".
  // Consuming "the" in place would instead yield "make _ most of", which
  // would not match.
  const bool erased_any = ignore_group_.EraseMatchesIn(match_tokens_, true);
  if (erased_any && match_tokens_.Size() == 0) {
    // Every token present was an ignored token; nothing left to match.
    return nullptr;
  }

  for (const auto& [id, pedal] : pedals_) {
    // Restore link validity after the EraseMatchesIn call above, and prepare
    // |match_tokens_| for the next check on each subsequent iteration.
    match_tokens_.ResetLinks();
    if (pedal->IsConceptMatch(match_tokens_)) {
      return pedal.get();
    }
  }
  return nullptr;
}

OmniboxPedal* OmniboxPedalProvider::FindReadyPedalMatch(
    const AutocompleteInput& input,
    const std::u16string& match_text) {
  // A concept match only counts when the pedal is also ready to trigger for
  // this input; otherwise report no match at all.
  OmniboxPedal* const match = FindPedalMatch(match_text);
  if (match != nullptr && match->IsReadyToTrigger(input, *client_)) {
    return match;
  }
  return nullptr;
}

void OmniboxPedalProvider::Tokenize(OmniboxPedal::TokenSequence& out_tokens,
                                    const std::u16string& text) const {
  // FoldCase (not ToLower) is used here and elsewhere in this code; see
  // base/i18n/case_conversion.h for advice about unicode case handling.
  // FoldCase matches lower-casing for ASCII/English but gives canonical,
  // consistent handling for other languages too.
  std::u16string folded = base::i18n::FoldCase(text);
  base::RemoveChars(folded, kRemoveChars, &folded);
  out_tokens.Clear();
  if (tokenize_characters_.empty()) {
    // No delimiters: treat each Unicode character as a token.
    base::i18n::UTF16CharIterator char_iter(folded);
    size_t begin = 0;
    while (!char_iter.end()) {
      char_iter.Advance();
      const size_t end = char_iter.array_pos();
      if (end <= begin) {
        break;
      }
      const auto token = folded.substr(begin, end - begin);
      const auto found = dictionary_.find(token);
      if (found == dictionary_.end() || out_tokens.Size() >= kMaxTokens) {
        // No Pedal can possibly match: either a token is absent from the
        // token dictionary, or the text contains too many tokens.
        out_tokens.Clear();
        break;
      }
      out_tokens.Add(found->second);
      begin = end;
    }
  } else {
    // Delimiters neatly divide the string into tokens.
    StringTokenizer16 tokenizer(folded, tokenize_characters_);
    while (tokenizer.GetNext()) {
      const auto found = dictionary_.find(tokenizer.token());
      if (found == dictionary_.end() || out_tokens.Size() >= kMaxTokens) {
        // No Pedal can possibly match: either a token is absent from the
        // token dictionary, or the text contains too many tokens.
        out_tokens.Clear();
        break;
      }
      out_tokens.Add(found->second);
    }
  }
}

// Tokenizes `token_sequence_string` into `out_tokens`, assigning a fresh
// dictionary id to every token not already present in `dictionary_`.
// Unlike Tokenize(), unknown tokens are never a failure here — they grow the
// dictionary. Used while loading concept data, not on the per-keystroke path.
// NOTE: ordering is subtle below: `dictionary_.size()` is read *before* the
// insert completes (both in Add() and in the inserted pair), so the new token
// receives an id equal to the pre-insert dictionary size, keeping ids dense.
void OmniboxPedalProvider::TokenizeAndExpandDictionary(
    OmniboxPedal::TokenSequence& out_tokens,
    const std::u16string& token_sequence_string) {
  out_tokens.Clear();
  if (tokenize_characters_.empty()) {
    // Tokenize on Unicode character boundaries when we have no delimiters.
    base::i18n::UTF16CharIterator char_iter(token_sequence_string);
    size_t left = 0;
    while (!char_iter.end()) {
      char_iter.Advance();
      size_t right = char_iter.array_pos();
      if (right > left) {
        const std::u16string raw_token =
            token_sequence_string.substr(left, right - left);
        // Case-fold each token so lookups agree with Tokenize(), which
        // folds the whole input string up front.
        std::u16string token = base::i18n::FoldCase(raw_token);
        const auto iter = dictionary_.find(token);
        if (iter == dictionary_.end()) {
          // Token not in dictionary; expand dictionary.
          out_tokens.Add(dictionary_.size());
          dictionary_.insert({std::move(token), dictionary_.size()});
        } else {
          // Token in dictionary; add existing token identifier to sequence.
          out_tokens.Add(iter->second);
        }
        left = right;
      } else {
        break;
      }
    }
  } else {
    // Delimiters will neatly divide the string into tokens.
    // NOTE(review): this branch trims whitespace from each token while the
    // character-boundary branch above does not — presumably because
    // whitespace can itself be a meaningful token there; confirm if changing.
    StringTokenizer16 tokenizer(token_sequence_string, tokenize_characters_);
    while (tokenizer.GetNext()) {
      std::u16string raw_token = tokenizer.token();
      std::u16string_view trimmed_token =
          base::TrimWhitespace(raw_token, base::TrimPositions::TRIM_ALL);
      std::u16string token = base::i18n::FoldCase(trimmed_token);
      const auto iter = dictionary_.find(token);
      if (iter == dictionary_.end()) {
        // Token not in dictionary; expand dictionary.
        out_tokens.Add(dictionary_.size());
        dictionary_.insert({std::move(token), dictionary_.size()});
      } else {
        // Token in dictionary; add existing token identifier to sequence.
        out_tokens.Add(iter->second);
      }
    }
  }
}

// Loads localized concept data for all pedals: picks the tokenization mode
// for the current locale, builds the shared ignore group, then builds each
// pedal's verbatim and synonym token sequences. Order matters throughout:
// the ignore group must be fully built (and sorted) before pedal sequences
// are processed, because those sequences have ignore matches erased.
void OmniboxPedalProvider::LoadPedalConcepts() {
  // The locale is a two-letter language code, possibly followed by a dash and
  // country code. English locales include "en", "en-US", and "en-GB" while
  // non-English locales never start with "en".
  const std::string locale = base::i18n::GetConfiguredLocale();
  const std::string language_code = locale.substr(0, 2);

  // According to the pedals localization data, only a few languages
  // were set to tokenize each character, so those are checked directly here.
  // Note, zh-CN was set to tokenize each character but zh-TW was not so the
  // full locale is checked for that exceptional case.
  if (language_code == "ja" || (language_code == "zh" && locale != "zh-TW")) {
    tokenize_characters_ = u"";
  } else {
    tokenize_characters_ = u" -";
  }

  ignore_group_ = LoadSynonymGroupString(
      false, false, l10n_util::GetStringUTF16(IDS_OMNIBOX_PEDALS_IGNORE_GROUP));
  if (tokenize_characters_.empty()) {
    // Translation console sourced data has lots of spaces, but in practice
    // the ignore group doesn't include a single space sequence. Rather than
    // burden l10n with getting this nuance in the data precisely specified,
    // we simply hardcode to ignore spaces. This applies for all languages
    // that don't tokenize on spaces (see `tokenize_characters_` above).
    // NOTE(review): operator[] default-inserts id 0 if u" " is absent from
    // the dictionary — presumably the space token was already added while
    // loading the ignore group string above; confirm before relying on it.
    ignore_group_.AddSynonym(
        OmniboxPedal::TokenSequence(std::vector<int>({dictionary_[u" "]})));
  }
  ignore_group_.SortSynonyms();

  for (auto& entry : pedals_) {
    OmniboxPedal* pedal = entry.second.get();
    // The verbatim sequence is built from the pedal's hint label, with
    // ignore-group tokens fully erased so it matches like typed queries do.
    OmniboxPedal::TokenSequence verbatim_sequence(0);
    TokenizeAndExpandDictionary(verbatim_sequence,
                                pedal->GetLabelStrings().hint);
    ignore_group_.EraseMatchesIn(verbatim_sequence, true);
    pedal->AddVerbatimSequence(std::move(verbatim_sequence));

    std::vector<OmniboxPedal::SynonymGroupSpec> specs =
        pedal->SpecifySynonymGroups(language_code == "en");
    DCHECK(!specs.empty());
    for (const auto& spec : specs) {
      // Note, group strings are not preprocessed; they are the raw outputs
      // from translators in the localization pipeline, so we need to remove
      // ignore group sequences and validate remaining data. The groups
      // are sorted *after* erasing the ignore group to ensure no synonym
      // token sequences are made shorter than sequences later in the order,
      // which would break an invariant expected by the matching algorithm.
      OmniboxPedal::SynonymGroup group =
          LoadSynonymGroupString(spec.required, spec.match_once,
                                 l10n_util::GetStringUTF16(spec.message_id));
      group.EraseIgnoreGroup(ignore_group_);
      group.SortSynonyms();
      // Invalid groups (e.g. emptied entirely by ignore erasure) are dropped.
      if (group.IsValid()) {
        pedal->AddSynonymGroup(std::move(group));
      }
    }
  }

  // Give all pedals a final chance to override/mutate based on feature flags.
  for (auto& entry : pedals_) {
    entry.second->OnLoaded();
  }
}

OmniboxPedal::SynonymGroup OmniboxPedalProvider::LoadSynonymGroupString(
    bool required,
    bool match_once,
    std::u16string synonyms_csv) {
  // Strip characters (e.g. RTL marks) that would interfere with matching.
  base::RemoveChars(synonyms_csv, kRemoveChars, &synonyms_csv);
  OmniboxPedal::SynonymGroup group(required, match_once, 0);
  // Synonyms are delimited by ',' — except the 'ar' language data, which
  // uses '،', and some 'ja' data, which uses '、'.
  StringTokenizer16 csv_tokenizer(synonyms_csv, u",،、");
  while (csv_tokenizer.GetNext()) {
    OmniboxPedal::TokenSequence token_sequence(0);
    // In languages where whitespace is significant but not a token
    // delimiter, trim and normalize whitespace that translators may have
    // added for reading convenience in the translation console.
    TokenizeAndExpandDictionary(
        token_sequence,
        base::CollapseWhitespace(csv_tokenizer.token(), false));
    // This DCHECK should only trigger for extra-long translation phrases,
    // which should be fixed in the translation data itself for best
    // efficiency, or by adjusting `kMaxTokens` if really necessary.
    DCHECK_LE(token_sequence.Size(), kMaxTokens);
    group.AddSynonym(std::move(token_sequence));
  }
  return group;
}