File: url_index_private_data.cc

package info (click to toggle)
chromium 139.0.7258.127-1
links: PTS, VCS
area: main
in suites:
size: 6,122,068 kB
sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (1039 lines) | stat: -rw-r--r-- 41,595 bytes
parent folder | download | duplicates (5)
// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/omnibox/browser/url_index_private_data.h"

#include <stdint.h>

#include <algorithm>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "base/containers/flat_set.h"
#include "base/containers/stack.h"
#include "base/feature_list.h"
#include "base/i18n/case_conversion.h"
#include "base/memory/raw_ptr.h"
#include "base/metrics/histogram_macros.h"
#include "base/stl_util.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "base/trace_event/memory_usage_estimator.h"
#include "components/bookmarks/browser/bookmark_model.h"
#include "components/bookmarks/browser/bookmark_utils.h"
#include "components/history/core/browser/history_database.h"
#include "components/history/core/browser/history_db_task.h"
#include "components/history/core/browser/history_service.h"
#include "components/omnibox/browser/in_memory_url_index.h"
#include "components/omnibox/browser/omnibox_field_trial.h"
#include "components/omnibox/browser/omnibox_triggered_feature_service.h"
#include "components/omnibox/browser/tailored_word_break_iterator.h"
#include "components/omnibox/common/omnibox_features.h"
#include "components/omnibox/common/string_cleaning.h"
#include "components/search_engines/template_url_service.h"
#include "third_party/metrics_proto/omnibox_event.pb.h"

namespace {

GURL ClearUsernameAndPassword(const GURL& url) {
  GURL::Replacements r;
  r.ClearUsername();
  r.ClearPassword();
  return url.ReplaceComponents(r);
}

// Algorithm Functions ---------------------------------------------------------

// Comparison function for sorting search terms by descending length.
bool LengthGreater(const std::u16string& string_a,
                   const std::u16string& string_b) {
  return string_a.length() > string_b.length();
}

}  // namespace

// UpdateRecentVisitsFromHistoryDBTask -----------------------------------------

// HistoryDBTask used to update the recent visit data for a particular
// row from the history database.
class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {
 public:
  explicit UpdateRecentVisitsFromHistoryDBTask(
      scoped_refptr<URLIndexPrivateData> private_data,
      history::URLID url_id);
  UpdateRecentVisitsFromHistoryDBTask(
      const UpdateRecentVisitsFromHistoryDBTask&) = delete;
  UpdateRecentVisitsFromHistoryDBTask& operator=(
      const UpdateRecentVisitsFromHistoryDBTask&) = delete;

  bool RunOnDBThread(history::HistoryBackend* backend,
                     history::HistoryDatabase* db) override;
  void DoneRunOnMainThread() override;

 private:
  ~UpdateRecentVisitsFromHistoryDBTask() override;

  // The URLIndexPrivateData that gets updated after the historyDB
  // task returns.
  scoped_refptr<URLIndexPrivateData> private_data_;
  // The ID of the URL to get visits for and then update.
  history::URLID url_id_;
  // Whether fetching the recent visits for the URL succeeded.
  bool succeeded_;
  // The awaited data that's shown to private_data_ for it to copy and
  // store.
  history::VisitVector recent_visits_;
};

UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
    scoped_refptr<URLIndexPrivateData> private_data,
    history::URLID url_id)
    : private_data_(std::move(private_data)),
      url_id_(url_id),
      succeeded_(false) {}

bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
    history::HistoryBackend* backend,
    history::HistoryDatabase* db) {
  succeeded_ = db->GetMostRecentVisitsForURL(
      url_id_, URLIndexPrivateData::kMaxVisitsToStoreInCache, &recent_visits_);
  if (!succeeded_)
    recent_visits_.clear();
  return true;  // Always claim to be done; do not retry failures.
}

void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
  if (succeeded_)
    private_data_->UpdateRecentVisits(url_id_, recent_visits_);
}

UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() =
    default;

// URLIndexPrivateData ---------------------------------------------------------

// static
constexpr size_t URLIndexPrivateData::kMaxVisitsToStoreInCache;

URLIndexPrivateData::URLIndexPrivateData() = default;

ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
    std::u16string original_search_string,
    size_t cursor_position,
    const std::string& host_filter,
    size_t max_matches,
    bookmarks::BookmarkModel* bookmark_model,
    TemplateURLService* template_url_service,
    OmniboxTriggeredFeatureService* triggered_feature_service) {
  // This list will contain the original search string and any other string
  // transformations.
  String16Vector search_strings;
  search_strings.push_back(original_search_string);
  if ((cursor_position != std::u16string::npos) &&
      (cursor_position < original_search_string.length()) &&
      (cursor_position > 0)) {
    // The original search_string broken at cursor position. This is one type of
    // transformation.  It's possible this transformation doesn't actually
    // break any words.  There's no harm in adding the transformation in this
    // case because the searching code below prevents running duplicate
    // searches.
    std::u16string transformed_search_string(original_search_string);
    transformed_search_string.insert(cursor_position, u" ");
    search_strings.push_back(transformed_search_string);
  }

  ScoredHistoryMatches scored_items;
  // Invalidate the term cache and return if we have indexed no words (probably
  // because we've not been initialized yet).
  if (word_list_.empty()) {
    search_term_cache_.clear();
    return scored_items;
  }
  // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
  // approach.
  ResetSearchTermCache();

  bool history_ids_were_trimmed = false;
  // A set containing the list of words extracted from each search string,
  // used to prevent running duplicate searches.
  std::set<String16Vector> seen_search_words;
  for (const std::u16string& search_string : search_strings) {
    // The search string we receive may contain escaped characters. For reducing
    // the index we need individual, lower-cased words, ignoring escapings. For
    // the final filtering we need whitespace separated substrings possibly
    // containing escaped characters.
    std::u16string lower_raw_string(base::i18n::ToLower(search_string));
    // Have to convert to UTF-8 and back, because UnescapeURLComponent doesn't
    // support unescaping UTF-8 characters and converting them to UTF-16.
    std::u16string lower_unescaped_string =
        base::UTF8ToUTF16(base::UnescapeURLComponent(
            base::UTF16ToUTF8(lower_raw_string),
            base::UnescapeRule::SPACES | base::UnescapeRule::PATH_SEPARATORS |
                base::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS));

    // Extract individual 'words' (as opposed to 'terms'; see the declaration
    // comment in `GetTermsAndWordStartsOffsets()`) from the search string. When
    // the user types "colspec=ID%20Mstone Release" we get four 'words':
    // "colspec", "id", "mstone" and "release".
    String16Vector lower_words(
        String16VectorFromString16(lower_unescaped_string, nullptr));
    if (lower_words.empty())
      continue;
    // If we've already searched for this list of words, don't do it again.
    if (seen_search_words.find(lower_words) != seen_search_words.end())
      continue;
    seen_search_words.insert(lower_words);

    HistoryIDVector history_ids = HistoryIDsFromWords(lower_words);
    history_ids_were_trimmed |= TrimHistoryIdsPool(&history_ids);

    HistoryIdsToScoredMatches(std::move(history_ids), lower_raw_string,
                              host_filter, template_url_service, bookmark_model,
                              &scored_items, triggered_feature_service);
  }
  // Select and sort only the top |max_matches| results.
  if (scored_items.size() > max_matches) {
    // Sort the top |max_matches| * 2 matches which is cheaper than sorting all
    // matches yet likely sufficient to contain |max_matches| unique matches
    // most of the time.
    auto first_pass_size = std::min(scored_items.size(), max_matches * 2);
    std::partial_sort(
        scored_items.begin(), scored_items.begin() + first_pass_size,
        scored_items.end(), ScoredHistoryMatch::MatchScoreGreater);

    // When ML scoring w/increased candidates is enabled, all candidates outside
    // of some light filtering should be passed to the controller to be
    // re-scored. Do not discard matches by resizing. These will have a zero
    // relevance score, so it's ok to not sort anything past `first_pass_size`.
    bool skip_resize =
        OmniboxFieldTrial::IsMlUrlScoringUnlimitedNumCandidatesEnabled();
    if (!skip_resize) {
      scored_items.resize(first_pass_size);
    }

    // Filter unique matches to maximize the use of the `max_matches` capacity.
    // It's possible this'll still end up with duplicates as having unique
    // URL IDs does not guarantee having unique `stripped_destination_url`.
    std::set<HistoryID> seen_history_ids;
    std::erase_if(scored_items, [&](const auto& scored_item) {
      HistoryID scored_item_id = scored_item.url_info.id();
      bool duplicate = seen_history_ids.count(scored_item_id);
      seen_history_ids.insert(scored_item_id);
      return duplicate;
    });
    if (!skip_resize && scored_items.size() > max_matches) {
      scored_items.resize(max_matches);
    }

  } else {
    std::sort(scored_items.begin(), scored_items.end(),
              ScoredHistoryMatch::MatchScoreGreater);
  }
  if (history_ids_were_trimmed) {
    // If we trim the results set we do not want to cache the results for next
    // time as the user's ultimately desired result could easily be eliminated
    // in the early rough filter.
    search_term_cache_.clear();
  } else {
    // Remove any stale SearchTermCacheItems.
    std::erase_if(search_term_cache_,
                  [](const SearchTermCacheMap::value_type& item) {
                    return !item.second.used_;
                  });
  }

  return scored_items;
}

const std::vector<std::string>& URLIndexPrivateData::HighlyVisitedHosts()
    const {
  return highly_visited_hosts_;
}

bool URLIndexPrivateData::UpdateURL(
    history::HistoryService* history_service,
    const history::URLRow& row,
    const std::set<std::string>& scheme_allowlist,
    base::CancelableTaskTracker* tracker) {
  // The row may or may not already be in our index. If it is not already
  // indexed and it qualifies then it gets indexed. If it is already
  // indexed and still qualifies then it gets updated, otherwise it
  // is deleted from the index.
  bool row_was_updated = false;
  history::URLID row_id = row.id();
  auto row_pos = history_info_map_.find(row_id);
  if (row_pos == history_info_map_.end()) {
    // This new row should be indexed if it qualifies.
    history::URLRow new_row(row);
    new_row.set_id(row_id);
    row_was_updated =
        RowQualifiesAsSignificant(new_row, base::Time()) &&
        IndexRow(nullptr, history_service, new_row, scheme_allowlist, tracker);
  } else if (RowQualifiesAsSignificant(row, base::Time())) {
    // TODO(manukh): If we decide to launch `kDomainSuggestions`, `host_visits_`
    //   should be incremented here.
    // This indexed row still qualifies and will be re-indexed.
    // The url won't have changed but the title, visit count, etc.
    // might have changed.
    history::URLRow& row_to_update = row_pos->second.url_row;
    bool title_updated = row_to_update.title() != row.title();
    if (row_to_update.visit_count() != row.visit_count() ||
        row_to_update.typed_count() != row.typed_count() ||
        row_to_update.last_visit() != row.last_visit() || title_updated) {
      row_to_update.set_visit_count(row.visit_count());
      row_to_update.set_typed_count(row.typed_count());
      row_to_update.set_last_visit(row.last_visit());
      // If something appears to have changed, update the recent visits
      // information.
      ScheduleUpdateRecentVisits(history_service, row_id, tracker);
      // While the URL is guaranteed to remain stable, the title may have
      // changed. If so, then update the index with the changed words.
      if (title_updated) {
        // Clear all words associated with this row and re-index both the
        // URL and title.
        RemoveRowWordsFromIndex(row_to_update);
        row_to_update.set_title(row.title());
        RowWordStarts word_starts;
        AddRowWordsToIndex(row_to_update, &word_starts);
        word_starts_map_[row_id] = std::move(word_starts);
      }
      row_was_updated = true;
    }
  } else {
    // This indexed row no longer qualifies and will be de-indexed by clearing
    // all words associated with this row.
    // TODO(manukh): If we decide to launch `kDomainSuggestions`, `host_visits_`
    //  should be decremented here, and if it falls below the threshold, the URL
    //  removed from `highly_visited_hosts_`.
    RemoveRowFromIndex(row);
    row_was_updated = true;
  }
  if (row_was_updated)
    search_term_cache_.clear();  // This invalidates the cache.
  return row_was_updated;
}

void URLIndexPrivateData::UpdateRecentVisits(
    history::URLID url_id,
    const history::VisitVector& recent_visits) {
  auto row_pos = history_info_map_.find(url_id);
  if (row_pos != history_info_map_.end()) {
    VisitInfoVector* visits = &row_pos->second.visits;
    visits->clear();
    const size_t size =
        std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
    visits->reserve(size);
    for (size_t i = 0; i < size; i++) {
      // Copy from the history::VisitVector the only fields visits needs.
      visits->push_back(std::make_pair(recent_visits[i].visit_time,
                                       recent_visits[i].transition));
    }
  }
  // Else: Oddly, the URL doesn't seem to exist in the private index.
  // Ignore this update.  This can happen if, for instance, the user
  // removes the URL from URLIndexPrivateData before the historyDB call
  // returns.
}

void URLIndexPrivateData::ScheduleUpdateRecentVisits(
    history::HistoryService* history_service,
    history::URLID url_id,
    base::CancelableTaskTracker* tracker) {
  history_service->ScheduleDBTask(
      FROM_HERE,
      std::unique_ptr<history::HistoryDBTask>(
          new UpdateRecentVisitsFromHistoryDBTask(this, url_id)),
      tracker);
}

bool URLIndexPrivateData::DeleteURL(const GURL& url) {
  // Find the matching entry in the history_info_map_.
  // To avoid creating a temporary GURL instance,
  // the lambda expression should return the GURL reference.
  auto pos = std::ranges::find(
      history_info_map_, url,
      [](const std::pair<const HistoryID, HistoryInfoMapValue>& item)
          -> const GURL& { return item.second.url_row.url(); });
  if (pos == history_info_map_.end())
    return false;
  RemoveRowFromIndex(pos->second.url_row);
  search_term_cache_.clear();  // This invalidates the cache.
  return true;
}

// static
scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
    history::HistoryDatabase* history_db,
    const std::set<std::string>& scheme_allowlist) {
  if (!history_db)
    return nullptr;

  history::URLDatabase::URLEnumerator history_enum;
  if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
    return nullptr;

  scoped_refptr<URLIndexPrivateData> rebuilt_data(new URLIndexPrivateData);

  // Limiting the number of URLs indexed degrades the quality of suggestions to
  // save memory. This limit is only applied for urls indexed at startup and
  // more urls can be indexed during the browsing session. The primary use case
  // is for Android devices where the session is typically short, and low-memory
  // machines in general (Desktop or Mobile).
  const int max_urls_indexed =
      OmniboxFieldTrial::MaxNumHQPUrlsIndexedAtStartup();
  int num_urls_indexed = 0;
  for (history::URLRow row; history_enum.GetNextURL(&row);) {
    CHECK(row.url().is_valid());
    // Do not use >= to account for case of -1 for unlimited urls.
    if (rebuilt_data->IndexRow(history_db, nullptr, row, scheme_allowlist,
                               nullptr) &&
        num_urls_indexed++ == max_urls_indexed) {
      break;
    }
  }

  UMA_HISTOGRAM_COUNTS_1M("History.InMemoryURLHistoryItems",
                          rebuilt_data->history_id_word_map_.size());
  // TODO(manukh): Add histograms if we decide to experiment with
  //  `kDomainSuggestions`.

  return rebuilt_data;
}

scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
  scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
  data_copy->word_list_ = word_list_;
  data_copy->available_words_ = available_words_;
  data_copy->word_map_ = word_map_;
  data_copy->char_word_map_ = char_word_map_;
  data_copy->word_id_history_map_ = word_id_history_map_;
  data_copy->history_id_word_map_ = history_id_word_map_;
  data_copy->history_info_map_ = history_info_map_;
  data_copy->word_starts_map_ = word_starts_map_;
  return data_copy;
  // Not copied:
  //    search_term_cache_
}

bool URLIndexPrivateData::Empty() const {
  return history_info_map_.empty();
}

void URLIndexPrivateData::Clear() {
  word_list_.clear();
  available_words_ = base::stack<WordID>();
  word_map_.clear();
  char_word_map_.clear();
  word_id_history_map_.clear();
  history_id_word_map_.clear();
  history_info_map_.clear();
  word_starts_map_.clear();
}

size_t URLIndexPrivateData::EstimateMemoryUsage() const {
  size_t res = 0;

  res += base::trace_event::EstimateMemoryUsage(search_term_cache_);
  res += base::trace_event::EstimateMemoryUsage(word_list_);
  res += base::trace_event::EstimateMemoryUsage(available_words_);
  res += base::trace_event::EstimateMemoryUsage(word_map_);
  res += base::trace_event::EstimateMemoryUsage(char_word_map_);
  res += base::trace_event::EstimateMemoryUsage(word_id_history_map_);
  res += base::trace_event::EstimateMemoryUsage(history_id_word_map_);
  res += base::trace_event::EstimateMemoryUsage(history_info_map_);
  res += base::trace_event::EstimateMemoryUsage(word_starts_map_);
  res += base::trace_event::EstimateMemoryUsage(host_visits_);
  res += base::trace_event::EstimateMemoryUsage(highly_visited_hosts_);

  return res;
}

// Note that when running Chrome normally this destructor isn't called during
// shutdown because these objects are intentionally leaked. See
// InMemoryURLIndex::Shutdown for details.
URLIndexPrivateData::~URLIndexPrivateData() = default;

HistoryIDVector URLIndexPrivateData::HistoryIDsFromWords(
    const String16Vector& unsorted_words) {
  // Break the terms down into individual terms (words), get the candidate
  // set for each term, and intersect each to get a final candidate list.
  // Note that a single 'term' from the user's perspective might be
  // a string like "http://www.somewebsite.com" which, from our perspective,
  // is four words: 'http', 'www', 'somewebsite', and 'com'.
  HistoryIDVector history_ids;
  String16Vector words(unsorted_words);
  // Sort the words into the longest first as such are likely to narrow down
  // the results quicker. Also, single character words are the most expensive
  // to process so save them for last.
  std::sort(words.begin(), words.end(), LengthGreater);

  // TODO(dyaroshev): write a generic algorithm(crbug.com/696167).
  for (auto iter = words.begin(); iter != words.end(); ++iter) {
    HistoryIDSet term_history_set = HistoryIDsForTerm(*iter);
    if (term_history_set.empty())
      return HistoryIDVector();

    if (iter == words.begin()) {
      history_ids = {term_history_set.begin(), term_history_set.end()};
    } else {
      // set-intersection
      std::erase_if(history_ids, base::IsNotIn<HistoryIDSet>(term_history_set));
    }
  }
  return history_ids;
}

bool URLIndexPrivateData::TrimHistoryIdsPool(
    HistoryIDVector* history_ids) const {
  constexpr size_t kItemsToScoreLimit = 500;
  if (history_ids->size() <= kItemsToScoreLimit)
    return false;

  // Trim down the set by sorting by typed-count, visit-count, and last
  // visit.
  auto new_end = history_ids->begin() + kItemsToScoreLimit;
  HistoryItemFactorGreater item_factor_functor(history_info_map_);

  std::nth_element(history_ids->begin(), new_end, history_ids->end(),
                   item_factor_functor);
  history_ids->erase(new_end, history_ids->end());

  return true;
}

HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
    const std::u16string& term) {
  if (term.empty())
    return HistoryIDSet();

  // TODO(mrossetti): Consider optimizing for very common terms such as
  //  'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
  //  occurring words in the user's searches.

  size_t term_length = term.length();
  WordIDSet word_id_set;
  if (term_length > 1) {
    // See if this term or a prefix thereof is present in the cache.
    std::u16string term_lower = base::i18n::ToLower(term);
    auto best_prefix(search_term_cache_.end());
    for (auto cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ++cache_iter) {
      if (base::StartsWith(term_lower, base::i18n::ToLower(cache_iter->first),
                           base::CompareCase::SENSITIVE) &&
          (best_prefix == search_term_cache_.end() ||
           cache_iter->first.length() > best_prefix->first.length()))
        best_prefix = cache_iter;
    }

    // If a prefix was found then determine the leftover characters to be used
    // for further refining the results from that prefix.
    Char16Set prefix_chars;
    std::u16string leftovers(term);
    if (best_prefix != search_term_cache_.end()) {
      // If the prefix is an exact match for the term then grab the cached
      // results and we're done.
      size_t prefix_length = best_prefix->first.length();
      if (prefix_length == term_length) {
        best_prefix->second.used_ = true;
        return best_prefix->second.history_id_set_;
      }

      // Otherwise we have a handy starting point.
      // If there are no history results for this prefix then we can bail early
      // as there will be no history results for the full term.
      if (best_prefix->second.history_id_set_.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      word_id_set = best_prefix->second.word_id_set_;
      prefix_chars = Char16SetFromString16(best_prefix->first);
      leftovers = term.substr(prefix_length);
    }

    // Filter for each remaining, unique character in the term.
    Char16Set leftover_chars = Char16SetFromString16(leftovers);
    Char16Set unique_chars =
        base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);

    // Reduce the word set with any leftover, unprocessed characters.
    if (!unique_chars.empty()) {
      WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
      // We might come up empty on the leftovers.
      if (leftover_set.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      }
      // Or there may not have been a prefix from which to start.
      if (prefix_chars.empty()) {
        word_id_set = std::move(leftover_set);
      } else {
        // set-intersection
        base::EraseIf(word_id_set, base::IsNotIn<WordIDSet>(leftover_set));
      }
    }

    // We must filter the word list because the resulting word set surely
    // contains words which do not have the search term as a proper subset.
    base::EraseIf(word_id_set, [this, &term](WordID word_id) {
      return word_list_[word_id].find(term) == std::u16string::npos;
    });
  } else {
    word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
  }

  // If any words resulted then we can compose a set of history IDs by unioning
  // the sets from each word.
  // We use |buffer| because it's more efficient to collect everything and then
  // construct a flat_set than to insert elements one by one.
  HistoryIDVector buffer;
  for (WordID word_id : word_id_set) {
    auto word_iter = word_id_history_map_.find(word_id);
    if (word_iter != word_id_history_map_.end()) {
      HistoryIDSet& word_history_id_set(word_iter->second);
      buffer.insert(buffer.end(), word_history_id_set.begin(),
                    word_history_id_set.end());
    }
  }
  HistoryIDSet history_id_set(buffer.begin(), buffer.end());

  // Record a new cache entry for this word if the term is longer than
  // a single character.
  if (term_length > 1)
    search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);

  return history_id_set;
}

WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
    const Char16Set& term_chars) {
  // TODO(dyaroshev): write a generic algorithm(crbug.com/696167).

  WordIDSet word_id_set;
  for (auto c_iter = term_chars.begin(); c_iter != term_chars.end(); ++c_iter) {
    auto char_iter = char_word_map_.find(*c_iter);
    // A character was not found so there are no matching results: bail.
    if (char_iter == char_word_map_.end())
      return WordIDSet();

    const WordIDSet& char_word_id_set(char_iter->second);
    // It is possible for there to no longer be any words associated with
    // a particular character. Give up in that case.
    if (char_word_id_set.empty())
      return WordIDSet();

    if (c_iter == term_chars.begin()) {
      word_id_set = char_word_id_set;
    } else {
      // set-intersection
      base::EraseIf(word_id_set, base::IsNotIn<WordIDSet>(char_word_id_set));
    }
  }
  return word_id_set;
}

void URLIndexPrivateData::HistoryIdsToScoredMatches(
    HistoryIDVector history_ids,
    const std::u16string& lower_raw_string,
    const std::string& host_filter,
    const TemplateURLService* template_url_service,
    bookmarks::BookmarkModel* bookmark_model,
    ScoredHistoryMatches* scored_items,
    OmniboxTriggeredFeatureService* triggered_feature_service) const {
  if (history_ids.empty())
    return;

  auto [lower_raw_terms, lower_terms_to_word_starts_offsets] =
      GetTermsAndWordStartsOffsets(lower_raw_string);

  // Don't score matches when there are no terms to score against.  (It's
  // possible that the word break iterater that extracts words to search for in
  // the database allows some whitespace "words" whereas SplitString excludes a
  // long list of whitespace.)  One could write a scoring function that gives a
  // reasonable order to matches when there are no terms (i.e., all the words
  // are some form of whitespace), but this is such a rare edge case that it's
  // not worth the time.
  if (lower_raw_terms.empty()) {
    return;
  }

  // Filter bad matches and other matches we don't want to display.
  std::erase_if(history_ids, [&](const HistoryID history_id) {
    return ShouldExclude(history_id, host_filter, template_url_service);
  });

  // Score the matches.
  const base::Time now = base::Time::Now();

  // `ScoredHistoryMatch` will score suggestions higher when there are fewer
  // matches. However, since HQP doesn't dedupe suggestions, this can be
  // problematic when there are multiple duplicate matches. Try counting the
  // unique hosts in the matches instead.
  size_t num_unique_hosts;
  std::set<std::string> unique_hosts = {};
  for (const auto& history_id : history_ids) {
    DCHECK(history_info_map_.count(history_id));
    unique_hosts.insert(
        history_info_map_.find(history_id)->second.url_row.url().host());
    // `ScoredHistoryMatch` assigns the same specificity to suggestions for
    // counts 4 or larger.
    // TODO(manukh) Should share `kMaxUniqueHosts` with `ScoredHistoryMatch`,
    //  but doing so is complicated as it's derived from parsing the default
    //  string value for the finch param `kHQPNumMatchesScoresRule`.
    constexpr size_t kMaxUniqueHosts = 4;
    if (unique_hosts.size() >= kMaxUniqueHosts)
      break;
  }
  num_unique_hosts = unique_hosts.size();

  for (HistoryID history_id : history_ids) {
    auto hist_pos = history_info_map_.find(history_id);
    const history::URLRow& hist_item = hist_pos->second.url_row;
    auto starts_pos = word_starts_map_.find(history_id);
    CHECK(starts_pos != word_starts_map_.end());

    bool is_highly_visited_host =
        !host_filter.empty() ||
        std::ranges::find(HighlyVisitedHosts(), hist_item.url().host()) !=
            HighlyVisitedHosts().end();
    ScoredHistoryMatch new_scored_match(
        hist_item, hist_pos->second.visits, lower_raw_string, lower_raw_terms,
        lower_terms_to_word_starts_offsets, starts_pos->second,
        bookmark_model && bookmark_model->IsBookmarked(hist_item.url()),
        num_unique_hosts, is_highly_visited_host, now);

    if (new_scored_match.raw_score_before_domain_boosting <
        new_scored_match.raw_score_after_domain_boosting) {
      triggered_feature_service->FeatureTriggered(
          metrics::OmniboxEventProto_Feature_DOMAIN_SUGGESTIONS);
    }

    // Filter new matches that ended up scoring 0. (These are usually matches
    // which didn't match the user's raw terms.)
    if (new_scored_match.raw_score > 0)
      scored_items->push_back(std::move(new_scored_match));
  }
}

// static
void URLIndexPrivateData::CalculateWordStartsOffsets(
    const String16Vector& lower_terms,
    WordStarts* lower_terms_to_word_starts_offsets) {
  // Calculate offsets for each term.  For instance, the offset for
  // ".net" should be 1, indicating that the actual word-part of the term
  // starts at offset 1.
  lower_terms_to_word_starts_offsets->resize(lower_terms.size(), 0u);
  for (size_t i = 0; i < lower_terms.size(); ++i) {
    TailoredWordBreakIterator iter(lower_terms[i]);
    // If the iterator doesn't work, assume an offset of 0.
    if (!iter.Init())
      continue;
    // Find the first word start. If the iterator didn't find a word break,
    // set an offset of term size. For example, the offset for "://" should be
    // 3, indicating that the word-part is missing.
    while (iter.Advance() && !iter.IsWord()) {
    }

    (*lower_terms_to_word_starts_offsets)[i] = iter.prev();
  }
}

bool URLIndexPrivateData::IndexRow(
    history::HistoryDatabase* history_db,
    history::HistoryService* history_service,
    const history::URLRow& row,
    const std::set<std::string>& scheme_allowlist,
    base::CancelableTaskTracker* tracker) {
  const GURL& gurl(row.url());

  // Index only URLs with an allowlisted scheme.
  if (!URLSchemeIsAllowlisted(gurl, scheme_allowlist))
    return false;

  const history::URLID row_id = row.id();
  // Strip out username and password before saving and indexing.
  const GURL new_url = ClearUsernameAndPassword(gurl);

  HistoryID history_id = static_cast<HistoryID>(row_id);
  DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());

  // Add the row for quick lookup in the history info store.
  history::URLRow new_row(new_url, row_id);
  new_row.set_visit_count(row.visit_count());
  new_row.set_typed_count(row.typed_count());
  new_row.set_last_visit(row.last_visit());
  new_row.set_title(row.title());

  // Index the words contained in the URL and title of the row.
  RowWordStarts word_starts;
  AddRowWordsToIndex(new_row, &word_starts);
  word_starts_map_[history_id] = std::move(word_starts);

  history_info_map_[history_id].url_row = std::move(new_row);

  // Update the recent visits information or schedule the update
  // as appropriate.
  if (history_db) {
    // We'd like to check that we're on the history DB thread.
    // However, unittest code actually calls this on the UI thread.
    // So we don't do any thread checks.
    history::VisitVector recent_visits;
    if (history_db->GetMostRecentVisitsForURL(row_id, kMaxVisitsToStoreInCache,
                                              &recent_visits))
      UpdateRecentVisits(row_id, recent_visits);
  } else if (history_service) {
    DCHECK(tracker);
    ScheduleUpdateRecentVisits(history_service, row_id, tracker);
  }

  // Increment `host_visits_` for and possibly add the host to
  // `highly_visited_hosts`.
  static const bool domain_suggestions_enabled =
      base::FeatureList::IsEnabled(omnibox::kDomainSuggestions);
  if (domain_suggestions_enabled) {
    auto& host_info = host_visits_[gurl.host()];
    const bool was_highly_visited = host_info.IsHighlyVisited();
    host_info.AddUrl(row);
    // If the host was already added to `highly_visited_hosts_`, no need to
    // re-add it.
    if (!was_highly_visited && host_info.IsHighlyVisited())
      highly_visited_hosts_.push_back(gurl.host());
  }

  return true;
}

void URLIndexPrivateData::AddRowWordsToIndex(const history::URLRow& row,
                                             RowWordStarts* word_starts) {
  HistoryID history_id = static_cast<HistoryID>(row.id());
  // Split URL into individual, unique words then add in the title words.
  const GURL& gurl(row.url());
  DCHECK(gurl.is_valid());
  const std::u16string& url =
      string_cleaning::CleanUpUrlForMatching(gurl, nullptr);
  String16Set url_words = String16SetFromString16(
      url, word_starts ? &word_starts->url_word_starts_ : nullptr);
  const std::u16string& title =
      string_cleaning::CleanUpTitleForMatching(row.title());
  String16Set title_words = String16SetFromString16(
      title, word_starts ? &word_starts->title_word_starts_ : nullptr);
  for (const auto& word :
       base::STLSetUnion<String16Set>(url_words, title_words))
    AddWordToIndex(word, history_id);

  search_term_cache_.clear();  // Invalidate the term cache.
}

void URLIndexPrivateData::AddWordToIndex(const std::u16string& term,
                                         HistoryID history_id) {
  auto [word_pos, is_new] = word_map_.insert(std::make_pair(term, WordID()));

  // Adding a new word (i.e. a word that is not already in the word index).
  if (is_new) {
    word_pos->second = AddNewWordToWordList(term);

    // For each character in the newly added word add the word to the character
    // index.
    for (char16_t uni_char : Char16SetFromString16(term))
      char_word_map_[uni_char].insert(word_pos->second);
  }

  word_id_history_map_[word_pos->second].insert(history_id);
  history_id_word_map_[history_id].insert(word_pos->second);
}

WordID URLIndexPrivateData::AddNewWordToWordList(const std::u16string& term) {
  WordID word_id = word_list_.size();
  if (available_words_.empty()) {
    word_list_.push_back(term);
    return word_id;
  }

  word_id = available_words_.top();
  available_words_.pop();
  word_list_[word_id] = term;
  return word_id;
}

void URLIndexPrivateData::RemoveRowFromIndex(const history::URLRow& row) {
  RemoveRowWordsFromIndex(row);
  HistoryID history_id = static_cast<HistoryID>(row.id());
  history_info_map_.erase(history_id);
  word_starts_map_.erase(history_id);
}

void URLIndexPrivateData::RemoveRowWordsFromIndex(const history::URLRow& row) {
  // Remove the entries in history_id_word_map_ and word_id_history_map_ for
  // this row.
  HistoryID history_id = static_cast<HistoryID>(row.id());
  WordIDSet word_id_set = history_id_word_map_[history_id];
  history_id_word_map_.erase(history_id);

  // Reconcile any changes to word usage.
  for (WordID word_id : word_id_set) {
    auto word_id_history_map_iter = word_id_history_map_.find(word_id);
    CHECK(word_id_history_map_iter != word_id_history_map_.end());

    word_id_history_map_iter->second.erase(history_id);
    if (!word_id_history_map_iter->second.empty())
      continue;

    // The word is no longer in use. Reconcile any changes to character usage.
    const std::u16string& word = word_list_[word_id];
    for (char16_t uni_char : Char16SetFromString16(word)) {
      auto char_word_map_iter = char_word_map_.find(uni_char);
      char_word_map_iter->second.erase(word_id);
      if (char_word_map_iter->second.empty())
        char_word_map_.erase(char_word_map_iter);
    }

    // Complete the removal of references to the word.
    word_id_history_map_.erase(word_id_history_map_iter);
    word_map_.erase(word);
    word_list_[word_id] = std::u16string();
    available_words_.push(word_id);
  }
}

void URLIndexPrivateData::ResetSearchTermCache() {
  for (auto& item : search_term_cache_)
    item.second.used_ = false;
}

// static
bool URLIndexPrivateData::URLSchemeIsAllowlisted(
    const GURL& gurl,
    const std::set<std::string>& allowlist) {
  return allowlist.find(gurl.scheme()) != allowlist.end();
}

bool URLIndexPrivateData::ShouldExclude(
    const HistoryID history_id,
    const std::string& host_filter,
    const TemplateURLService* template_url_service) const {
  auto hist_pos = history_info_map_.find(history_id);
  if (hist_pos == history_info_map_.end())
    return true;

  GURL url = hist_pos->second.url_row.url();
  if (!url.is_valid())  // Possible in case of profile corruption.
    return true;

  if (!host_filter.empty() && url.host() != host_filter)
    return true;

  // Skip results corresponding to queries from the default search engine.
  // These are low-quality, difficult-to-understand matches for users.
  // SearchProvider should surface past queries in a better way.
  const TemplateURL* default_search_engine =
      template_url_service ? template_url_service->GetDefaultSearchProvider()
                           : nullptr;
  return default_search_engine &&
         default_search_engine->IsSearchURL(
             url, template_url_service->search_terms_data());
}

// SearchTermCacheItem ---------------------------------------------------------

URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
    const WordIDSet& word_id_set,
    const HistoryIDSet& history_id_set)
    : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) {}

URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {}

URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
    const SearchTermCacheItem& other) = default;

size_t URLIndexPrivateData::SearchTermCacheItem::EstimateMemoryUsage() const {
  return base::trace_event::EstimateMemoryUsage(word_id_set_) +
         base::trace_event::EstimateMemoryUsage(history_id_set_);
}

// static
std::pair<String16Vector, WordStarts>
URLIndexPrivateData::GetTermsAndWordStartsOffsets(
    const std::u16string& lower_raw_string) {
  String16Vector lower_raw_terms =
      base::SplitString(lower_raw_string, base::kWhitespaceUTF16,
                        base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
  if (lower_raw_terms.empty()) {
    return {String16Vector(), WordStarts()};
  }

  WordStarts lower_terms_to_word_starts_offsets;
  CalculateWordStartsOffsets(lower_raw_terms,
                             &lower_terms_to_word_starts_offsets);
  return {std::move(lower_raw_terms),
          std::move(lower_terms_to_word_starts_offsets)};
}

URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() = default;

// URLIndexPrivateData::HistoryItemFactorGreater -------------------------------

URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
    const HistoryInfoMap& history_info_map)
    : history_info_map_(history_info_map) {}

URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() =
    default;

bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
    const HistoryID h1,
    const HistoryID h2) {
  auto entry1(history_info_map_->find(h1));
  if (entry1 == history_info_map_->end()) {
    return false;
  }
  auto entry2(history_info_map_->find(h2));
  if (entry2 == history_info_map_->end()) {
    return true;
  }
  const history::URLRow& r1(entry1->second.url_row);
  const history::URLRow& r2(entry2->second.url_row);
  // First cut: typed count, visit count, recency.
  // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
  // recently visited (within the last 12/24 hours) as highly important. Get
  // input from mpearson.
  if (r1.typed_count() != r2.typed_count())
    return (r1.typed_count() > r2.typed_count());
  if (r1.visit_count() != r2.visit_count())
    return (r1.visit_count() > r2.visit_count());
  return (r1.last_visit() > r2.last_visit());
}

// HostInfo --------------------------------------------------------------------

bool URLIndexPrivateData::HostInfo::IsHighlyVisited() const {
  static const int visited_urls_threshold =
      OmniboxFieldTrial::kDomainSuggestionsTypedUrlsThreshold.Get();
  static const int typed_visit_threshold =
      OmniboxFieldTrial::kDomainSuggestionsTypedVisitThreshold.Get();

  return typed_urls_ >= visited_urls_threshold &&
         typed_visits_ >= typed_visit_threshold;
}

void URLIndexPrivateData::HostInfo::AddUrl(const history::URLRow& row) {
  static const int visited_urls_offset =
      OmniboxFieldTrial::kDomainSuggestionsTypedUrlsOffset.Get();
  static const int typed_visit_offset =
      OmniboxFieldTrial::kDomainSuggestionsTypedVisitOffset.Get();
  static const int typed_visit_cap_per_visit =
      OmniboxFieldTrial::kDomainSuggestionsTypedVisitCapPerVisit.Get();

  if (row.typed_count() >= visited_urls_offset)
    typed_urls_++;

  typed_visits_ += std::clamp(row.typed_count() - typed_visit_offset, 0,
                              typed_visit_cap_per_visit);
}