1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268
|
// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_OMNIBOX_BROWSER_HISTORY_FUZZY_PROVIDER_H_
#define COMPONENTS_OMNIBOX_BROWSER_HISTORY_FUZZY_PROVIDER_H_
#include <array>
#include <memory>
#include <unordered_map>
#include <vector>
#include "base/memory/weak_ptr.h"
#include "base/scoped_observation.h"
#include "base/synchronization/waitable_event.h"
#include "base/task/cancelable_task_tracker.h"
#include "components/history/core/browser/history_service.h"
#include "components/history/core/browser/history_service_observer.h"
#include "components/history/core/browser/history_types.h"
#include "components/omnibox/browser/autocomplete_input.h"
#include "components/omnibox/browser/autocomplete_match.h"
#include "components/omnibox/browser/autocomplete_result.h"
#include "components/omnibox/browser/history_provider.h"
// This namespace encapsulates the implementation details of fuzzy matching and
// correction. It is used by the public (non-namespaced) HistoryFuzzyProvider
// below.
namespace fuzzy {
// An `Edit` represents a single character change to a string.
struct Edit {
// The kind of change to apply to text. Note, KEEP essentially means
// no edit, and will never be applied or kept as part of a `Correction`.
enum class Kind {
KEEP,
DELETE,
INSERT,
REPLACE,
TRANSPOSE,
};
Edit(Kind kind, size_t at, char16_t new_char);
// Applies this edit to given text, mutating it in place.
void ApplyTo(std::u16string& text) const;
// The edit operation, the kind of change to make to text.
Kind kind;
// Character data; relevant for REPLACE and INSERT, and also for
// TRANSPOSE (as a minor optimization, first char is stored here).
char16_t new_char;
// Text index at which to apply change.
size_t at;
};
// A `Correction` is a short list of `Edit`s required to change
// an input string that escapes the trie into a string that is contained by it.
struct Correction {
// Tolerance schedules must use a `limit` of no more than `kMaxEdits`.
constexpr static int kMaxEdits = 3;
// Creates a new correction including this one plus a given `Edit`.
Correction WithEdit(Edit edit) const;
// Applies this edit to given text, mutating it in place.
void ApplyTo(std::u16string& text) const;
// Number of edits to apply for the full correction.
size_t edit_count = 0;
// The actual edits to apply; only the first `edit_count` elements are valid.
// This is a fixed-size in-struct array instead of a vector because
// finding and building corrections is performance critical and keeping this
// struct simple on the stack is much faster than pounding on the allocator.
std::array<Edit, kMaxEdits> edits = {Edit{Edit::Kind::KEEP, 0, '_'},
Edit{Edit::Kind::KEEP, 0, '_'},
Edit{Edit::Kind::KEEP, 0, '_'}};
};
// This utility struct defines how tolerance changes across the length
// of input text being processed.
// Example: this schedule `{ .start_index = 1, .step_length = 4, .limit = 3 }`
// means the first character must match, then starting from the second
// character, one correction is tolerated per four characters, up to a maximum
// of three total corrections.
struct ToleranceSchedule {
int ToleranceAt(int index) {
if (index < start_index) {
return 0;
}
if (step_length <= 0) {
return limit;
}
return std::min(limit, 1 + (index - start_index) / step_length);
}
// Index at which tolerance is allowed to exceed zero.
int start_index = 0;
// Number of index steps between successive tolerance increases.
// When nonpositive, the `limit` value is used directly instead of stepping.
int step_length = 0;
// Regardless of index, tolerance will not exceed this limit.
// Note, `limit` must not exceed `Correction::kMaxEdits`.
int limit = 0;
};
// Nodes form a trie structure used to find potential input corrections.
struct Node {
Node();
Node(Node&& node);
Node& operator=(Node&&) = default;
~Node();
// Walk the trie, injecting nodes as necessary to build the given `text`
// starting at `text_index`. The `text_index` parameter advances as an index
// into `text` and ensures recursion is bounded.
void Insert(const std::u16string& text, size_t text_index);
// Delete nodes as necessary to remove given `text` from the trie.
void Delete(const std::u16string& text, size_t text_index);
// Delete all nodes to clear the trie.
void Clear();
// Produce corrections necessary to get `text` back on trie. Each correction
// will be of size bounded by `tolerance_schedule`, and none will have smaller
// edit distance than any other (i.e. all corrections are equally optimal).
// Returns whether input `text` starting at `from` is present in this trie.
// - true without corrections -> `text` on trie.
// - false without corrections -> cannot complete on trie within schedule.
// - true with corrections -> never happens because `text` is on trie.
// - false with corrections -> `text` off trie but corrections are on trie.
// Note: For efficiency, not all possible corrections are returned; any found
// valid corrections will preclude further more elaborate subcorrections.
bool FindCorrections(const std::u16string& text,
ToleranceSchedule tolerance_schedule,
std::vector<Correction>& corrections) const;
// Estimates dynamic memory usage.
// See base/trace_event/memory_usage_estimator.h for more info.
size_t EstimateMemoryUsage() const;
// Returns number of terminals contained within this trie (may include self).
int TerminalCount() const;
// This is used to distinguish terminal nodes in the trie (nonzero values).
int relevance = 0;
// This maintains the sum of `relevance` plus all `relevance_total` values
// contained within `next`. As long as `relevance` values are 0 or 1, this can
// be used as a count of contained terminals. When it drops to zero, the
// node may be deleted from the trie.
int relevance_total = 0;
// Note: Some C++ implementations of unordered_map support using the
// containing struct (Node) as the element type, but some do not. To avoid
// potential build issues in downstream projects that use Chromium code,
// ensure the element type is of known size (a fully declared type).
std::unordered_map<char16_t, std::unique_ptr<Node>> next;
};
} // namespace fuzzy
// This class is an autocomplete provider which provides URL results from
// history for inputs that may match inexactly.
// The mechanism that makes it "fuzzy" is a trie search that tolerates errors
// and produces corrected inputs which can then be autocompleted as normal.
// Relevance penalties are applied for corrections as errors reduce confidence.
// The trie is built from history URL text and any text that can be formed
// by tracing a path from the root is said to be "on trie" while any text
// that escapes the trie with a character not present is said to be "off trie".
// If the autocomplete input is fully on trie, no suggestions are provided
// because exact matching should already provide the best results. If the
// autocomplete input is off trie, corrections of bounded size are produced
// to get the input back on trie, and these should then be eligible for
// autocompletion.
// The data structure could contain anything helpful, including ways to mark
// terminal nodes (signaling a path as a complete string). A relevance score
// could serve this purpose and make full independent autocompletion possible,
// but efficiency is a top concern so node size is minimized, just enough to
// get inputs back on track, not to replicate the full completion and scoring
// of other autocomplete providers.
class HistoryFuzzyProvider : public HistoryProvider,
public history::HistoryServiceObserver {
public:
// Records fuzzy matching related metrics when user opens a match.
static void RecordOpenMatchMetrics(const AutocompleteResult& result,
const AutocompleteMatch& match_opened);
explicit HistoryFuzzyProvider(AutocompleteProviderClient* client);
HistoryFuzzyProvider(const HistoryFuzzyProvider&) = delete;
HistoryFuzzyProvider& operator=(const HistoryFuzzyProvider&) = delete;
// HistoryProvider:
// AutocompleteProvider. `minimal_changes` is ignored since there is no async
// completion performed.
void Start(const AutocompleteInput& input, bool minimal_changes) override;
// Estimates dynamic memory usage.
// See base/trace_event/memory_usage_estimator.h for more info.
size_t EstimateMemoryUsage() const override;
private:
~HistoryFuzzyProvider() override;
// Performs the autocomplete matching and scoring.
void DoAutocomplete();
// Add the best matches, converting them to fuzzy suggestions in the process.
// The given `penalty` is a percentage relevance penalty that will
// deduct from the relevance of each match.
// Returns the number of matches actually added.
int AddConvertedMatches(const ACMatches& matches, int penalty);
// Main thread callback to receive trie of URLs loaded from database.
void OnUrlsLoaded(fuzzy::Node node);
// history::HistoryServiceObserver:
// Adds visited URL host to trie.
void OnURLVisited(history::HistoryService* history_service,
const history::URLRow& url_row,
const history::VisitRow& new_visit) override;
// Removes deleted (or all) URLs from trie.
void OnHistoryDeletions(history::HistoryService* history_service,
const history::DeletionInfo& deletion_info) override;
// Record UMA histogram data for measuring usefulness of sub-providers.
void RecordMatchConversion(const char* name, int count);
AutocompleteInput autocomplete_input_;
// This is the trie facilitating search for input alternatives.
fuzzy::Node root_;
// This provides a thread-safe way to check that loading has completed.
// Keeping the event may not be necessary since signal check is done from
// same main thread, but having it should provide some robustness in case
// we autocomplete from another thread or while db task is running.
base::WaitableEvent urls_loaded_event_;
// Task tracker for URL data loading.
base::CancelableTaskTracker task_tracker_;
// Tracks the observed history service, for cleanup.
base::ScopedObservation<history::HistoryService,
history::HistoryServiceObserver>
history_service_observation_{this};
// This threshold determines the input length at which fuzzy suggestions
// will start being searched and generated. Shorter inputs won't be checked.
size_t min_input_length_;
// These are tunable parameters that affect the fuzzy suggestion
// relevance penalty.
int penalty_low_;
int penalty_high_;
size_t penalty_taper_length_;
// Weak pointer factory for callback binding safety.
base::WeakPtrFactory<HistoryFuzzyProvider> weak_ptr_factory_{this};
};
#endif // COMPONENTS_OMNIBOX_BROWSER_HISTORY_FUZZY_PROVIDER_H_
|