File: history_fuzzy_provider.h

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (268 lines) | stat: -rw-r--r-- 11,255 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_OMNIBOX_BROWSER_HISTORY_FUZZY_PROVIDER_H_
#define COMPONENTS_OMNIBOX_BROWSER_HISTORY_FUZZY_PROVIDER_H_

#include <array>
#include <memory>
#include <unordered_map>
#include <vector>

#include "base/memory/weak_ptr.h"
#include "base/scoped_observation.h"
#include "base/synchronization/waitable_event.h"
#include "base/task/cancelable_task_tracker.h"
#include "components/history/core/browser/history_service.h"
#include "components/history/core/browser/history_service_observer.h"
#include "components/history/core/browser/history_types.h"
#include "components/omnibox/browser/autocomplete_input.h"
#include "components/omnibox/browser/autocomplete_match.h"
#include "components/omnibox/browser/autocomplete_result.h"
#include "components/omnibox/browser/history_provider.h"

// This namespace encapsulates the implementation details of fuzzy matching and
// correction. It is used by the public (non-namespaced) HistoryFuzzyProvider
// below.
namespace fuzzy {

// An `Edit` represents a single character change to a string.
struct Edit {
  // The kind of change to apply to text. Note, KEEP essentially means
  // no edit, and will never be applied or kept as part of a `Correction`.
  enum class Kind {
    KEEP,
    DELETE,
    INSERT,
    REPLACE,
    TRANSPOSE,
  };

  Edit(Kind kind, size_t at, char16_t new_char);

  // Applies this edit to given text, mutating it in place.
  void ApplyTo(std::u16string& text) const;

  // The edit operation, the kind of change to make to text.
  Kind kind;

  // Character data; relevant for REPLACE and INSERT, and also for
  // TRANSPOSE (as a minor optimization, first char is stored here).
  char16_t new_char;

  // Text index at which to apply change.
  size_t at;
};

// A `Correction` is a short list of `Edit`s required to change
// an input string that escapes the trie into a string that is contained by it.
struct Correction {
  // Tolerance schedules must use a `limit` of no more than `kMaxEdits`.
  constexpr static int kMaxEdits = 3;

  // Creates a new correction including this one plus a given `Edit`.
  Correction WithEdit(Edit edit) const;

  // Applies this edit to given text, mutating it in place.
  void ApplyTo(std::u16string& text) const;

  // Number of edits to apply for the full correction.
  size_t edit_count = 0;

  // The actual edits to apply; only the first `edit_count` elements are valid.
  // This is a fixed-size in-struct array instead of a vector because
  // finding and building corrections is performance critical and keeping this
  // struct simple on the stack is much faster than pounding on the allocator.
  std::array<Edit, kMaxEdits> edits = {Edit{Edit::Kind::KEEP, 0, '_'},
                                       Edit{Edit::Kind::KEEP, 0, '_'},
                                       Edit{Edit::Kind::KEEP, 0, '_'}};
};

// This utility struct defines how tolerance changes across the length
// of input text being processed.
// Example: this schedule `{ .start_index = 1, .step_length = 4, .limit = 3 }`
// means the first character must match, then starting from the second
// character, one correction is tolerated per four characters, up to a maximum
// of three total corrections.
struct ToleranceSchedule {
  int ToleranceAt(int index) {
    if (index < start_index) {
      return 0;
    }
    if (step_length <= 0) {
      return limit;
    }
    return std::min(limit, 1 + (index - start_index) / step_length);
  }

  // Index at which tolerance is allowed to exceed zero.
  int start_index = 0;

  // Number of index steps between successive tolerance increases.
  // When nonpositive, the `limit` value is used directly instead of stepping.
  int step_length = 0;

  // Regardless of index, tolerance will not exceed this limit.
  // Note, `limit` must not exceed `Correction::kMaxEdits`.
  int limit = 0;
};

// Nodes form a trie structure used to find potential input corrections.
struct Node {
  Node();
  Node(Node&& node);
  Node& operator=(Node&&) = default;
  ~Node();

  // Walk the trie, injecting nodes as necessary to build the given `text`
  // starting at `text_index`. The `text_index` parameter advances as an index
  // into `text` and ensures recursion is bounded.
  void Insert(const std::u16string& text, size_t text_index);

  // Delete nodes as necessary to remove given `text` from the trie.
  void Delete(const std::u16string& text, size_t text_index);

  // Delete all nodes to clear the trie.
  void Clear();

  // Produce corrections necessary to get `text` back on trie. Each correction
  // will be of size bounded by `tolerance_schedule`, and none will have smaller
  // edit distance than any other (i.e. all corrections are equally optimal).
  // Returns whether input `text` starting at `from` is present in this trie.
  //  - true without corrections -> `text` on trie.
  //  - false without corrections -> cannot complete on trie within schedule.
  //  - true with corrections -> never happens because `text` is on trie.
  //  - false with corrections -> `text` off trie but corrections are on trie.
  // Note: For efficiency, not all possible corrections are returned; any found
  // valid corrections will preclude further more elaborate subcorrections.
  bool FindCorrections(const std::u16string& text,
                       ToleranceSchedule tolerance_schedule,
                       std::vector<Correction>& corrections) const;

  // Estimates dynamic memory usage.
  // See base/trace_event/memory_usage_estimator.h for more info.
  size_t EstimateMemoryUsage() const;

  // Returns number of terminals contained within this trie (may include self).
  int TerminalCount() const;

  // This is used to distinguish terminal nodes in the trie (nonzero values).
  int relevance = 0;

  // This maintains the sum of `relevance` plus all `relevance_total` values
  // contained within `next`. As long as `relevance` values are 0 or 1, this can
  // be used as a count of contained terminals. When it drops to zero, the
  // node may be deleted from the trie.
  int relevance_total = 0;

  // Note: Some C++ implementations of unordered_map support using the
  // containing struct (Node) as the element type, but some do not. To avoid
  // potential build issues in downstream projects that use Chromium code,
  // ensure the element type is of known size (a fully declared type).
  std::unordered_map<char16_t, std::unique_ptr<Node>> next;
};

}  // namespace fuzzy

// This class is an autocomplete provider which provides URL results from
// history for inputs that may match inexactly.
// The mechanism that makes it "fuzzy" is a trie search that tolerates errors
// and produces corrected inputs which can then be autocompleted as normal.
// Relevance penalties are applied for corrections as errors reduce confidence.
// The trie is built from history URL text and any text that can be formed
// by tracing a path from the root is said to be "on trie" while any text
// that escapes the trie with a character not present is said to be "off trie".
// If the autocomplete input is fully on trie, no suggestions are provided
// because exact matching should already provide the best results. If the
// autocomplete input is off trie, corrections of bounded size are produced
// to get the input back on trie, and these should then be eligible for
// autocompletion.
// The data structure could contain anything helpful, including ways to mark
// terminal nodes (signaling a path as a complete string). A relevance score
// could serve this purpose and make full independent autocompletion possible,
// but efficiency is a top concern so node size is minimized, just enough to
// get inputs back on track, not to replicate the full completion and scoring
// of other autocomplete providers.
class HistoryFuzzyProvider : public HistoryProvider,
                             public history::HistoryServiceObserver {
 public:
  // Records fuzzy matching related metrics when user opens a match.
  static void RecordOpenMatchMetrics(const AutocompleteResult& result,
                                     const AutocompleteMatch& match_opened);

  explicit HistoryFuzzyProvider(AutocompleteProviderClient* client);
  HistoryFuzzyProvider(const HistoryFuzzyProvider&) = delete;
  HistoryFuzzyProvider& operator=(const HistoryFuzzyProvider&) = delete;

  // HistoryProvider:
  // AutocompleteProvider. `minimal_changes` is ignored since there is no async
  // completion performed.
  void Start(const AutocompleteInput& input, bool minimal_changes) override;

  // Estimates dynamic memory usage.
  // See base/trace_event/memory_usage_estimator.h for more info.
  size_t EstimateMemoryUsage() const override;

 private:
  ~HistoryFuzzyProvider() override;

  // Performs the autocomplete matching and scoring.
  void DoAutocomplete();

  // Add the best matches, converting them to fuzzy suggestions in the process.
  // The given `penalty` is a percentage relevance penalty that will
  // deduct from the relevance of each match.
  // Returns the number of matches actually added.
  int AddConvertedMatches(const ACMatches& matches, int penalty);

  // Main thread callback to receive trie of URLs loaded from database.
  void OnUrlsLoaded(fuzzy::Node node);

  // history::HistoryServiceObserver:
  // Adds visited URL host to trie.
  void OnURLVisited(history::HistoryService* history_service,
                    const history::URLRow& url_row,
                    const history::VisitRow& new_visit) override;

  // Removes deleted (or all) URLs from trie.
  void OnHistoryDeletions(history::HistoryService* history_service,
                          const history::DeletionInfo& deletion_info) override;

  // Record UMA histogram data for measuring usefulness of sub-providers.
  void RecordMatchConversion(const char* name, int count);

  AutocompleteInput autocomplete_input_;

  // This is the trie facilitating search for input alternatives.
  fuzzy::Node root_;

  // This provides a thread-safe way to check that loading has completed.
  // Keeping the event may not be necessary since signal check is done from
  // same main thread, but having it should provide some robustness in case
  // we autocomplete from another thread or while db task is running.
  base::WaitableEvent urls_loaded_event_;

  // Task tracker for URL data loading.
  base::CancelableTaskTracker task_tracker_;

  // Tracks the observed history service, for cleanup.
  base::ScopedObservation<history::HistoryService,
                          history::HistoryServiceObserver>
      history_service_observation_{this};

  // This threshold determines the input length at which fuzzy suggestions
  // will start being searched and generated. Shorter inputs won't be checked.
  size_t min_input_length_;

  // These are tunable parameters that affect the fuzzy suggestion
  // relevance penalty.
  int penalty_low_;
  int penalty_high_;
  size_t penalty_taper_length_;

  // Weak pointer factory for callback binding safety.
  base::WeakPtrFactory<HistoryFuzzyProvider> weak_ptr_factory_{this};
};

#endif  // COMPONENTS_OMNIBOX_BROWSER_HISTORY_FUZZY_PROVIDER_H_