File: prefix_matcher.cc

// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chromeos/ash/components/string_matching/prefix_matcher.h"

#include <cstddef>
#include <queue>
#include <string>

#include "base/check.h"
#include "base/containers/flat_map.h"
#include "base/strings/strcat.h"
#include "chromeos/ash/components/string_matching/tokenized_string.h"

namespace ash::string_matching {
namespace {
using prefix_matcher_constants::kIsFrontOfTokenCharScore;
using prefix_matcher_constants::kIsPrefixCharScore;
using prefix_matcher_constants::kIsWeakHitCharScore;
using prefix_matcher_constants::kNoMatchScore;

}  // namespace

MatchInfo::MatchInfo() = default;
MatchInfo::~MatchInfo() = default;

// PrefixMatcher:
//
// The factors below are applied when the current char of the query matches
// the current char of the text to be matched. Different factors are chosen
// based on where the match happens:
//
// 1) `kIsPrefixCharScore` is used when the matched portion is a prefix of both
// the query and the text, which implies that the matched chars are at the same
// position in the query and the text. This is the most preferred case and
// thus has the highest score.
//
// 2) `kIsFrontOfTokenCharScore` will be used if the first char of the token
// matches the current char of the query.
//
// 3) Otherwise, the match is considered weak, and `kIsWeakHitCharScore` is
// used.
//
// Examples:
//
//   For text: 'Google Chrome'.
//
//   Query 'go' would yield kIsPrefixCharScore for each char.
//   Query 'ch' would use kIsFrontOfTokenCharScore for 'c' and
//       kIsWeakHitCharScore for 'h'.
//
// kNoMatchScore is a relevance score that represents no match.
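//
// Example usage (illustrative sketch only; assumes TokenizedString can be
// constructed directly from a std::u16string, and that relevance()/hits()
// accessors exposing the relevance_/hits_ members exist on PrefixMatcher):
//
//   TokenizedString query(u"google ch");
//   TokenizedString text(u"Google Chrome");
//   PrefixMatcher matcher(query, text);
//   if (matcher.Match()) {
//     double score = matcher.relevance();  // Combined per-char scores.
//     // matcher.hits() holds the matched ranges within the text.
//   }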

PrefixMatcher::PrefixMatcher(const TokenizedString& query,
                             const TokenizedString& text)
    : query_(query), text_(text) {}
PrefixMatcher::~PrefixMatcher() = default;

bool PrefixMatcher::Match() {
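  // Run both matching strategies and keep whichever yields the higher
  // relevance. A strictly positive relevance indicates that a match was found.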
  MatchInfo sentence_match_info;
  SentencePrefixMatch(sentence_match_info);
  MatchInfo token_match_info;
  TokenPrefixMatch(token_match_info);

  MatchInfo& better_match =
      sentence_match_info.relevance >= token_match_info.relevance
          ? sentence_match_info
          : token_match_info;
  relevance_ = better_match.relevance;
  hits_ = better_match.hits;
  return relevance_ > 0.0;
}

void PrefixMatcher::SentencePrefixMatch(MatchInfo& sentence_match_info) {
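  // Sentence prefix matching: concatenate the query tokens and the text
  // tokens into two whitespace-free strings, then look for the query string
  // as a substring of the text string, requiring the match to start at a
  // token boundary. For example, for the text 'Google Chrome', the query
  // 'google ch' becomes 'googlech', which matches the front of
  // 'googlechrome'.
  //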
  // Since we are concatenating the tokens, we do not have whitespace
  // separation, and so we have to be careful with index calculation later on.
  std::u16string query_sentence = base::StrCat(query_->tokens());
  std::u16string text_sentence = base::StrCat(text_->tokens());

  // Queue to store the starting index of each token in the `text_sentence`.
  std::queue<size_t> text_token_indexes;
  size_t token_index = 0;
  for (const auto& text_token : text_->tokens()) {
    text_token_indexes.push(token_index);
    token_index += text_token.size();
  }

  while (!text_token_indexes.empty()) {
    size_t start_index = text_token_indexes.front();
    size_t matched_index = text_sentence.find(query_sentence, start_index);
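    // If `query_sentence` is not found, find() returns npos, which is larger
    // than any queued token index, so the loop below drains the queue and the
    // outer loop exits without recording a match.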

    while (!text_token_indexes.empty() &&
           text_token_indexes.front() < matched_index) {
      text_token_indexes.pop();
    }

    if (text_token_indexes.empty() ||
        text_token_indexes.front() > matched_index) {
      continue;
    }

    // The found match begins at the starting index of a token: calculate the
    // `relevance` score and `hits`, then return.
    DCHECK(text_token_indexes.front() == matched_index);
    size_t text_pos = text_->tokens().size() - text_token_indexes.size();
    size_t text_sentence_end_pos = matched_index + query_sentence.size();

    sentence_match_info.relevance = 0.0;
    sentence_match_info.current_match.set_start(
        text_->mappings()[text_pos].start());

    while (!text_token_indexes.empty() &&
           text_token_indexes.front() < text_sentence_end_pos) {
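      // If the match starts at the very beginning of the sentence
      // (matched_index == 0), every matched char scores kIsPrefixCharScore.
      // Otherwise the first char of each covered token scores
      // kIsFrontOfTokenCharScore and its remaining chars kIsWeakHitCharScore.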
      // Text size may be smaller than the token size as prefix matching is
      // allowed for the last token.
      size_t text_size =
          std::min(text_->tokens()[text_pos].size(),
                   text_sentence_end_pos - text_token_indexes.front());
      sentence_match_info.relevance +=
          matched_index == 0 ? kIsPrefixCharScore * text_size
                             : kIsFrontOfTokenCharScore +
                                   kIsWeakHitCharScore * (text_size - 1);
      sentence_match_info.current_match.set_end(
          text_->mappings()[text_pos].start() + text_size);

      ++text_pos;
      text_token_indexes.pop();
    }
    sentence_match_info.hits.push_back(sentence_match_info.current_match);
    return;
  }
}

void PrefixMatcher::TokenPrefixMatch(MatchInfo& token_match_info) {
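  // Token prefix matching: every query token except the last must exactly
  // equal some text token (duplicate query tokens are consumed in order),
  // while the last query token only needs to be a prefix of a text token.
  // Each matched token's contribution is added in
  // UpdateInfoForTokenPrefixMatch().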
  const size_t num_query_token = query_->tokens().size();
  const size_t num_text_token = text_->tokens().size();

  // Not a prefix match if `query_` contains more tokens than `text_`, or if
  // `query_` is empty.
  if (query_->tokens().empty() || num_query_token > num_text_token) {
    return;
  }

  // We use `query_map` to allow fast matching between query tokens and text
  // tokens. Each queue stores query token indexes and ensures that tokens are
  // matched in sequence when the same token appears multiple times. The map
  // does not store the last query token, since that token is allowed to match
  // only a prefix of a text token.
  base::flat_map<std::u16string, std::queue<size_t>> query_map;
  for (size_t i = 0; i < num_query_token - 1; ++i) {
    query_map[query_->tokens()[i]].emplace(i);
  }
  const std::u16string last_query_token = query_->tokens()[num_query_token - 1];

  size_t matched_num = 0;
  bool last_query_token_matched = false;
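  // Walk the text tokens, consuming query tokens as they match; the search
  // succeeds once every query token has been matched.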
  for (size_t text_pos = 0; text_pos < num_text_token; ++text_pos) {
    const std::u16string text_token = text_->tokens()[text_pos];

    if (query_map.contains(text_token)) {
      DCHECK(!query_map[text_token].empty());
      size_t query_pos = query_map[text_token].front();

      query_map[text_token].pop();
      if (query_map[text_token].empty())
        query_map.erase(text_token);

      UpdateInfoForTokenPrefixMatch(query_pos, text_pos, token_match_info);
      ++matched_num;
    }
    // The last query token can be matched at most once.
    else if (!last_query_token_matched &&
             last_query_token.size() <= text_token.size() &&
             text_token.compare(0, last_query_token.size(), last_query_token) ==
                 0) {
      // This case handles an incomplete last query.
      // Example:
      //   For text: 'Google Chrome'.
      //
      //   Query 'Google Ch' is also a match as the query token 'ch' is the
      //   prefix of the text token 'chrome'.
      UpdateInfoForTokenPrefixMatch(num_query_token - 1, text_pos,
                                    token_match_info);
      ++matched_num;
      last_query_token_matched = true;
    }

    if (matched_num == num_query_token) {
      DCHECK(token_match_info.current_match.IsValid());
      token_match_info.hits.push_back(token_match_info.current_match);
      return;
    }
  }
  token_match_info.relevance = kNoMatchScore;
}

void PrefixMatcher::UpdateInfoForTokenPrefixMatch(size_t query_pos,
                                                  size_t text_pos,
                                                  MatchInfo& token_match_info) {
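  // Record a single token match at (query_pos, text_pos): extend the current
  // hit range if this match continues the previous one, otherwise close that
  // range and start a new one, then add this token's score contribution.
  // `is_front` is cleared on the first discontinuity, so only an
  // uninterrupted run of matches keeps earning kIsPrefixCharScore.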
  const size_t hit_start_pos = text_->mappings()[text_pos].start();
  const size_t hit_end_pos =
      text_->mappings()[text_pos].start() + query_->tokens()[query_pos].size();

  // Update the hits information.
  if (query_pos != token_match_info.last_query_pos + 1 ||
      text_pos != token_match_info.last_text_pos + 1) {
    token_match_info.is_front = false;

    // When the match is not continuous and we have a valid `current_match`,
    // push it to `hits` and start a new match.
    if (token_match_info.current_match.IsValid()) {
      token_match_info.hits.push_back(token_match_info.current_match);
      token_match_info.current_match = gfx::Range::InvalidRange();
    }
  }
  // The match is continuous if `current_match` is still valid. Update only
  // the end position for a continuous match, and update both the start and
  // end positions otherwise.
  if (!token_match_info.current_match.IsValid())
    token_match_info.current_match.set_start(hit_start_pos);
  token_match_info.current_match.set_end(hit_end_pos);

  // Update relevance score.
  token_match_info.relevance +=
      token_match_info.is_front
          ? kIsPrefixCharScore * query_->tokens().at(query_pos).size()
          : kIsFrontOfTokenCharScore +
                kIsWeakHitCharScore *
                    (query_->tokens().at(query_pos).size() - 1);
  token_match_info.last_query_pos = query_pos;
  token_match_info.last_text_pos = text_pos;
}

}  // namespace ash::string_matching