1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
|
// Copyright 2019 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chromeos/ash/components/string_matching/sequence_matcher.h"
#include <algorithm>
#include <cmath>
#include <queue>
#include "base/check_op.h"
namespace ash::string_matching {
namespace {
using Match = SequenceMatcher::Match;
using Matches = std::vector<Match>;
// The maximum recognized text size if text-length agnosticism is applied.
constexpr int kTextAgnosticismSize = 9;
bool CompareMatches(const Match& m1, const Match& m2) {
return m1.pos_first_string < m2.pos_first_string;
}
} // namespace
SequenceMatcher::Match::Match() = default;
SequenceMatcher::Match::Match(int pos_first, int pos_second, int len)
: pos_first_string(pos_first), pos_second_string(pos_second), length(len) {
DCHECK_GE(pos_first_string, 0);
DCHECK_GE(pos_second_string, 0);
DCHECK_GE(length, 0);
}
SequenceMatcher::SequenceMatcher(const std::u16string& first_string,
const std::u16string& second_string,
double num_matching_blocks_penalty)
: first_string_(first_string),
second_string_(second_string),
num_matching_blocks_penalty_(num_matching_blocks_penalty),
dp_common_string_(second_string.size() + 1, 0) {
if (first_string_.empty() && second_string_.empty()) {
block_matching_ratio_ = 0;
}
for (size_t i = 0; i < second_string_.size(); i++) {
char_to_positions_[second_string_[i]].emplace_back(i);
}
}
SequenceMatcher::~SequenceMatcher() = default;
// Compute the longest common substring, with optimisations for:
//
// 1) Time: By pre-computing some letter positions (stored in
// `char_to_positions_`.
//
// 2) Memory: Store only the latest row of the DP table (in
// `dp_common_string_`).
//
// 3) Time: Fast-update `dp_common_string_`.
Match SequenceMatcher::FindLongestMatch(int first_start,
int first_end,
int second_start,
int second_end) {
Match match(first_start, second_start, 0);
// These two vectors are used for fast updating of `dp_common_string_`.
// Only erase or update values which are known to have been changed.
//
// `dp_values_to_erase` contains the values which should be erased from
// `dp_common_string_`.
// `dp_values_to_affect` contains the values which should be updated in
// `dp_common_string_`.
std::vector<std::pair<int, int>> dp_values_to_erase;
std::vector<std::pair<int, int>> dp_values_to_affect;
// Outer loop: Iterate through the characters of `first_string`.
// Keep up-to-date `dp_common_string_` (the latest row of the DP table).
for (int i = first_start; i < first_end; i++) {
dp_values_to_affect.clear();
// Inner loop: Iterate through characters of `second_string`, where those
// characters are equal to first_string_[i], and within range.
for (auto j : char_to_positions_[first_string_[i]]) {
if (j < second_start) {
continue;
}
if (j >= second_end) {
break;
}
// dp_common_string_[j + 1] is the length of the longest common substring
// ending at first_string_[i] and second_string_[j].
const int length = dp_common_string_[j] + 1;
dp_values_to_affect.emplace_back(j + 1, length);
// Store newly-found longer matches.
if (length > match.length) {
match.pos_first_string = i - length + 1;
match.pos_second_string = j - length + 1;
match.length = length;
}
}
// Update `dp_common_string_`.
for (auto const& element : dp_values_to_erase) {
dp_common_string_[element.first] = 0;
}
for (auto const& element : dp_values_to_affect) {
dp_common_string_[element.first] = element.second;
}
std::swap(dp_values_to_erase, dp_values_to_affect);
}
// Erase temporary values in preparation for future calls.
std::fill(dp_common_string_.begin(), dp_common_string_.end(), 0);
return match;
}
Matches SequenceMatcher::GetMatchingBlocks() {
if (!matching_blocks_.empty()) {
return matching_blocks_;
}
// This queue contains a tuple of 4 integers that represent 2 substrings to
// find the longest match in the following order: first_start, first_end,
// second_start, second_end.
std::queue<std::tuple<int, int, int, int>> queue_block;
queue_block.emplace(0, first_string_.size(), 0, second_string_.size());
// Find all matching blocks recursively. Prioritize longer blocks: Find the
// longest matching block first, then recurse to the left and right into the
// remaining as-yet unmatched sections of the two strings.
while (!queue_block.empty()) {
int first_start, first_end, second_start, second_end;
std::tie(first_start, first_end, second_start, second_end) =
queue_block.front();
queue_block.pop();
const Match match =
FindLongestMatch(first_start, first_end, second_start, second_end);
if (match.length > 0) {
// Exclude matching blocks with whitespace only.
const bool whitespace_only =
match.length == 1 && first_string_[match.pos_first_string] == u' ';
if (!whitespace_only) {
matching_blocks_.push_back(match);
}
// Recurse left.
if (first_start < match.pos_first_string &&
second_start < match.pos_second_string) {
queue_block.emplace(first_start, match.pos_first_string, second_start,
match.pos_second_string);
}
// Recurse right.
if (match.pos_first_string + match.length < first_end &&
match.pos_second_string + match.length < second_end) {
queue_block.emplace(match.pos_first_string + match.length, first_end,
match.pos_second_string + match.length, second_end);
}
}
}
// Always store a final matching block. In case no matching blocks
// were discovered above, this final matching block serves
// the purpose of indicating that block matching has taken place.
matching_blocks_.push_back(
Match(first_string_.size(), second_string_.size(), 0));
sort(matching_blocks_.begin(), matching_blocks_.end(), CompareMatches);
return matching_blocks_;
}
double SequenceMatcher::Ratio(bool text_length_agnostic) {
// Uses block matching to calculate ratio.
if (block_matching_ratio_ < 0) {
int sum_match = 0;
const int query_size = first_string_.size();
const int text_size = second_string_.size();
int sum_length = query_size;
// Text-length agnosticism is applied for long texts if
// `text_length_agnostic` is true, but we still keep it not shorter
// than the query length. Text length agnosticism is ignored if we have a
// longer query than the text.
if (text_length_agnostic && query_size < text_size) {
int max_recognized_text_length =
std::max(kTextAgnosticismSize, query_size);
sum_length += std::min(text_size, max_recognized_text_length);
} else {
sum_length += text_size;
}
DCHECK_NE(sum_length, 0);
const int num_blocks = GetMatchingBlocks().size();
for (const auto& match : GetMatchingBlocks()) {
sum_match += match.length;
}
// The last block is always a placeholder "empty" block, so subtract one.
// And, allow for one "penalty-free" block, so subtract one again. Hence,
// apply a penalty by using |num_blocks - 2|. Example:
//
// If num_blocks = 5, the actual number of matching blocks is 4. This
// means there are 3 blocks in excess of 1.
block_matching_ratio_ =
2.0 * sum_match / sum_length *
exp(-(num_blocks - 2) * num_matching_blocks_penalty_);
}
return block_matching_ratio_;
}
} // namespace ash::string_matching
|