/* SPDX-License-Identifier: MIT */
/* Copyright © 2022-present Max Bachmann */
#pragma once
#include <limits>
#include <rapidfuzz/details/Range.hpp>
#include <rapidfuzz/distance/Levenshtein_impl.hpp>
namespace rapidfuzz {
/**
* @brief Calculates the minimum number of insertions, deletions, and substitutions
* required to change one sequence into the other (Levenshtein distance), with custom
* costs for insertion, deletion and substitution
*
* @tparam Sentence1 This is a string that can be converted to
* basic_string_view<char_type>
* @tparam Sentence2 This is a string that can be converted to
* basic_string_view<char_type>
*
* @param s1
* string to compare with s2 (for type info check Template parameters above)
* @param s2
* string to compare with s1 (for type info check Template parameters above)
* @param weights
* The weights for the three operations in the form
* (insertion, deletion, substitution). Default is {1, 1, 1},
* which gives all three operations a weight of 1.
* @param score_cutoff
* Maximum Levenshtein distance between s1 and s2 that is still returned as
* a result. If the distance is bigger than score_cutoff, score_cutoff + 1 is
* returned instead. Default is std::numeric_limits<size_t>::max(),
* which deactivates this behaviour.
* @param score_hint
* Expected Levenshtein distance between s1 and s2, which allows the
* implementation to select a faster algorithm. Default is
* std::numeric_limits<size_t>::max().
*
* @return the Levenshtein distance between s1 and s2
*
* @remarks
* @parblock
* Depending on the input parameters, different optimized implementations are used
* to improve performance. Worst-case performance is ``O(m * n)``.
*
* <b>Insertion = Deletion = Substitution:</b>
*
* This is known as uniform Levenshtein distance and is the distance most commonly
* referred to as Levenshtein distance. The following implementation is used
* with a worst-case performance of ``O([N/64]M)``.
*
* - if score_cutoff is 0 the distance can be calculated using a direct comparison,
* since no difference between the strings is allowed. The time complexity of
* this algorithm is ``O(N)``.
*
* - A common prefix/suffix of the two compared strings does not affect
* the Levenshtein distance, so the affix is removed before calculating the
* distance.
*
* - If score_cutoff is <= 3 the mbleven algorithm is used. This algorithm
* checks all edit operations that are possible within
* the threshold `score_cutoff`. The time complexity of this algorithm is ``O(N)``.
*
* - If the length of the shorter string is <= 64 after removing the common affix
* Hyyrö's algorithm is used, which calculates the Levenshtein distance in
* parallel. The algorithm is described by @cite hyrro_2002. The time complexity of this
* algorithm is ``O(N)``.
*
* - If the length of the shorter string is > 64 after removing the common affix
* a blockwise implementation of Myers' algorithm is used, which calculates
* the Levenshtein distance in parallel (64 characters at a time).
* The algorithm is described by @cite myers_1999. The time complexity of this
* algorithm is ``O([N/64]M)``.
*
*
* <b>Insertion = Deletion, Substitution >= Insertion + Deletion:</b>
*
* Since every substitution can be performed as an insertion + deletion, this variant
* of the Levenshtein distance only uses insertions and deletions. Therefore this
* variant is often referred to as InDel distance. The following implementation
* is used with a worst-case performance of ``O([N/64]M)``.
*
* - if score_cutoff is 0 the distance can be calculated using a direct comparison,
* since no difference between the strings is allowed. The time complexity of
* this algorithm is ``O(N)``.
*
* - if score_cutoff is 1 and the two strings have a similar length, the distance can be
* calculated using a direct comparison as well, since a substitution would cause
* an edit distance higher than the threshold. The time complexity of this algorithm
* is ``O(N)``.
*
* - A common prefix/suffix of the two compared strings does not affect
* the Levenshtein distance, so the affix is removed before calculating the
* distance.
*
* - If score_cutoff is <= 4 the mbleven algorithm is used. This algorithm
* checks all edit operations that are possible within
* the threshold `score_cutoff`. In contrast to the uniform Levenshtein distance this
* algorithm can be used up to a threshold of 4 here, since the higher weight
* of substitutions decreases the number of possible edit operations.
* The time complexity of this algorithm is ``O(N)``.
*
* - If the length of the shorter string is <= 64 after removing the common affix
* Hyyrö's LCS algorithm is used, which calculates the InDel distance in
* parallel. The algorithm is described by @cite hyrro_lcs_2004 and is extended with support
* for UTF32 in this implementation. The time complexity of this
* algorithm is ``O(N)``.
*
* - If the length of the shorter string is > 64 after removing the common affix
* a blockwise implementation of Hyyrö's LCS algorithm is used, which calculates
* the InDel distance in parallel (64 characters at a time).
* The algorithm is described by @cite hyrro_lcs_2004. The time complexity of this
* algorithm is ``O([N/64]M)``.
*
* <b>Other weights:</b>
*
* The implementation for other weights is based on Wagner-Fischer.
* It has a runtime of ``O(N * M)`` and a memory usage of ``O(N)``.
* Further details can be found in @cite wagner_fischer_1974.
* @endparblock
*
* @par Examples
* @parblock
* Find the Levenshtein distance between two strings:
* @code{.cpp}
* // dist is 2
* size_t dist = levenshtein_distance("lewenstein", "levenshtein");
* @endcode
*
* Setting a maximum distance allows the library to select
* a more efficient implementation:
* @code{.cpp}
* // dist is 2
* size_t dist = levenshtein_distance("lewenstein", "levenshtein", {1, 1, 1}, 1);
* @endcode
*
* It is possible to select different weights by passing a `LevenshteinWeightTable`:
* @code{.cpp}
* // dist is 3
* size_t dist = levenshtein_distance("lewenstein", "levenshtein", {1, 1, 2});
* @endcode
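*
* The iterator based overload can be used in the same way (a minimal sketch):
* @code{.cpp}
* std::string a = "lewenstein";
* std::string b = "levenshtein";
* // dist is 2
* size_t dist = levenshtein_distance(a.begin(), a.end(), b.begin(), b.end());
* @endcode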
* @endparblock
*/
template <typename InputIt1, typename InputIt2>
size_t levenshtein_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
LevenshteinWeightTable weights = {1, 1, 1},
size_t score_cutoff = std::numeric_limits<size_t>::max(),
size_t score_hint = std::numeric_limits<size_t>::max())
{
return detail::Levenshtein::distance(first1, last1, first2, last2, weights, score_cutoff, score_hint);
}
template <typename Sentence1, typename Sentence2>
size_t levenshtein_distance(const Sentence1& s1, const Sentence2& s2,
LevenshteinWeightTable weights = {1, 1, 1},
size_t score_cutoff = std::numeric_limits<size_t>::max(),
size_t score_hint = std::numeric_limits<size_t>::max())
{
return detail::Levenshtein::distance(s1, s2, weights, score_cutoff, score_hint);
}
template <typename InputIt1, typename InputIt2>
size_t levenshtein_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
LevenshteinWeightTable weights = {1, 1, 1}, size_t score_cutoff = 0,
size_t score_hint = 0)
{
return detail::Levenshtein::similarity(first1, last1, first2, last2, weights, score_cutoff, score_hint);
}
template <typename Sentence1, typename Sentence2>
size_t levenshtein_similarity(const Sentence1& s1, const Sentence2& s2,
LevenshteinWeightTable weights = {1, 1, 1}, size_t score_cutoff = 0,
size_t score_hint = 0)
{
return detail::Levenshtein::similarity(s1, s2, weights, score_cutoff, score_hint);
}
template <typename InputIt1, typename InputIt2>
double levenshtein_normalized_distance(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
LevenshteinWeightTable weights = {1, 1, 1}, double score_cutoff = 1.0,
double score_hint = 1.0)
{
return detail::Levenshtein::normalized_distance(first1, last1, first2, last2, weights, score_cutoff,
score_hint);
}
template <typename Sentence1, typename Sentence2>
double levenshtein_normalized_distance(const Sentence1& s1, const Sentence2& s2,
LevenshteinWeightTable weights = {1, 1, 1}, double score_cutoff = 1.0,
double score_hint = 1.0)
{
return detail::Levenshtein::normalized_distance(s1, s2, weights, score_cutoff, score_hint);
}
/**
* @brief Calculates a normalized Levenshtein similarity using custom
* costs for insertion, deletion and substitution.
*
* @tparam Sentence1 This is a string that can be converted to
* basic_string_view<char_type>
* @tparam Sentence2 This is a string that can be converted to
* basic_string_view<char_type>
*
* @param s1
* string to compare with s2 (for type info check Template parameters above)
* @param s2
* string to compare with s1 (for type info check Template parameters above)
* @param weights
* The weights for the three operations in the form
* (insertion, deletion, substitution). Default is {1, 1, 1},
* which gives all three operations a weight of 1.
* @param score_cutoff
* Optional argument for a score threshold as a double between 0 and 1.0.
* For norm_sim < score_cutoff 0 is returned instead. Default is 0,
* which deactivates this behaviour.
*
* @return Normalized weighted Levenshtein similarity between s1 and s2
* as a double between 0 and 1.0
*
* @see levenshtein_distance()
*
* @remarks
* @parblock
* The normalization of the Levenshtein distance is performed in the following way,
* where max_dist is the maximal possible distance between two sequences of
* these lengths:
*
* \f{align*}{
* norm\_sim &= 1 - \frac{dist(s1, s2)}{max\_dist}
* \f}
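*
* For example, for "lewenstein" and "levenshtein" with uniform weights the
* distance is 2 and max_dist is 11, so the normalized similarity is
* 1 - 2/11 = 0.8181...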
* @endparblock
*
*
* @par Examples
* @parblock
* Find the normalized Levenshtein similarity between two strings:
* @code{.cpp}
* // norm_sim is 0.8181818181818181
* double norm_sim = levenshtein_normalized_similarity("lewenstein", "levenshtein");
* @endcode
*
* Setting a score_cutoff allows the library to select
* a more efficient implementation:
* @code{.cpp}
* // norm_sim is 0.0
* double norm_sim = levenshtein_normalized_similarity("lewenstein", "levenshtein", {1, 1, 1}, 0.85);
* @endcode
*
* It is possible to select different weights by passing a `LevenshteinWeightTable`:
* @code{.cpp}
* // norm_sim is 0.8571428571428571
* double norm_sim = levenshtein_normalized_similarity("lewenstein", "levenshtein", {1, 1, 2});
* @endcode
* @endparblock
*/
template <typename InputIt1, typename InputIt2>
double levenshtein_normalized_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
LevenshteinWeightTable weights = {1, 1, 1},
double score_cutoff = 0.0, double score_hint = 0.0)
{
return detail::Levenshtein::normalized_similarity(first1, last1, first2, last2, weights, score_cutoff,
score_hint);
}
template <typename Sentence1, typename Sentence2>
double levenshtein_normalized_similarity(const Sentence1& s1, const Sentence2& s2,
LevenshteinWeightTable weights = {1, 1, 1},
double score_cutoff = 0.0, double score_hint = 0.0)
{
return detail::Levenshtein::normalized_similarity(s1, s2, weights, score_cutoff, score_hint);
}
/**
* @brief Return list of EditOp describing how to turn s1 into s2.
*
* @tparam Sentence1 This is a string that can be converted to
* basic_string_view<char_type>
* @tparam Sentence2 This is a string that can be converted to
* basic_string_view<char_type>
*
* @param s1
* string to compare with s2 (for type info check Template parameters above)
* @param s2
* string to compare with s1 (for type info check Template parameters above)
*
* @return Edit operations required to turn s1 into s2
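*
* @par Example
* @parblock
* A minimal sketch (the EditOp members referenced in the loop follow the
* definition in the library's Editops header and are shown for illustration only):
* @code{.cpp}
* // ops.size() is 2: one replacement and one insertion
* Editops ops = levenshtein_editops("lewenstein", "levenshtein");
* for (const EditOp& op : ops) {
* // op.type, op.src_pos and op.dest_pos describe a single edit step
* }
* @endcode
* @endparblock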
*/
template <typename InputIt1, typename InputIt2>
Editops levenshtein_editops(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
size_t score_hint = std::numeric_limits<size_t>::max())
{
return detail::levenshtein_editops(detail::make_range(first1, last1), detail::make_range(first2, last2),
score_hint);
}
template <typename Sentence1, typename Sentence2>
Editops levenshtein_editops(const Sentence1& s1, const Sentence2& s2,
size_t score_hint = std::numeric_limits<size_t>::max())
{
return detail::levenshtein_editops(detail::make_range(s1), detail::make_range(s2), score_hint);
}
#ifdef RAPIDFUZZ_SIMD
namespace experimental {
template <int MaxLen>
struct MultiLevenshtein : public detail::MultiDistanceBase<MultiLevenshtein<MaxLen>, size_t, 0,
std::numeric_limits<int64_t>::max()> {
private:
friend detail::MultiDistanceBase<MultiLevenshtein<MaxLen>, size_t, 0,
std::numeric_limits<int64_t>::max()>;
friend detail::MultiNormalizedMetricBase<MultiLevenshtein<MaxLen>, size_t>;
RAPIDFUZZ_CONSTEXPR_CXX14 static size_t get_vec_size()
{
# ifdef RAPIDFUZZ_AVX2
using namespace detail::simd_avx2;
# else
using namespace detail::simd_sse2;
# endif
RAPIDFUZZ_IF_CONSTEXPR (MaxLen <= 8)
return native_simd<uint8_t>::size;
else RAPIDFUZZ_IF_CONSTEXPR (MaxLen <= 16)
return native_simd<uint16_t>::size;
else RAPIDFUZZ_IF_CONSTEXPR (MaxLen <= 32)
return native_simd<uint32_t>::size;
else RAPIDFUZZ_IF_CONSTEXPR (MaxLen <= 64)
return native_simd<uint64_t>::size;
static_assert(MaxLen <= 64, "expected MaxLen <= 64");
}
static size_t find_block_count(size_t count)
{
size_t vec_size = get_vec_size();
size_t simd_vec_count = detail::ceil_div(count, vec_size);
return detail::ceil_div(simd_vec_count * vec_size * MaxLen, 64);
}
public:
MultiLevenshtein(size_t count, LevenshteinWeightTable aWeights = {1, 1, 1})
: input_count(count), PM(find_block_count(count) * 64), weights(aWeights)
{
str_lens.resize(result_count());
if (weights.delete_cost != 1 || weights.insert_cost != 1 || weights.replace_cost > 2)
throw std::invalid_argument("unsupported weights");
}
/**
* @brief get minimum size required for result vectors passed into
* - distance
* - similarity
* - normalized_distance
* - normalized_similarity
*
* @return minimum vector size
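*
* @par Example
* @parblock
* A minimal sketch, assuming the distance(scores, score_count, s2) entry point
* forwarded by the base class (see _distance below):
* @code{.cpp}
* experimental::MultiLevenshtein<16> scorer(2);
* scorer.insert(std::string("lewenstein"));
* scorer.insert(std::string("levenstin"));
* std::vector<size_t> scores(scorer.result_count());
* scorer.distance(scores.data(), scores.size(), std::string("levenshtein"));
* @endcode
* @endparblock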
*/
size_t result_count() const
{
size_t vec_size = get_vec_size();
size_t simd_vec_count = detail::ceil_div(input_count, vec_size);
return simd_vec_count * vec_size;
}
template <typename Sentence1>
void insert(const Sentence1& s1_)
{
insert(detail::to_begin(s1_), detail::to_end(s1_));
}
template <typename InputIt1>
void insert(InputIt1 first1, InputIt1 last1)
{
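/* each inserted string occupies a MaxLen bit wide slice of the 64-bit pattern match blocks */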
auto len = std::distance(first1, last1);
int block_pos = static_cast<int>((pos * MaxLen) % 64);
auto block = (pos * MaxLen) / 64;
assert(len <= MaxLen);
if (pos >= input_count) throw std::invalid_argument("out of bounds insert");
str_lens[pos] = static_cast<size_t>(len);
for (; first1 != last1; ++first1) {
PM.insert(block, *first1, block_pos);
block_pos++;
}
pos++;
}
private:
template <typename InputIt2>
void _distance(size_t* scores, size_t score_count, const detail::Range<InputIt2>& s2,
size_t score_cutoff = std::numeric_limits<size_t>::max()) const
{
if (score_count < result_count())
throw std::invalid_argument("scores has to have >= result_count() elements");
auto scores_ = detail::make_range(scores, scores + score_count);
RAPIDFUZZ_IF_CONSTEXPR (MaxLen == 8)
detail::levenshtein_hyrroe2003_simd<uint8_t>(scores_, PM, str_lens, s2, score_cutoff);
else RAPIDFUZZ_IF_CONSTEXPR (MaxLen == 16)
detail::levenshtein_hyrroe2003_simd<uint16_t>(scores_, PM, str_lens, s2, score_cutoff);
else RAPIDFUZZ_IF_CONSTEXPR (MaxLen == 32)
detail::levenshtein_hyrroe2003_simd<uint32_t>(scores_, PM, str_lens, s2, score_cutoff);
else RAPIDFUZZ_IF_CONSTEXPR (MaxLen == 64)
detail::levenshtein_hyrroe2003_simd<uint64_t>(scores_, PM, str_lens, s2, score_cutoff);
}
template <typename InputIt2>
size_t maximum(size_t s1_idx, const detail::Range<InputIt2>& s2) const
{
return detail::levenshtein_maximum(str_lens[s1_idx], s2.size(), weights);
}
size_t get_input_count() const noexcept
{
return input_count;
}
size_t input_count;
size_t pos = 0;
detail::BlockPatternMatchVector PM;
std::vector<size_t> str_lens;
LevenshteinWeightTable weights;
};
} /* namespace experimental */
#endif /* RAPIDFUZZ_SIMD */
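/**
* @brief Caches one string (s1) so it can be compared against many other strings
* without recomputing the pattern match vector of s1 for every call.
*
* @par Example
* @parblock
* A minimal sketch, assuming the distance() member provided by
* detail::CachedDistanceBase:
* @code{.cpp}
* CachedLevenshtein<char> scorer(std::string("levenshtein"));
* // dist is 2
* size_t dist = scorer.distance(std::string("lewenstein"));
* @endcode
* @endparblock
*/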
template <typename CharT1>
struct CachedLevenshtein : public detail::CachedDistanceBase<CachedLevenshtein<CharT1>, size_t, 0,
std::numeric_limits<int64_t>::max()> {
template <typename Sentence1>
explicit CachedLevenshtein(const Sentence1& s1_, LevenshteinWeightTable aWeights = {1, 1, 1})
: CachedLevenshtein(detail::to_begin(s1_), detail::to_end(s1_), aWeights)
{}
template <typename InputIt1>
CachedLevenshtein(InputIt1 first1, InputIt1 last1, LevenshteinWeightTable aWeights = {1, 1, 1})
: s1(first1, last1), PM(detail::make_range(first1, last1)), weights(aWeights)
{}
private:
friend detail::CachedDistanceBase<CachedLevenshtein<CharT1>, size_t, 0,
std::numeric_limits<int64_t>::max()>;
friend detail::CachedNormalizedMetricBase<CachedLevenshtein<CharT1>>;
template <typename InputIt2>
size_t maximum(const detail::Range<InputIt2>& s2) const
{
return detail::levenshtein_maximum(s1.size(), s2.size(), weights);
}
template <typename InputIt2>
size_t _distance(const detail::Range<InputIt2>& s2, size_t score_cutoff, size_t score_hint) const
{
if (weights.insert_cost == weights.delete_cost) {
/* when insertion/deletion operations are free the edit distance is always 0 */
if (weights.insert_cost == 0) return 0;
/* uniform Levenshtein multiplied with the common factor */
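/* e.g. with weights {2, 2, 2} the result is the uniform Levenshtein distance scaled by 2 */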
if (weights.insert_cost == weights.replace_cost) {
// the cutoff can be scaled down by the common factor of the three weights
size_t new_score_cutoff = detail::ceil_div(score_cutoff, weights.insert_cost);
size_t new_score_hint = detail::ceil_div(score_hint, weights.insert_cost);
size_t dist = detail::uniform_levenshtein_distance(PM, detail::make_range(s1), s2,
new_score_cutoff, new_score_hint);
dist *= weights.insert_cost;
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
}
/*
* when replace_cost >= insert_cost + delete_cost no substitutions are performed,
* so this can be implemented as the InDel distance multiplied with the common factor
*/
else if (weights.replace_cost >= weights.insert_cost + weights.delete_cost) {
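// e.g. with weights {1, 1, 2} this is simply the InDel distance (common factor 1)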
// the cutoff can be scaled down by the common factor of the three weights
size_t new_max = detail::ceil_div(score_cutoff, weights.insert_cost);
size_t dist = detail::indel_distance(PM, detail::make_range(s1), s2, new_max);
dist *= weights.insert_cost;
return (dist <= score_cutoff) ? dist : score_cutoff + 1;
}
}
return detail::generalized_levenshtein_distance(detail::make_range(s1), s2, weights, score_cutoff);
}
std::vector<CharT1> s1;
detail::BlockPatternMatchVector PM;
LevenshteinWeightTable weights;
};
#ifdef RAPIDFUZZ_DEDUCTION_GUIDES
template <typename Sentence1>
explicit CachedLevenshtein(const Sentence1& s1_, LevenshteinWeightTable aWeights = {
1, 1, 1}) -> CachedLevenshtein<char_type<Sentence1>>;
template <typename InputIt1>
CachedLevenshtein(InputIt1 first1, InputIt1 last1,
LevenshteinWeightTable aWeights = {1, 1, 1}) -> CachedLevenshtein<iter_value_t<InputIt1>>;
#endif
} // namespace rapidfuzz