1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
|
/* SPDX-License-Identifier: MIT */
/* Copyright © 2022-present Max Bachmann */
#pragma once
#include <algorithm>
#include <cstddef>
#include <vector>
namespace rapidfuzz_reference {
template <typename InputIt1, typename InputIt2>
double jaro_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, InputIt2 T_last,
double score_cutoff = 0.0)
{
size_t P_len = static_cast<size_t>(std::distance(P_first, P_last));
size_t T_len = static_cast<size_t>(std::distance(T_first, T_last));
if (score_cutoff > 1.0) return 0.0;
if (!P_len || !T_len) return double(!P_len && !T_len);
std::vector<int> P_flag(P_len + 1);
std::vector<int> T_flag(T_len + 1);
size_t Bound = std::max(P_len, T_len) / 2;
if (Bound > 0) Bound--;
size_t CommonChars = 0;
for (size_t i = 0; i < T_len; i++) {
size_t lowlim = (i >= Bound) ? i - Bound : 0;
size_t hilim = (i + Bound <= P_len - 1) ? (i + Bound) : P_len - 1;
for (size_t j = lowlim; j <= hilim; j++) {
if (!P_flag[j] && (P_first[static_cast<ptrdiff_t>(j)] == T_first[static_cast<ptrdiff_t>(i)])) {
T_flag[i] = 1;
P_flag[j] = 1;
CommonChars++;
break;
}
}
}
// Count the number of transpositions
size_t Transpositions = 0;
size_t k = 0;
for (size_t i = 0; i < T_len; i++) {
if (T_flag[i]) {
size_t j = k;
for (; j < P_len; j++) {
if (P_flag[j]) {
k = j + 1;
break;
}
}
if (T_first[static_cast<ptrdiff_t>(i)] != P_first[static_cast<ptrdiff_t>(j)]) Transpositions++;
}
}
Transpositions /= 2;
double Sim = 0;
Sim += static_cast<double>(CommonChars) / static_cast<double>(P_len);
Sim += static_cast<double>(CommonChars) / static_cast<double>(T_len);
Sim += (static_cast<double>(CommonChars) - static_cast<double>(Transpositions)) /
static_cast<double>(CommonChars);
Sim /= 3.0;
return (Sim >= score_cutoff) ? Sim : 0;
}
template <typename Sentence1, typename Sentence2>
double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
{
return jaro_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff);
}
} /* namespace rapidfuzz_reference */
|