File: Jaro.hpp

package info (click to toggle)
rapidfuzz-cpp 3.1.1-1~bpo12%2B1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm-backports
  • size: 2,444 kB
  • sloc: cpp: 30,295; python: 63; makefile: 26; sh: 8
file content (74 lines) | stat: -rw-r--r-- 2,428 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/* SPDX-License-Identifier: MIT */
/* Copyright © 2022-present Max Bachmann */

#pragma once
#include <algorithm>
#include <cstddef>
#include <vector>

namespace rapidfuzz_reference {

template <typename InputIt1, typename InputIt2>
double jaro_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, InputIt2 T_last,
                       double score_cutoff = 0.0)
{
    size_t P_len = static_cast<size_t>(std::distance(P_first, P_last));
    size_t T_len = static_cast<size_t>(std::distance(T_first, T_last));

    if (score_cutoff > 1.0) return 0.0;

    if (!P_len || !T_len) return double(!P_len && !T_len);

    std::vector<int> P_flag(P_len + 1);
    std::vector<int> T_flag(T_len + 1);

    size_t Bound = std::max(P_len, T_len) / 2;
    if (Bound > 0) Bound--;

    size_t CommonChars = 0;
    for (size_t i = 0; i < T_len; i++) {
        size_t lowlim = (i >= Bound) ? i - Bound : 0;
        size_t hilim = (i + Bound <= P_len - 1) ? (i + Bound) : P_len - 1;
        for (size_t j = lowlim; j <= hilim; j++) {
            if (!P_flag[j] && (P_first[static_cast<ptrdiff_t>(j)] == T_first[static_cast<ptrdiff_t>(i)])) {
                T_flag[i] = 1;
                P_flag[j] = 1;
                CommonChars++;
                break;
            }
        }
    }

    // Count the number of transpositions
    size_t Transpositions = 0;
    size_t k = 0;
    for (size_t i = 0; i < T_len; i++) {
        if (T_flag[i]) {
            size_t j = k;
            for (; j < P_len; j++) {
                if (P_flag[j]) {
                    k = j + 1;
                    break;
                }
            }
            if (T_first[static_cast<ptrdiff_t>(i)] != P_first[static_cast<ptrdiff_t>(j)]) Transpositions++;
        }
    }

    Transpositions /= 2;
    double Sim = 0;
    Sim += static_cast<double>(CommonChars) / static_cast<double>(P_len);
    Sim += static_cast<double>(CommonChars) / static_cast<double>(T_len);
    Sim += (static_cast<double>(CommonChars) - static_cast<double>(Transpositions)) /
           static_cast<double>(CommonChars);
    Sim /= 3.0;
    return (Sim >= score_cutoff) ? Sim : 0;
}

template <typename Sentence1, typename Sentence2>
double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
{
    return jaro_similarity(std::begin(s1), std::end(s1), std::begin(s2), std::end(s2), score_cutoff);
}

} /* namespace rapidfuzz_reference */