File: Indel_impl.hpp

package info (click to toggle)
rapidfuzz-cpp 3.3.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,480 kB
  • sloc: cpp: 30,893; python: 63; makefile: 26; sh: 8
file content (70 lines) | stat: -rw-r--r-- 3,042 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/* SPDX-License-Identifier: MIT */
/* Copyright © 2022-present Max Bachmann */

#include <rapidfuzz/details/PatternMatchVector.hpp>
#include <rapidfuzz/details/Range.hpp>
#include <rapidfuzz/details/common.hpp>
#include <rapidfuzz/details/distance.hpp>
#include <rapidfuzz/details/intrinsics.hpp>
#include <rapidfuzz/distance/LCSseq.hpp>

namespace rapidfuzz {
namespace detail {

template <typename InputIt1, typename InputIt2>
size_t indel_distance(const BlockPatternMatchVector& block, const Range<InputIt1>& s1,
                      const Range<InputIt2>& s2, size_t score_cutoff)
{
    size_t maximum = s1.size() + s2.size();
    size_t lcs_cutoff = (maximum / 2 >= score_cutoff) ? maximum / 2 - score_cutoff : 0;
    size_t lcs_sim = lcs_seq_similarity(block, s1, s2, lcs_cutoff);
    size_t dist = maximum - 2 * lcs_sim;
    return (dist <= score_cutoff) ? dist : score_cutoff + 1;
}

template <typename InputIt1, typename InputIt2>
double indel_normalized_distance(const BlockPatternMatchVector& block, const Range<InputIt1>& s1,
                                 const Range<InputIt2>& s2, double score_cutoff)
{
    size_t maximum = s1.size() + s2.size();
    size_t cutoff_distance = static_cast<size_t>(std::ceil(static_cast<double>(maximum) * score_cutoff));
    size_t dist = indel_distance(block, s1, s2, cutoff_distance);
    double norm_dist = (maximum) ? static_cast<double>(dist) / static_cast<double>(maximum) : 0.0;
    return (norm_dist <= score_cutoff) ? norm_dist : 1.0;
}

template <typename InputIt1, typename InputIt2>
double indel_normalized_similarity(const BlockPatternMatchVector& block, const Range<InputIt1>& s1,
                                   const Range<InputIt2>& s2, double score_cutoff)
{
    double cutoff_score = NormSim_to_NormDist(score_cutoff);
    double norm_dist = indel_normalized_distance(block, s1, s2, cutoff_score);
    double norm_sim = 1.0 - norm_dist;
    return (norm_sim >= score_cutoff) ? norm_sim : 0.0;
}

class Indel : public DistanceBase<Indel, size_t, 0, std::numeric_limits<int64_t>::max()> {
    friend DistanceBase<Indel, size_t, 0, std::numeric_limits<int64_t>::max()>;
    friend NormalizedMetricBase<Indel>;

    template <typename InputIt1, typename InputIt2>
    static size_t maximum(const Range<InputIt1>& s1, const Range<InputIt2>& s2)
    {
        return s1.size() + s2.size();
    }

    template <typename InputIt1, typename InputIt2>
    static size_t _distance(const Range<InputIt1>& s1, const Range<InputIt2>& s2, size_t score_cutoff,
                            size_t score_hint)
    {
        size_t maximum = Indel::maximum(s1, s2);
        size_t lcs_cutoff = (maximum / 2 >= score_cutoff) ? maximum / 2 - score_cutoff : 0;
        size_t lcs_hint = (maximum / 2 >= score_hint) ? maximum / 2 - score_hint : 0;
        size_t lcs_sim = LCSseq::similarity(s1, s2, lcs_cutoff, lcs_hint);
        size_t dist = maximum - 2 * lcs_sim;
        return (dist <= score_cutoff) ? dist : score_cutoff + 1;
    }
};

} // namespace detail
} // namespace rapidfuzz