File: fuzz_jaro_similarity.cpp

package info (click to toggle)
rapidfuzz-cpp 3.3.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,480 kB
  • sloc: cpp: 30,893; python: 63; makefile: 26; sh: 8
file content (99 lines) | stat: -rw-r--r-- 3,151 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/* SPDX-License-Identifier: MIT */
/* Copyright © 2021 Max Bachmann */

#include "../rapidfuzz_reference/Jaro.hpp"
#include "fuzzing.hpp"
#include <rapidfuzz/details/Range.hpp>
#include <rapidfuzz/distance/Jaro.hpp>
#include <stdexcept>
#include <string>

/**
 * Relative floating point comparison.
 *
 * Two values are considered close when their absolute difference does not exceed
 * `epsilon` times the larger of their magnitudes.
 *
 * @param a        first value
 * @param b        second value
 * @param epsilon  relative tolerance
 * @return true if a and b are within the relative tolerance of each other
 */
bool is_close(double a, double b, double epsilon)
{
    const double mag_a = fabs(a);
    const double mag_b = fabs(b);

    /* scale the tolerance by whichever operand has the larger magnitude */
    double scale = mag_a;
    if (mag_a < mag_b) {
        scale = mag_b;
    }

    const double diff = fabs(a - b);
    return diff <= scale * epsilon;
}

/**
 * Cross-checks the SIMD multi-string Jaro scorer against the reference implementation.
 *
 * s1 is cut into consecutive pieces of at most MaxLen elements (the final piece holds
 * whatever remains). All pieces are inserted into a single MultiJaro<MaxLen> scorer,
 * scored against s2 in one call, and each score is then compared with the score the
 * reference implementation yields for the same piece.
 *
 * When RAPIDFUZZ_SIMD is not defined the function does nothing.
 *
 * @tparam MaxLen  maximum piece length, also used as the MultiJaro template argument
 * @param s1  sequence that gets split into pieces
 * @param s2  sequence every piece is compared against
 * @throws std::logic_error if a SIMD score and the reference score are not within a
 *         relative tolerance of 0.0001
 */
template <size_t MaxLen>
void validate_simd(const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2)
{
#ifdef RAPIDFUZZ_SIMD
    const size_t total = s1.size();
    /* number of pieces, rounding up for a trailing partial piece */
    const size_t chunk_count = total / MaxLen + ((total % MaxLen) != 0);
    if (chunk_count == 0) return;

    rapidfuzz::experimental::MultiJaro<MaxLen> scorer(chunk_count);

    std::vector<std::vector<uint8_t>> chunks;
    for (size_t offset = 0; offset < total; offset += MaxLen) {
        const size_t remaining = total - offset;
        const size_t len = (remaining < MaxLen) ? remaining : MaxLen;
        const auto first = s1.begin() + static_cast<ptrdiff_t>(offset);
        chunks.emplace_back(first, first + static_cast<ptrdiff_t>(len));
    }

    for (size_t idx = 0; idx < chunks.size(); ++idx) {
        scorer.insert(chunks[idx]);
    }

    /* the scorer reports how many result slots it needs */
    std::vector<double> simd_scores(scorer.result_count());
    scorer.similarity(simd_scores.data(), simd_scores.size(), s2);

    for (size_t idx = 0; idx < chunks.size(); ++idx) {
        const std::vector<uint8_t>& chunk = chunks[idx];
        const double expected = rapidfuzz_reference::jaro_similarity(chunk, s2);
        const double actual = simd_scores[idx];

        if (is_close(actual, expected, 0.0001)) continue;

        print_seq("s1", chunk);
        print_seq("s2", s2);

        std::string message = "jaro similarity using simd failed (reference_score = ";
        message += std::to_string(expected);
        message += ", score = ";
        message += std::to_string(actual);
        message += ", i = ";
        message += std::to_string(idx);
        message += ")";
        throw std::logic_error(message);
    }

#else
    /* no SIMD support compiled in: nothing to validate */
    (void)s1;
    (void)s2;
#endif
}

/**
 * Validates rapidfuzz::jaro_similarity against the reference implementation and then
 * runs the SIMD validation for the piece lengths 8, 16, 32 and 64.
 *
 * @param s1  first sequence
 * @param s2  second sequence
 * @throws std::logic_error if the library score and the reference score are not within
 *         a relative tolerance of 0.0001 (validate_simd may throw the same type)
 */
void validate_distance(const std::vector<uint8_t>& s1, const std::vector<uint8_t>& s2)
{
    const double expected = rapidfuzz_reference::jaro_similarity(s1, s2);
    const double actual = rapidfuzz::jaro_similarity(s1, s2);

    const bool scores_agree = is_close(actual, expected, 0.0001);
    if (!scores_agree) {
        /* dump both inputs so the failing case can be reproduced */
        print_seq("s1", s1);
        print_seq("s2", s2);

        std::string message = "jaro similarity failed (reference_score = ";
        message += std::to_string(expected);
        message += ", score = ";
        message += std::to_string(actual);
        message += ")";
        throw std::logic_error(message);
    }

    /* same inputs, checked through the SIMD scorer at each supported piece length */
    validate_simd<8>(s1, s2);
    validate_simd<16>(s1, s2);
    validate_simd<32>(s1, s2);
    validate_simd<64>(s1, s2);
}

/**
 * libFuzzer entry point.
 *
 * Extracts two sequences from the raw input via extract_strings and validates the Jaro
 * similarity for them. Afterwards both sequences are passed through vec_multiply with
 * pow<size_t>(2, i) for i = 2 .. 8 (presumably repeating each sequence 2^i times) and
 * validated again; the loop stops as soon as either result exceeds 10000 elements.
 *
 * @param data  raw fuzzer input
 * @param size  number of bytes in data
 * @return always 0
 */
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
{
    std::vector<uint8_t> s1;
    std::vector<uint8_t> s2;
    if (!extract_strings(data, size, s1, s2)) {
        return 0;
    }

    validate_distance(s1, s2);

    /* test long sequences */
    const size_t max_len = 10000;
    unsigned int exponent = 2;
    while (exponent < 9) {
        const std::vector<uint8_t> long_s1 = vec_multiply(s1, pow<size_t>(2, exponent));
        const std::vector<uint8_t> long_s2 = vec_multiply(s2, pow<size_t>(2, exponent));

        /* larger exponents only grow the inputs further, so stop here */
        const bool within_limit = long_s1.size() <= max_len && long_s2.size() <= max_len;
        if (!within_limit) break;

        validate_distance(long_s1, long_s2);
        ++exponent;
    }

    return 0;
}