File: common.py

package info (click to toggle)
rapidfuzz 3.12.2%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,436 kB
  • sloc: python: 7,571; cpp: 7,481; sh: 30; makefile: 23
file content (397 lines) | stat: -rw-r--r-- 15,407 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
"""
common parts of the test suite for rapidfuzz
"""

from __future__ import annotations

from dataclasses import dataclass
from math import isnan
from typing import Any

import pytest

from rapidfuzz import process_cpp, process_py

try:
    from pandas import NA as pandas_NA
except BaseException:
    pandas_NA = None


def _get_scorer_flags_py(scorer: Any, scorer_kwargs: dict[str, Any]) -> tuple[int, int]:
    params = getattr(scorer, "_RF_ScorerPy", None)
    if params is not None:
        flags = params["get_scorer_flags"](**scorer_kwargs)
        return (flags["worst_score"], flags["optimal_score"])
    return (0, 100)


def is_none(s):
    """
    Tell whether ``s`` is one of the "missing value" markers.

    True for ``None``, for the object bound to ``pandas_NA`` and for a float
    NaN; False for everything else.
    """
    is_missing_marker = s is None or s is pandas_NA
    return is_missing_marker or (isinstance(s, float) and isnan(s))


def call_and_maybe_catch(call, *args, catch_exceptions=False, **kwargs):
    """
    Invoke ``call(*args, **kwargs)`` and optionally turn a raised exception into a return value.

    ``catch_exceptions`` is consumed here and is not passed on to ``call``.
    With ``catch_exceptions=True`` any ``Exception`` raised by the call is
    returned instead of raised. ``AssertionError`` is the exception to that
    rule and always propagates, so failed checks inside ``call`` are not hidden.
    """
    if catch_exceptions:
        try:
            return call(*args, **kwargs)
        except AssertionError:
            # failed assertions must still surface to the caller
            raise
        except Exception as error:
            return error

    return call(*args, **kwargs)


def compare_exceptions(e1, e2):
    """
    Compare two objects by their ``str()`` representation.

    Only the string forms are compared, the types are not. If converting
    either object to a string raises an ``Exception`` the objects count as
    different and False is returned.
    """
    try:
        text1 = str(e1)
        text2 = str(e2)
    except Exception:
        return False

    return text1 == text2


def scorer_tester(scorer, s1, s2, catch_exceptions=False, **kwargs):
    """
    Score ``s1`` against ``s2`` and check that the ``process`` helpers agree with the result.

    The scorer is called directly first. The same comparison is then run
    through ``extractOne``, ``extract`` and ``extract_iter`` of both
    ``process_cpp`` and ``process_py`` and, when numpy can be imported, through
    ``cdist`` of both modules. ``processor`` and ``score_cutoff`` are handed to
    the process functions as is, every other keyword argument is passed along
    as ``scorer_kwargs``.

    Returns the score of the direct scorer call.
    """
    # NOTE(review): catch_exceptions is not passed to this first call, so an
    # exception raised by the scorer propagates to the caller right here.
    # ``exception`` can only become true if the scorer *returns* an Exception.
    score1 = call_and_maybe_catch(scorer, s1, s2, **kwargs)
    exception = isinstance(score1, Exception)

    # split the keyword arguments into those understood by the process
    # functions themselves and those that belong to the scorer
    process_level = ("processor", "score_cutoff")
    process_kwargs = {key: kwargs[key] for key in process_level if key in kwargs}
    scorer_only_kwargs = {key: value for key, value in kwargs.items() if key not in process_level}
    if scorer_only_kwargs:
        process_kwargs["scorer_kwargs"] = scorer_only_kwargs

    def run(func, queries, choices):
        return call_and_maybe_catch(
            func, queries, choices, catch_exceptions=catch_exceptions, scorer=scorer, **process_kwargs
        )

    backends = (process_cpp, process_py)

    # results holding a single match (or None)
    single_results = [run(backend.extractOne, s1, [s2]) for backend in backends]
    # results holding a list of matches
    list_results = [run(backend.extract, s1, [s2]) for backend in backends]
    list_results += [
        call_and_maybe_catch(
            list, backend.extract_iter(s1, [s2], scorer=scorer, **process_kwargs), catch_exceptions=catch_exceptions
        )
        for backend in backends
    ]

    if exception:
        for outcome in single_results + list_results:
            assert compare_exceptions(outcome, score1)
    else:
        if is_none(s1) or is_none(s2):
            # missing values never produce a match
            expect_no_match = True
        elif kwargs.get("score_cutoff") is not None:
            worst_score, optimal_score = _get_scorer_flags_py(scorer, process_kwargs.get("scorer_kwargs", {}))
            # for similarity like scorers small scores are bad, for distance like scorers large ones
            if optimal_score > worst_score:
                expect_no_match = score1 < kwargs["score_cutoff"]
            else:
                expect_no_match = score1 > kwargs["score_cutoff"]
        else:
            expect_no_match = False

        if expect_no_match:
            for outcome in single_results:
                assert outcome is None
            for outcome in list_results:
                assert outcome == []
        else:
            for outcome in single_results:
                assert pytest.approx(score1) == outcome[1]
            for outcome in list_results:
                assert pytest.approx(score1) == outcome[0][1]

    try:
        import numpy as np
    except Exception:
        np = None

    if np is not None:
        # the 2x4 matrix will probably trigger multi match / simd implementations
        matrices = [
            run(backend.cdist, [s1] * rows, [s2] * cols) for rows, cols in ((1, 1), (2, 4)) for backend in backends
        ]

        for matrix in matrices:
            if exception:
                assert compare_exceptions(matrix, score1)
            else:
                assert np.all(np.isclose(matrix, score1))

    if exception:
        raise score1

    return score1


def symmetric_scorer_tester(scorer, s1, s2, catch_exceptions=False, **kwargs):
    """
    Run ``scorer_tester`` for ``(s1, s2)`` and for ``(s2, s1)`` and require the same outcome.

    Returns the score of the ``(s1, s2)`` run. If that run produced an
    exception, the swapped run has to produce one with the same message and the
    exception is raised again.
    """
    forward, backward = [
        call_and_maybe_catch(scorer_tester, scorer, first, second, catch_exceptions=catch_exceptions, **kwargs)
        for first, second in ((s1, s2), (s2, s1))
    ]

    if not isinstance(forward, Exception):
        assert pytest.approx(forward) == backward
        return forward

    assert compare_exceptions(forward, backward)
    raise forward


@dataclass
class Scorer:
    """
    Bundle of the callables that make up one implementation of a metric.

    The field names match the attributes ``GenericScorer`` reads from the
    entries of its scorer lists, so this is presumably the element type of
    those lists (the instances are created outside of this module).
    """

    # score functions, GenericScorer calls them through the tester helpers as f(s1, s2, **kwargs)
    distance: Any
    similarity: Any
    normalized_distance: Any
    normalized_similarity: Any
    # alignment functions, GenericScorer calls them as f(s1, s2, **kwargs)
    editops: Any
    opcodes: Any


class GenericScorer:
    """
    Runs several implementations of one metric side by side and checks that they agree.

    ``py_scorers`` and ``cpp_scorers`` are lists of objects exposing the
    callables ``distance``, ``similarity``, ``normalized_distance``,
    ``normalized_similarity``, ``editops`` and ``opcodes``.
    ``get_scorer_flags(s1, s2, **kwargs)`` has to return a mapping with the
    keys ``"symmetric"`` and ``"maximum"``.

    About ``catch_exceptions``: the helpers below hand it to
    ``call_and_maybe_catch``, which consumes it and does not forward it to the
    wrapped callable. The wrapped callable therefore runs with its own default.
    """

    # the four score functions every wrapped scorer object has to provide
    _SCORE_FUNCTIONS = ("distance", "similarity", "normalized_distance", "normalized_similarity")

    def __init__(self, py_scorers, cpp_scorers, get_scorer_flags):
        self.py_scorers = py_scorers
        self.cpp_scorers = cpp_scorers
        self.scorers = self.py_scorers + self.cpp_scorers

        def validate_attrs(func1, func2):
            assert hasattr(func1, "_RF_ScorerPy")
            assert hasattr(func2, "_RF_ScorerPy")
            assert func1.__name__ == func2.__name__
            assert func1.__qualname__ == func2.__qualname__
            assert func1.__doc__ == func2.__doc__

        # every implementation has to look like the first one from the outside
        for scorer in self.scorers:
            for name in self._SCORE_FUNCTIONS:
                validate_attrs(getattr(scorer, name), getattr(self.scorers[0], name))

        # the C++ implementations additionally have to carry the _RF_Scorer attribute
        for scorer in self.cpp_scorers:
            for name in self._SCORE_FUNCTIONS:
                assert hasattr(getattr(scorer, name), "_RF_Scorer")

        self.get_scorer_flags = get_scorer_flags

    def _alignment_with_all(self, name, s1, s2, catch_exceptions, kwargs):
        """Call the alignment function ``name`` on every scorer and return the common result."""
        results = [
            call_and_maybe_catch(getattr(scorer, name), s1, s2, catch_exceptions=catch_exceptions, **kwargs)
            for scorer in self.scorers
        ]

        # results (or caught exceptions) are compared through their string form
        for result in results:
            assert compare_exceptions(result, results[0])

        if any(isinstance(result, Exception) for result in results):
            raise results[0]

        return results[0]

    def _score_with_all(self, name, s1, s2, catch_exceptions, kwargs):
        """Run the score function ``name`` of every scorer through the tester and return the common score."""
        symmetric = self.get_scorer_flags(s1, s2, **kwargs)["symmetric"]
        tester = symmetric_scorer_tester if symmetric else scorer_tester

        # catch_exceptions is consumed by call_and_maybe_catch, the tester is called without it
        scores = [
            call_and_maybe_catch(tester, getattr(scorer, name), s1, s2, catch_exceptions=catch_exceptions, **kwargs)
            for scorer in self.scorers
        ]

        if any(isinstance(score, Exception) for score in scores):
            for score in scores:
                assert compare_exceptions(score, scores[0])
            raise scores[0]

        # smallest and largest score being approximately equal means all of them are
        scores = sorted(scores)
        assert pytest.approx(scores[0]) == scores[-1]
        return scores[0]

    def _checked_alignments(self, s1, s2, catch_exceptions, kwargs):
        """Return ``(editops, opcodes)`` after checking that they convert into each other."""
        editops_ = self._editops(s1, s2, catch_exceptions=catch_exceptions, **kwargs)
        opcodes_ = self._opcodes(s1, s2, catch_exceptions=catch_exceptions, **kwargs)
        assert opcodes_.as_editops() == editops_
        assert opcodes_ == editops_.as_opcodes()
        return editops_, opcodes_

    def _editops(self, s1, s2, catch_exceptions=False, **kwargs):
        """Editops of all implementations, which have to be identical."""
        return self._alignment_with_all("editops", s1, s2, catch_exceptions, kwargs)

    def _opcodes(self, s1, s2, catch_exceptions=False, **kwargs):
        """Opcodes of all implementations, which have to be identical."""
        return self._alignment_with_all("opcodes", s1, s2, catch_exceptions, kwargs)

    def _distance(self, s1, s2, catch_exceptions=False, **kwargs):
        """Distance of all implementations, which have to agree."""
        return self._score_with_all("distance", s1, s2, catch_exceptions, kwargs)

    def _similarity(self, s1, s2, catch_exceptions=False, **kwargs):
        """Similarity of all implementations, which have to agree."""
        return self._score_with_all("similarity", s1, s2, catch_exceptions, kwargs)

    def _normalized_distance(self, s1, s2, catch_exceptions=False, **kwargs):
        """Normalized distance of all implementations, which have to agree."""
        return self._score_with_all("normalized_distance", s1, s2, catch_exceptions, kwargs)

    def _normalized_similarity(self, s1, s2, catch_exceptions=False, **kwargs):
        """Normalized similarity of all implementations, which have to agree."""
        return self._score_with_all("normalized_similarity", s1, s2, catch_exceptions, kwargs)

    def _validate(self, s1, s2, catch_exceptions=False, **kwargs):
        """
        Compute all four scores without ``score_cutoff`` and check how they relate to each other.

        Checked relations, with ``maximum`` taken from ``get_scorer_flags``:
        ``distance == maximum - similarity`` and the normalized scores being
        the plain scores divided by ``maximum`` (``0.0`` / ``1.0`` when
        ``maximum`` is 0).

        Returns ``(dist, sim, norm_dist, norm_sim)``.
        """
        # todo requires more complex test handling
        kwargs = {k: v for k, v in kwargs.items() if k != "score_cutoff"}

        maximum = self.get_scorer_flags(s1, s2, **kwargs)["maximum"]

        dist = call_and_maybe_catch(self._distance, s1, s2, catch_exceptions=catch_exceptions, **kwargs)
        sim = call_and_maybe_catch(self._similarity, s1, s2, catch_exceptions=catch_exceptions, **kwargs)
        norm_dist = call_and_maybe_catch(self._normalized_distance, s1, s2, catch_exceptions=catch_exceptions, **kwargs)
        norm_sim = call_and_maybe_catch(
            self._normalized_similarity, s1, s2, catch_exceptions=catch_exceptions, **kwargs
        )

        # NOTE(review): only ``dist`` is inspected here. If ``dist`` is a score but one of the
        # other three is a caught exception, the arithmetic below is what fails.
        if isinstance(dist, Exception):
            for other in (sim, norm_dist, norm_sim):
                assert compare_exceptions(dist, other)
            raise dist

        assert pytest.approx(dist) == maximum - sim
        if maximum != 0:
            assert pytest.approx(dist / maximum) == norm_dist
            assert pytest.approx(sim / maximum) == norm_sim
        else:
            assert pytest.approx(0.0) == norm_dist
            assert pytest.approx(1.0) == norm_sim

        return dist, sim, norm_dist, norm_sim

    def distance(self, s1, s2, catch_exceptions=False, **kwargs):
        """Validated distance; with ``score_cutoff`` the distance is computed again including the cutoff."""
        dist, _, _, _ = self._validate(s1, s2, catch_exceptions=catch_exceptions, **kwargs)
        if "score_cutoff" not in kwargs:
            return dist

        return self._distance(s1, s2, catch_exceptions=catch_exceptions, **kwargs)

    def similarity(self, s1, s2, catch_exceptions=False, **kwargs):
        """Validated similarity; with ``score_cutoff`` the similarity is computed again including the cutoff."""
        _, sim, _, _ = self._validate(s1, s2, catch_exceptions=catch_exceptions, **kwargs)
        if "score_cutoff" not in kwargs:
            return sim

        return self._similarity(s1, s2, catch_exceptions=catch_exceptions, **kwargs)

    def normalized_distance(self, s1, s2, catch_exceptions=False, **kwargs):
        """
        Validated normalized distance.

        The cross validation is skipped when ``s1`` or ``s2`` is a missing
        value. With ``score_cutoff`` the score is computed again including the
        cutoff.
        """
        if not is_none(s1) and not is_none(s2):
            _, _, norm_dist, _ = self._validate(s1, s2, catch_exceptions=catch_exceptions, **kwargs)
            # todo we should be able to handle this in a nicer way
            if "score_cutoff" not in kwargs:
                return norm_dist
        return self._normalized_distance(s1, s2, catch_exceptions=catch_exceptions, **kwargs)

    def normalized_similarity(self, s1, s2, catch_exceptions=False, **kwargs):
        """
        Validated normalized similarity.

        The cross validation is skipped when ``s1`` or ``s2`` is a missing
        value. With ``score_cutoff`` the score is computed again including the
        cutoff.
        """
        if not is_none(s1) and not is_none(s2):
            _, _, _, norm_sim = self._validate(s1, s2, catch_exceptions=catch_exceptions, **kwargs)
            if "score_cutoff" not in kwargs:
                return norm_sim
        return self._normalized_similarity(s1, s2, catch_exceptions=catch_exceptions, **kwargs)

    def editops(self, s1, s2, catch_exceptions=False, **kwargs):
        """Editops agreed on by all implementations, cross-checked against the opcodes."""
        editops_, _ = self._checked_alignments(s1, s2, catch_exceptions, kwargs)
        return editops_

    def opcodes(self, s1, s2, catch_exceptions=False, **kwargs):
        """Opcodes agreed on by all implementations, cross-checked against the editops."""
        _, opcodes_ = self._checked_alignments(s1, s2, catch_exceptions, kwargs)
        return opcodes_