1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
|
import csv
import pytest
# Keyword arguments forwarded to open() when the CSV test data is read.
open_kwargs = {
    "encoding": "utf8",
}
def assertAlmostEqual(a, b, places=3):
    """Assert that *a* and *b* differ by less than ``0.1 ** places``.

    Raises AssertionError when the absolute difference reaches the tolerance.
    """
    tolerance = 0.1 ** places
    difference = abs(a - b)
    assert difference < tolerance
# Parameter values for the ``jf`` fixture, one per jellyfish implementation.
implementations = [
    "python",
    "rust",
]
@pytest.fixture(params=implementations)
def jf(request):
    """Return the jellyfish module selected by the fixture parameter.

    ``"python"`` selects ``jellyfish._jellyfish`` and ``"rust"`` selects
    ``jellyfish._rustyfish``; the import happens only for the chosen one.
    """
    backend = request.param
    if backend == "rust":
        from jellyfish import _rustyfish as module
    elif backend == "python":
        import jellyfish._jellyfish as module
    return module
def _load_data(name):
    """Yield each row of ``testdata/<name>.csv`` as a list of strings.

    The path is relative to the current working directory. This is a
    generator, so the file is opened when iteration starts and closed once
    the rows are exhausted.
    """
    path = "testdata/{}.csv".format(name)
    # The csv module documentation asks for newline="" so that line endings
    # embedded in quoted fields reach the reader untranslated.
    with open(path, newline="", **open_kwargs) as f:
        yield from csv.reader(f)
@pytest.mark.parametrize("s1,s2,value", _load_data("jaro_winkler"), ids=str)
def test_jaro_winkler_similarity(jf, s1, s2, value):
    """Compare jaro_winkler_similarity with each row of the jaro_winkler data."""
    expected = float(value)
    actual = jf.jaro_winkler_similarity(s1, s2)
    assertAlmostEqual(actual, expected, places=3)
@pytest.mark.parametrize(
    "s1,s2,value", _load_data("jaro_winkler_longtol"), ids=str
)
def test_jaro_winkler_similarity_longtol(jf, s1, s2, value):
    """Compare jaro_winkler_similarity, third argument True, with the data rows."""
    expected = float(value)
    # The flag is passed positionally, exactly as the data set expects it.
    actual = jf.jaro_winkler_similarity(s1, s2, True)
    assertAlmostEqual(actual, expected, places=3)
@pytest.mark.parametrize("s1,s2,value", _load_data("jaro_distance"), ids=str)
def test_jaro_similarity(jf, s1, s2, value):
    """Compare jaro_similarity with each row of the jaro_distance data."""
    expected = float(value)
    actual = jf.jaro_similarity(s1, s2)
    assertAlmostEqual(actual, expected, places=3)
@pytest.mark.parametrize("s1,s2,value", _load_data("hamming"), ids=str)
def test_hamming_distance(jf, s1, s2, value):
    """Compare hamming_distance with each row of the hamming data."""
    expected = int(value)
    actual = jf.hamming_distance(s1, s2)
    assert actual == expected
@pytest.mark.parametrize("s1,s2,value", _load_data("levenshtein"), ids=str)
def test_levenshtein_distance(jf, s1, s2, value):
    """Compare levenshtein_distance with each row of the levenshtein data."""
    expected = int(value)
    actual = jf.levenshtein_distance(s1, s2)
    assert actual == expected
@pytest.mark.parametrize(
    "s1,s2,value", _load_data("damerau_levenshtein"), ids=str
)
def test_damerau_levenshtein_distance(jf, s1, s2, value):
    """Compare damerau_levenshtein_distance with each row of its data file."""
    expected = int(value)
    actual = jf.damerau_levenshtein_distance(s1, s2)
    assert actual == expected
@pytest.mark.parametrize("s1,code", _load_data("soundex"), ids=str)
def test_soundex(jf, s1, code):
    """Compare soundex output with the code in each row of the soundex data."""
    encoded = jf.soundex(s1)
    assert encoded == code
@pytest.mark.parametrize("s1,code", _load_data("metaphone"), ids=str)
def test_metaphone(jf, s1, code):
    """Compare metaphone output with the code in each row of the metaphone data."""
    encoded = jf.metaphone(s1)
    assert encoded == code
@pytest.mark.parametrize("s1,s2", _load_data("nysiis"), ids=str)
def test_nysiis(jf, s1, s2):
    """Compare nysiis output for s1 with s2 from each row of the nysiis data."""
    encoded = jf.nysiis(s1)
    assert encoded == s2
@pytest.mark.parametrize("s1,s2", _load_data("match_rating_codex"), ids=str)
def test_match_rating_codex(jf, s1, s2):
    """Compare match_rating_codex output for s1 with s2 from each data row."""
    encoded = jf.match_rating_codex(s1)
    assert encoded == s2
@pytest.mark.parametrize(
    "s1,s2,value", _load_data("match_rating_comparison"), ids=str
)
def test_match_rating_comparison(jf, s1, s2, value):
    """Compare match_rating_comparison with each row of its data file.

    The CSV stores the expected result as text, so it is mapped back to
    ``True``, ``False`` or ``None`` and checked by identity.
    """
    text_to_result = {"True": True, "False": False, "None": None}
    expected = text_to_result[value]
    outcome = jf.match_rating_comparison(s1, s2)
    assert outcome is expected
def test_jaro_winkler_long_tolerance(jf):
    """Check jaro_winkler_similarity with long_tolerance off and on."""
    left, right = "two long strings", "two long stringz"
    plain = jf.jaro_winkler_similarity(left, right, long_tolerance=False)
    tolerant = jf.jaro_winkler_similarity(left, right, long_tolerance=True)
    # The expected scores differ, so long_tolerance must change the result.
    assertAlmostEqual(plain, 0.975)
    assertAlmostEqual(tolerant, 0.984)
def test_damerau_levenshtein_distance_type(jf):
    """damerau_levenshtein_distance accepts str and raises TypeError for bytes."""
    # Assert the str result, as the other *_type tests do, rather than
    # discarding it.
    assert jf.damerau_levenshtein_distance("abc", "abc") == 0
    # No ``as`` target: the exception object is never inspected.
    with pytest.raises(TypeError):
        jf.damerau_levenshtein_distance(b"abc", b"abc")
def test_levenshtein_distance_type(jf):
    """levenshtein_distance accepts str and raises TypeError for bytes."""
    assert jf.levenshtein_distance("abc", "abc") == 0
    # No ``as`` target: the exception object is never inspected.
    with pytest.raises(TypeError):
        jf.levenshtein_distance(b"abc", b"abc")
def test_jaro_similarity_type(jf):
    """jaro_similarity accepts str and raises TypeError for bytes."""
    assert jf.jaro_similarity("abc", "abc") == 1
    # No ``as`` target: the exception object is never inspected.
    with pytest.raises(TypeError):
        jf.jaro_similarity(b"abc", b"abc")
def test_jaro_winkler_type(jf):
    """jaro_winkler_similarity accepts str and raises TypeError for bytes."""
    assert jf.jaro_winkler_similarity("abc", "abc") == 1
    # No ``as`` target: the exception object is never inspected.
    with pytest.raises(TypeError):
        jf.jaro_winkler_similarity(b"abc", b"abc")
def test_mra_comparison_type(jf):
    """match_rating_comparison accepts str and raises TypeError for bytes."""
    assert jf.match_rating_comparison("abc", "abc") is True
    # No ``as`` target: the exception object is never inspected.
    with pytest.raises(TypeError):
        jf.match_rating_comparison(b"abc", b"abc")
def test_hamming_type(jf):
    """hamming_distance accepts str and raises TypeError for bytes."""
    assert jf.hamming_distance("abc", "abc") == 0
    # No ``as`` target: the exception object is never inspected.
    with pytest.raises(TypeError):
        jf.hamming_distance(b"abc", b"abc")
def test_soundex_type(jf):
    """soundex accepts str and raises TypeError for bytes."""
    assert jf.soundex("ABC") == "A120"
    # No ``as`` target: the exception object is never inspected.
    with pytest.raises(TypeError):
        jf.soundex(b"ABC")
def test_metaphone_type(jf):
    """metaphone accepts str and raises TypeError for bytes."""
    assert jf.metaphone("abc") == "ABK"
    # No ``as`` target: the exception object is never inspected.
    with pytest.raises(TypeError):
        jf.metaphone(b"abc")
def test_nysiis_type(jf):
    """nysiis accepts str and raises TypeError for bytes."""
    assert jf.nysiis("abc") == "ABC"
    # No ``as`` target: the exception object is never inspected.
    with pytest.raises(TypeError):
        jf.nysiis(b"abc")
def test_mr_codex_type(jf):
    """match_rating_codex accepts str and raises TypeError for bytes."""
    assert jf.match_rating_codex("abc") == "ABC"
    # No ``as`` target: the exception object is never inspected.
    with pytest.raises(TypeError):
        jf.match_rating_codex(b"abc")
def test_mr_codex_bad_string(jf):
    """match_rating_codex raises ValueError for the input ``"i’m"``."""
    # Only the raising call sits inside the context manager. The previous
    # result binding and print could never run once the call raised.
    with pytest.raises(ValueError):
        jf.match_rating_codex("i’m")
|