# Tests for the language-inference and coherence helpers in charset_normalizer.cd.
from __future__ import annotations
import pytest
from charset_normalizer.cd import (
encoding_languages,
filter_alt_coherence_matches,
get_target_features,
is_multi_byte_encoding,
mb_encoding_languages,
)
@pytest.mark.parametrize(
    ("iana_encoding", "expected_languages"),
    [
        ("cp864", ["Arabic", "Farsi"]),
        ("cp862", ["Hebrew"]),
        ("cp737", ["Greek"]),
        ("cp424", ["Hebrew"]),
        ("cp273", ["Latin Based"]),
        ("johab", ["Korean"]),
        ("shift_jis", ["Japanese"]),
        ("mac_greek", ["Greek"]),
        ("iso2022_jp", ["Japanese"]),
    ],
)
def test_infer_language_from_cp(iana_encoding, expected_languages):
    """Check that every expected language is inferred for a code page.

    Multi-byte encodings are resolved through ``mb_encoding_languages``;
    all other encodings go through ``encoding_languages``.
    """
    # Pick the lookup helper that matches the kind of encoding under test.
    if is_multi_byte_encoding(iana_encoding):
        detected = mb_encoding_languages(iana_encoding)
    else:
        detected = encoding_languages(iana_encoding)

    failure_message = "Wrongly detected language for given code page"

    # Only membership is required: extra detected languages are tolerated.
    for wanted in expected_languages:
        assert wanted in detected, failure_message
@pytest.mark.parametrize(
    ("language", "expected_have_accents", "expected_pure_latin"),
    [
        ("English", False, True),
        ("French", True, True),
        ("Hebrew", False, False),
        ("Arabic", False, False),
        ("Vietnamese", True, True),
        ("Turkish", True, True),
    ],
)
def test_target_features(language, expected_have_accents, expected_pure_latin):
    """Check the (have_accents, pure_latin) pair reported for a language."""
    have_accents, pure_latin = get_target_features(language)

    # Identity checks: the helper must return real booleans, not merely
    # truthy or falsy values.
    assert have_accents is expected_have_accents
    assert pure_latin is expected_pure_latin
@pytest.mark.parametrize(
    ("matches", "expected_return"),
    [
        # One alternative entry with a higher ratio than the base entry.
        (
            [("English", 0.88), ("English—", 0.99)],
            [("English", 0.99)],
        ),
        # Two alternative entries; the highest ratio is the expected one.
        (
            [("English", 0.88), ("English—", 0.99), ("English——", 0.999)],
            [("English", 0.999)],
        ),
        # Alternative entry with a lower ratio than the base entry.
        (
            [("English", 0.88), ("English—", 0.77)],
            [("English", 0.88)],
        ),
        # Distinct languages without alternatives are expected unchanged.
        (
            [("English", 0.88), ("Italian", 0.77)],
            [("English", 0.88), ("Italian", 0.77)],
        ),
    ],
)
def test_filter_alt_coherence_matches(matches, expected_return):
    """Check the output of ``filter_alt_coherence_matches`` for each case."""
    assert filter_alt_coherence_matches(matches) == expected_return
|