File: test_coherence_detection.py

package info (click to toggle)
python-charset-normalizer 3.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 712 kB
  • sloc: python: 5,434; makefile: 25; sh: 17
file content (108 lines) | stat: -rw-r--r-- 2,741 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from __future__ import annotations

import pytest

from charset_normalizer.cd import (
    encoding_languages,
    filter_alt_coherence_matches,
    get_target_features,
    is_multi_byte_encoding,
    mb_encoding_languages,
)


@pytest.mark.parametrize(
    "iana_encoding, expected_languages",
    [
        ("cp864", ["Arabic", "Farsi"]),
        ("cp862", ["Hebrew"]),
        ("cp737", ["Greek"]),
        ("cp424", ["Hebrew"]),
        ("cp273", ["Latin Based"]),
        ("johab", ["Korean"]),
        ("shift_jis", ["Japanese"]),
        ("mac_greek", ["Greek"]),
        ("iso2022_jp", ["Japanese"]),
    ],
)
def test_infer_language_from_cp(iana_encoding, expected_languages):
    """Check that each expected language is reported for the given encoding.

    The lookup helper is chosen per encoding: ``mb_encoding_languages`` when
    ``is_multi_byte_encoding`` is true for it, ``encoding_languages`` otherwise.
    Only membership is checked, so extra reported languages do not fail the test.
    """
    if is_multi_byte_encoding(iana_encoding):
        detected = mb_encoding_languages(iana_encoding)
    else:
        detected = encoding_languages(iana_encoding)

    # Stop at the first expected language that is missing from the result.
    for wanted in expected_languages:
        assert wanted in detected, "Wrongly detected language for given code page"


@pytest.mark.parametrize(
    "language, expected_have_accents, expected_pure_latin",
    [
        ("English", False, True),
        ("French", True, True),
        ("Hebrew", False, False),
        ("Arabic", False, False),
        ("Vietnamese", True, True),
        ("Turkish", True, True),
    ],
)
def test_target_features(language, expected_have_accents, expected_pure_latin):
    """Check the two flags returned by ``get_target_features`` for a language.

    The result is unpacked as a pair (have-accents flag, pure-latin flag) and
    each flag is compared by identity against the expected boolean.
    """
    has_accents, is_pure_latin = get_target_features(language)

    # Identity (not equality) comparison: the values must be the bool
    # singletons themselves, not merely truthy/falsy equivalents.
    assert has_accents is expected_have_accents
    assert is_pure_latin is expected_pure_latin


@pytest.mark.parametrize(
    "matches, expected_return",
    [
        # One em-dash-suffixed variant with a higher ratio than the plain name:
        # a single entry under the plain name with the higher ratio is expected.
        (
            [("English", 0.88), ("English—", 0.99)],
            [("English", 0.99)],
        ),
        # Two em-dash-suffixed variants: the highest of the three ratios is expected.
        (
            [("English", 0.88), ("English—", 0.99), ("English——", 0.999)],
            [("English", 0.999)],
        ),
        # The em-dash-suffixed variant has the lower ratio: the plain entry's
        # ratio is expected.
        (
            [("English", 0.88), ("English—", 0.77)],
            [("English", 0.88)],
        ),
        # Two different languages, no em-dash variants: input expected unchanged.
        (
            [("English", 0.88), ("Italian", 0.77)],
            [("English", 0.88), ("Italian", 0.77)],
        ),
    ],
)
def test_filter_alt_coherence_matches(matches, expected_return):
    """Check ``filter_alt_coherence_matches`` output against the expected list.

    Each case supplies a list of ``(language, ratio)`` tuples and the exact
    list the function is expected to return for it.
    """
    assert filter_alt_coherence_matches(matches) == expected_return