File: test_edge_case.py

package info (click to toggle)
python-charset-normalizer 3.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 712 kB
  • sloc: python: 5,434; makefile: 25; sh: 17
file content (59 lines) | stat: -rw-r--r-- 1,644 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from __future__ import annotations

import platform

import pytest

from charset_normalizer import from_bytes


@pytest.mark.xfail(
    platform.python_version_tuple()[:2] == ("3", "7"),
    reason="Unicode database is too old for this case (Python 3.7)",
)
def test_unicode_edge_case():
    """Check that a BOM-prefixed UTF-8 payload is detected as ``utf_8``.

    The payload is the UTF-8 byte order mark followed by a single four-byte
    sequence (the encoding of U+1FA73). The test is marked as an expected
    failure when running on Python 3.7.
    """
    payload = b"\xef\xbb\xbf\xf0\x9f\xa9\xb3"
    no_match_hint = "Payload should have given something, detection failure"

    matches = from_bytes(payload)
    detected = matches.best()

    assert detected is not None, no_match_hint
    assert detected.encoding == "utf_8", "UTF-8 payload wrongly detected"


def test_issue_gh520():
    """Basic Latin must still be reported for a mostly-ASCII payload.

    Regression test for issue gh520: the payload is an ASCII path with one
    two-byte non-ASCII sequence in the middle, and the best match is expected
    to list "Basic Latin" among its alphabets.
    """
    payload = b"/includes/webform.compon\xd2\xaants.inc/"
    no_match_hint = "Payload should have given something, detection failure"

    matches = from_bytes(payload)
    detected = matches.best()

    assert detected is not None, no_match_hint

    reported_alphabets = detected.alphabets
    assert "Basic Latin" in reported_alphabets


def test_issue_gh509():
    """A payload of two plain ASCII punctuation marks is detected as ASCII.

    Regression test for issue gh509.
    """
    payload = b");"
    no_match_hint = "Payload should have given something, detection failure"

    matches = from_bytes(payload)
    detected = matches.best()

    assert detected is not None, no_match_hint
    assert detected.encoding == "ascii"


def test_issue_gh498():
    """The best match for this payload must report a Cyrillic alphabet.

    Regression test for issue gh498, where this payload was wrongly
    identified as utf-16-le. The leading high bytes are presumably Cyrillic
    text in a single-byte code page, followed by an ASCII file name.
    """
    payload = b"\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx"
    no_match_hint = "Payload should have given something, detection failure"

    matches = from_bytes(payload)
    detected = matches.best()

    assert detected is not None, no_match_hint

    reported_alphabets = detected.alphabets
    assert "Cyrillic" in reported_alphabets