File: test_large_payload.py

package info (click to toggle)
python-charset-normalizer 3.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 712 kB
  • sloc: python: 5,434; makefile: 25; sh: 17
file content (55 lines) | stat: -rw-r--r-- 1,951 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from __future__ import annotations

import pytest

from charset_normalizer import from_bytes
from charset_normalizer.constant import TOO_BIG_SEQUENCE


def test_large_payload_u8_sig_basic_entry():
    """Check detection of a large all-"0" payload encoded with ``utf_8_sig``.

    The text is ``TOO_BIG_SEQUENCE`` characters long before encoding. The best
    match must report ``utf_8`` with the BOM flag set, expose raw bytes of the
    same length as the input, and already hold its decoded string.
    """
    sig_bytes = ("0" * TOO_BIG_SEQUENCE).encode("utf_8_sig")
    winner = from_bytes(sig_bytes).best()

    assert winner is not None, "Large U8 payload case detection completely failed"

    detected = winner.encoding
    assert detected == "utf_8", "Large U8 payload case detection wrongly detected!"

    has_bom = winner.bom
    assert has_bom is True, "SIG/BOM property should be True"

    expected_size = len(sig_bytes)
    assert (
        len(winner.raw) == expected_size
    ), "Large payload should remain untouched when accessed through .raw"

    # Read the private cache directly so the check itself cannot trigger decoding.
    cached_text = winner._string
    assert (
        cached_text is not None
    ), "str should be decoded before direct access (sig available)"


def test_large_payload_ascii_basic_entry():
    """Check detection of a large all-"0" payload encoded with ``utf_8``.

    The text is ``TOO_BIG_SEQUENCE`` characters long. The best match must
    report ``ascii`` without a BOM, expose raw bytes of the same length as the
    input, and must not yet hold a decoded string.
    """
    plain_bytes = ("0" * TOO_BIG_SEQUENCE).encode("utf_8")
    candidates = from_bytes(plain_bytes)
    top_match = candidates.best()

    assert (
        top_match is not None
    ), "Large ASCII payload case detection completely failed"

    encoding_name = top_match.encoding
    assert (
        encoding_name == "ascii"
    ), "Large ASCII payload case detection wrongly detected!"

    assert top_match.bom is False, "SIG/BOM property should be False"

    raw_size = len(top_match.raw)
    assert raw_size == len(
        plain_bytes
    ), "Large payload should remain untouched when accessed through .raw"

    # Inspect the private cache directly; going through str() would decode it.
    lazy_text = top_match._string
    assert lazy_text is None, "str should not be decoded until direct access"


def test_misleading_large_sequence():
    """Check detection when a long ASCII run ends with a short non-ASCII tail.

    The ASCII phrase is repeated ``TOO_BIG_SEQUENCE`` times, a short non-ASCII
    sentence is appended, and the whole text is encoded as ``utf_8``. The best
    match must exist, hold its decoded string, and report ``utf_8``.
    """
    ascii_run = "hello simple ascii " * TOO_BIG_SEQUENCE
    non_ascii_tail = "我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。"
    content = (ascii_run + non_ascii_tail).encode("utf_8")

    results = from_bytes(content)
    assert len(results) > 0

    top = results.best()
    assert top is not None
    assert top._string is not None, "str should be cached as only match"
    assert top.encoding == "utf_8"

    rendered = str(top)
    assert rendered is not None