from __future__ import annotations

import pytest

from charset_normalizer import from_bytes
from charset_normalizer.constant import TOO_BIG_SEQUENCE
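

# Payload larger than TOO_BIG_SEQUENCE carrying a UTF-8 BOM/SIG: detection
# should rely on the signature and keep the decoded string readily available.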
def test_large_payload_u8_sig_basic_entry():
    payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8_sig")
    best_guess = from_bytes(payload).best()

    assert best_guess is not None, "Large U8 payload case detection completely failed"
    assert (
        best_guess.encoding == "utf_8"
    ), "Large U8 payload case detection wrongly detected!"
    assert best_guess.bom is True, "SIG/BOM property should be True"
    assert len(best_guess.raw) == len(
        payload
    ), "Large payload should remain untouched when accessed through .raw"
    assert (
        best_guess._string is not None
    ), "str should be decoded before direct access (sig available)"
def test_large_payload_ascii_basic_entry():
    payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8")
    best_guess = from_bytes(payload).best()

    assert (
        best_guess is not None
    ), "Large ASCII payload case detection completely failed"
    assert (
        best_guess.encoding == "ascii"
    ), "Large ASCII payload case detection wrongly detected!"
    assert best_guess.bom is False, "SIG/BOM property should be False"
    assert len(best_guess.raw) == len(
        payload
    ), "Large payload should remain untouched when accessed through .raw"
    assert best_guess._string is None, "str should not be decoded until direct access"
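

# A large, mostly-ASCII payload that ends with multi-byte UTF-8 content must
# not be mistaken for plain ASCII.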
def test_misleading_large_sequence():
    content = (
        ("hello simple ascii " * TOO_BIG_SEQUENCE)
        + "我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。"
    ).encode("utf_8")

    guesses = from_bytes(content)

    assert len(guesses) > 0
    match = guesses.best()

    assert match is not None
    assert match._string is not None, "str should be cached as only match"
    assert match.encoding == "utf_8"
    assert str(match) is not None
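

# A minimal complementary sketch, not part of the original suite. It assumes
# that payloads below TOO_BIG_SEQUENCE are decoded eagerly during detection
# (the counterpart of the deferred behaviour asserted above); the test name
# and the expectation on `_string` are assumptions, not documented guarantees.
def test_small_payload_ascii_eager_decode():
    payload = ("0" * 1024).encode("utf_8")
    best_guess = from_bytes(payload).best()

    assert best_guess is not None, "Small ASCII payload detection completely failed"
    assert best_guess.encoding == "ascii", "Small ASCII payload wrongly detected!"
    # Assumption: sequences below TOO_BIG_SEQUENCE have their string cached up-front.
    assert best_guess._string is not None, "str expected to be cached for small payloads"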