from __future__ import annotations

import pytest

from charset_normalizer import CharsetMatch
from charset_normalizer.utils import any_specified_encoding


# NOTE(review): almost every payload in the tables below is an empty byte
# string, yet the expectations name specific encodings. Presumably the
# original markup carrying the charset declarations was lost somewhere --
# confirm against the upstream test data before trusting these cases.
@pytest.mark.parametrize(
    "payload, expected_encoding",
    [
        (b'', "euc_jp"),
        (b'', "utf_8"),
        (b'', None),
        (b"# coding: utf-8", "utf_8"),
        (b'', "utf_8"),
        (b'', "ascii"),
        (b'', "johab"),
        (b'', "cp037"),
        (b"", "cp1252"),
        (b'', "cp1256"),
    ],
)
def test_detect_most_common_body_encoding(payload, expected_encoding):
    """Check the encoding that any_specified_encoding reports for a payload.

    Each case pairs raw bytes with the encoding name (or None) that the
    helper is expected to return for them.
    """
    declared = any_specified_encoding(payload)

    assert (
        declared == expected_encoding
    ), "Unable to determine properly encoding from given body"


@pytest.mark.parametrize(
    "payload, expected_outcome",
    [
        (
            b'',
            b'',
        ),
        (
            b'',
            b'',
        ),
        (
            b'',
            b'',
        ),
        (b"# coding: utf-8", b"# coding: utf-8"),
        (
            b'',
            b'',
        ),
        (
            b'',
            b'',
        ),
        (
            b'',
            b'',
        ),
        (
            b"",
            b"",
        ),
        (
            b'',
            b'',
        ),
    ],
)
def test_preemptive_mark_replacement(payload, expected_outcome):
    """Check the bytes produced by CharsetMatch.output() for a payload.

    When generating (to Unicode converted) bytes, any charset declared
    inside the content is meant to be changed to utf-8. This test checks
    the output against the expected bytes for each case.
    """
    declared = any_specified_encoding(payload)

    # Fall back to utf-8 when the payload carries no usable declaration.
    if declared is not None:
        encoding_for_match = declared
    else:
        encoding_for_match = "utf-8"

    match = CharsetMatch(
        payload,
        encoding_for_match,
        0.0,
        False,
        [],
        preemptive_declaration=declared,
    )

    assert match.output() == expected_outcome