File: test_preemptive_detection.py

from __future__ import annotations

import pytest

from charset_normalizer import CharsetMatch
from charset_normalizer.utils import any_specified_encoding
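
# any_specified_encoding() (from charset_normalizer.utils) looks for an explicitly
# declared charset in a payload (an XML declaration, an HTML <meta charset=...> tag,
# or a PEP 263 "coding:" comment, as exercised below) and returns the corresponding
# Python codec name, or None when no usable declaration is found.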


@pytest.mark.parametrize(
    "payload, expected_encoding",
    [
        (b'<?xml version="1.0" encoding="EUC-JP"?>', "euc_jp"),
        (b'<html><head><meta charset="utf-8"></head></html>', "utf_8"),
        (b'<html><head><meta charset="utf-57"></head></html>', None),
        (b"# coding: utf-8", "utf_8"),
        (b'<?xml version="1.0" encoding="UTF-8"?>', "utf_8"),
        (b'<?xml version="1.0" encoding="US-ASCII"?>', "ascii"),
        (b'<?xml version="1.0" encoding="JohaB"?>', "johab"),
        (b'<?xml version="1.0" encoding="ibm037"?>', "cp037"),
        (b"<html><head><meta charset=WINDOWS-1252></head></html>", "cp1252"),
        (b'<html><head><meta charset="WINDOWS-1256"></head></html>', "cp1256"),
    ],
)
def test_detect_most_common_body_encoding(payload, expected_encoding):
    specified_encoding = any_specified_encoding(payload)
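    # Declared labels are expected to be normalized to Python codec names
    # (e.g. "ibm037" becomes "cp037", "WINDOWS-1252" becomes "cp1252"); unknown
    # labels such as "utf-57" should yield None rather than raising.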

    assert (
        specified_encoding == expected_encoding
    ), "Unable to properly determine the declared encoding from the given body"


@pytest.mark.parametrize(
    "payload, expected_outcome",
    [
        (
            b'<?xml version="1.0" encoding="EUC-JP"?>',
            b'<?xml version="1.0" encoding="utf-8"?>',
        ),
        (
            b'<html><head><meta charset="utf-8"></head></html>',
            b'<html><head><meta charset="utf-8"></head></html>',
        ),
        (
            b'<html><head><meta charset="utf-57"></head></html>',
            b'<html><head><meta charset="utf-57"></head></html>',
        ),
        (b"# coding: utf-8", b"# coding: utf-8"),
        (
            b'<?xml version="1.0" encoding="UTF-8"?>',
            b'<?xml version="1.0" encoding="UTF-8"?>',
        ),
        (
            b'<?xml version="1.0" encoding="US-ASCII"?>',
            b'<?xml version="1.0" encoding="utf-8"?>',
        ),
        (
            b'<?xml version="1.0" encoding="JohaB"?>',
            b'<?xml version="1.0" encoding="utf-8"?>',
        ),
        (
            b"<html><head><meta charset=WINDOWS-1252></head></html>",
            b"<html><head><meta charset=utf-8></head></html>",
        ),
        (
            b'<html><head><meta charset="WINDOWS-1256"></head></html>',
            b'<html><head><meta charset="utf-8"></head></html>',
        ),
    ],
)
def test_preemptive_mark_replacement(payload, expected_outcome):
    """
    When generating (to Unicode converted) bytes, we want to change any potential declarative charset
    to utf-8. This test that.
    """
    specified_encoding = any_specified_encoding(payload)

    detected_encoding = (
        specified_encoding if specified_encoding is not None else "utf-8"
    )
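
    # Build a CharsetMatch by hand: after the payload and the encoding, the
    # positional arguments are the mean mess ratio (0.0), the BOM/signature flag
    # (False) and the coherence (language) matches ([]). preemptive_declaration
    # tells output() which declared charset, if any, must be rewritten to utf-8.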

    m = CharsetMatch(
        payload,
        detected_encoding,
        0.0,
        False,
        [],
        preemptive_declaration=specified_encoding,
    )

    transformed_output = m.output()

    assert transformed_output == expected_outcome
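

# Outside of these tests, the same rewriting is reachable through the public API.
# A minimal sketch (illustrative only; the exact match produced by detection may
# differ from the hand-built CharsetMatch above):
#
#     from charset_normalizer import from_bytes
#
#     best_guess = from_bytes(b'<?xml version="1.0" encoding="EUC-JP"?>').best()
#     if best_guess is not None:
#         # output() re-encodes to UTF-8; when a declarative charset was detected,
#         # it is expected to be rewritten to utf-8 in the result.
#         rewritten = best_guess.output()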