File: test_encoding.py (python-w3lib 2.3.1-2)

from __future__ import annotations

import codecs
import unittest
from typing import Any

from w3lib.encoding import (
    html_body_declared_encoding,
    html_to_unicode,
    http_content_type_encoding,
    read_bom,
    resolve_encoding,
    to_unicode,
)


class RequestEncodingTests(unittest.TestCase):
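    # Each fragment below declares UTF-8 in a different way; all of them
    # should be detected by html_body_declared_encoding().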
    utf8_fragments = [
        # Content-Type as meta http-equiv
        b"""<meta http-equiv="content-type" content="text/html;charset=UTF-8" />""",
        b"""\n<meta http-equiv="Content-Type"\ncontent="text/html; charset=utf-8">""",
        b"""<meta http-equiv="Content-Type" content="text/html" charset="utf-8">""",
        b"""<meta http-equiv=Content-Type content="text/html" charset='utf-8'>""",
        b"""<meta http-equiv="Content-Type" content\t=\n"text/html" charset\t="utf-8">""",
        b"""<meta content="text/html; charset=utf-8"\n http-equiv='Content-Type'>""",
        b""" bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""",
        # html5 meta charset
        b"""<meta charset="utf-8">""",
        b"""<meta charset =\n"utf-8">""",
        # xml encoding
        b"""<?xml version="1.0" encoding="utf-8"?>""",
    ]

    def test_bom(self):
        # CJK "water" character (U+6C34)
        water_unicode = "\u6C34"
        # BOM followed by the water character in each encoding
        utf16be = b"\xfe\xff\x6c\x34"
        utf16le = b"\xff\xfe\x34\x6c"
        utf32be = b"\x00\x00\xfe\xff\x00\x00\x6c\x34"
        utf32le = b"\xff\xfe\x00\x00\x34\x6c\x00\x00"
        for string in (utf16be, utf16le, utf32be, utf32le):
            bom_encoding, bom = read_bom(string)
            assert bom_encoding is not None
            assert bom is not None
            decoded = string[len(bom) :].decode(bom_encoding)
            self.assertEqual(water_unicode, decoded)
        # Body without BOM
        enc, bom = read_bom(b"foo")
        self.assertEqual(enc, None)
        self.assertEqual(bom, None)
        # Empty body
        enc, bom = read_bom(b"")
        self.assertEqual(enc, None)
        self.assertEqual(bom, None)

    def test_http_encoding_header(self):
        header_value = "Content-Type: text/html; charset=ISO-8859-4"
        extracted = http_content_type_encoding(header_value)
        self.assertEqual(extracted, "iso8859-4")
        self.assertEqual(None, http_content_type_encoding("something else"))

    def test_html_body_declared_encoding(self):
        for fragment in self.utf8_fragments:
            encoding = html_body_declared_encoding(fragment)
            self.assertEqual(encoding, "utf-8", fragment)
        self.assertEqual(None, html_body_declared_encoding(b"something else"))
        self.assertEqual(
            None,
            html_body_declared_encoding(
                b"""
            <head></head><body>
            this isn't searched
            <meta charset="utf-8">
        """
            ),
        )
        self.assertEqual(
            None,
            html_body_declared_encoding(
                b"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
            ),
        )

    def test_html_body_declared_encoding_unicode(self):
        # html_body_declared_encoding should work when unicode body is passed
        self.assertEqual(None, html_body_declared_encoding("something else"))

        for fragment in self.utf8_fragments:
            encoding = html_body_declared_encoding(fragment.decode("utf8"))
            self.assertEqual(encoding, "utf-8", fragment)

        self.assertEqual(
            None,
            html_body_declared_encoding(
                """
            <head></head><body>
            this isn't searched
            <meta charset="utf-8">
        """
            ),
        )
        self.assertEqual(
            None,
            html_body_declared_encoding(
                """<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
            ),
        )


class CodecsEncodingTestCase(unittest.TestCase):
    def test_resolve_encoding(self):
        self.assertEqual(resolve_encoding("latin1"), "cp1252")
        self.assertEqual(resolve_encoding(" Latin-1"), "cp1252")
        self.assertEqual(resolve_encoding("gb_2312-80"), "gb18030")
        self.assertEqual(resolve_encoding("unknown encoding"), None)


class UnicodeDecodingTestCase(unittest.TestCase):
    def test_utf8(self):
        self.assertEqual(to_unicode(b"\xc2\xa3", "utf-8"), "\xa3")

    def test_invalid_utf8(self):
        self.assertEqual(to_unicode(b"\xc2\xc2\xa3", "utf-8"), "\ufffd\xa3")


def ct(charset: str | None) -> str | None:
    # Build a Content-Type header value for the given charset, or return None
    # when no charset is given (i.e. no header at all).
    return "Content-Type: text/html; charset=" + charset if charset else None


def norm_encoding(enc: str) -> str:
    # Normalize an encoding name to its canonical codec name, so that aliases
    # such as "latin1" and "iso-8859-1" compare equal.
    return codecs.lookup(enc).name


class HtmlConversionTests(unittest.TestCase):
    def test_unicode_body(self):
        unicode_string = "\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442"
        original_string = unicode_string.encode("cp1251")
        encoding, body_unicode = html_to_unicode(ct("cp1251"), original_string)
        # the decoded body must be a str equal to the original text
        self.assertTrue(isinstance(body_unicode, str))
        self.assertEqual(body_unicode, unicode_string)

    def _assert_encoding(
        self,
        content_type: str | None,
        body: bytes,
        expected_encoding: str,
        expected_unicode: str | list[str],
    ) -> None:
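        # Helper: decode `body` with html_to_unicode() and check both the
        # detected encoding and the decoded text; `expected_unicode` may be a
        # list of acceptable results where implementations legitimately differ.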
        assert not isinstance(body, str)
        encoding, body_unicode = html_to_unicode(ct(content_type), body)
        self.assertTrue(isinstance(body_unicode, str))
        self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))

        if isinstance(expected_unicode, str):
            self.assertEqual(body_unicode, expected_unicode)
        else:
            self.assertTrue(
                body_unicode in expected_unicode,
                f"{body_unicode} is not in {expected_unicode}",
            )

    def test_content_type_and_conversion(self):
        """Test content type header is interpreted and text converted as
        expected
        """
        self._assert_encoding("utf-8", b"\xc2\xa3", "utf-8", "\xa3")
        # the scrapy tests have something like the following, but that input looks invalid:
        # self._assert_encoding('', "\xa3", 'utf-8', "\xa3")
        # iso-8859-1 is overridden to cp1252
        self._assert_encoding("iso-8859-1", b"\xa3", "cp1252", "\xa3")
        self._assert_encoding("", b"\xc2\xa3", "utf-8", "\xa3")
        self._assert_encoding("none", b"\xc2\xa3", "utf-8", "\xa3")
        self._assert_encoding("gb2312", b"\xa8D", "gb18030", "\u2015")
        self._assert_encoding("gbk", b"\xa8D", "gb18030", "\u2015")
        self._assert_encoding("big5", b"\xf9\xda", "big5hkscs", "\u6052")

    def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
        # unlike scrapy, the BOM is stripped
        self._assert_encoding(
            "utf-8", b"\xef\xbb\xbfWORD\xe3\xabWORD2", "utf-8", "WORD\ufffdWORD2"
        )
        self._assert_encoding(
            None, b"\xef\xbb\xbfWORD\xe3\xabWORD2", "utf-8", "WORD\ufffdWORD2"
        )

    def test_utf8_unexpected_end_of_data_with_valid_utf8_BOM(self):
        # Python implementations handle unexpected end of UTF8 data
        # differently (see https://bugs.pypy.org/issue1536).
        # It is hard to fix this for PyPy in w3lib, so the test
        # is permissive.

        # unlike scrapy, the BOM is stripped
        self._assert_encoding(
            "utf-8",
            b"\xef\xbb\xbfWORD\xe3\xab",
            "utf-8",
            ["WORD\ufffd\ufffd", "WORD\ufffd"],
        )
        self._assert_encoding(
            None,
            b"\xef\xbb\xbfWORD\xe3\xab",
            "utf-8",
            ["WORD\ufffd\ufffd", "WORD\ufffd"],
        )

    def test_replace_wrong_encoding(self):
        """Test invalid chars are replaced properly"""
        encoding, body_unicode = html_to_unicode(ct("utf-8"), b"PREFIX\xe3\xabSUFFIX")
        # XXX: Policy for replacing invalid chars may suffer minor variations
        # but it should always contain the unicode replacement char ('\ufffd')
        assert "\ufffd" in body_unicode, repr(body_unicode)
        assert "PREFIX" in body_unicode, repr(body_unicode)
        assert "SUFFIX" in body_unicode, repr(body_unicode)

        # Do not destroy html tags due to encoding bugs
        encoding, body_unicode = html_to_unicode(ct("utf-8"), b"\xf0<span>value</span>")
        assert "<span>value</span>" in body_unicode, repr(body_unicode)

    def _assert_encoding_detected(
        self,
        content_type: str | None,
        expected_encoding: str,
        body: bytes,
        **kwargs: Any,
    ) -> None:
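        # Helper: run html_to_unicode() and check only which encoding was
        # detected; the decoded text itself is not compared.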
        assert not isinstance(body, str)
        encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
        self.assertTrue(isinstance(body_unicode, str))
        self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))

    def test_BOM(self):
        # utf-16 cases already tested, as is the BOM detection function

        # BOM takes precedence, ahead of the http header
        bom_be_str = codecs.BOM_UTF16_BE + "hi".encode("utf-16-be")
        expected = "hi"
        self._assert_encoding("utf-8", bom_be_str, "utf-16-be", expected)

        # BOM is stripped when present
        bom_utf8_str = codecs.BOM_UTF8 + b"hi"
        self._assert_encoding("utf-8", bom_utf8_str, "utf-8", "hi")
        self._assert_encoding(None, bom_utf8_str, "utf-8", "hi")

    def test_utf16_32(self):
        # tools.ietf.org/html/rfc2781 section 4.3

        # use the BOM to pick the byte order, and strip it from the output
        bom_be_str = codecs.BOM_UTF16_BE + "hi".encode("utf-16-be")
        self._assert_encoding("utf-16", bom_be_str, "utf-16-be", "hi")
        self._assert_encoding(None, bom_be_str, "utf-16-be", "hi")

        bom_le_str = codecs.BOM_UTF16_LE + "hi".encode("utf-16-le")
        self._assert_encoding("utf-16", bom_le_str, "utf-16-le", "hi")
        self._assert_encoding(None, bom_le_str, "utf-16-le", "hi")

        bom_be_str = codecs.BOM_UTF32_BE + "hi".encode("utf-32-be")
        self._assert_encoding("utf-32", bom_be_str, "utf-32-be", "hi")
        self._assert_encoding(None, bom_be_str, "utf-32-be", "hi")

        bom_le_str = codecs.BOM_UTF32_LE + "hi".encode("utf-32-le")
        self._assert_encoding("utf-32", bom_le_str, "utf-32-le", "hi")
        self._assert_encoding(None, bom_le_str, "utf-32-le", "hi")

        # if there is no BOM, big endian should be chosen
        self._assert_encoding("utf-16", "hi".encode("utf-16-be"), "utf-16-be", "hi")
        self._assert_encoding("utf-32", "hi".encode("utf-32-be"), "utf-32-be", "hi")

    def test_python_crash(self):
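        # Regression check: decoding a large buffer of random bytes with the
        # UTF-16/UTF-32 codecs must not crash the interpreter; nothing is
        # asserted beyond to_unicode() returning without an exception.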
        import random
        from io import BytesIO

        random.seed(42)
        buf = BytesIO()
        for i in range(150000):
            buf.write(bytes([random.randint(0, 255)]))
        to_unicode(buf.getvalue(), "utf-16-le")
        to_unicode(buf.getvalue(), "utf-16-be")
        to_unicode(buf.getvalue(), "utf-32-le")
        to_unicode(buf.getvalue(), "utf-32-be")

    def test_html_encoding(self):
        # extracting the encoding from raw html is tested elsewhere
        body = b"""blah blah < meta   http-equiv="Content-Type"
            content="text/html; charset=iso-8859-1"> other stuff"""
        self._assert_encoding_detected(None, "cp1252", body)

        # header encoding takes precedence
        self._assert_encoding_detected("utf-8", "utf-8", body)
        # BOM encoding takes precedence
        self._assert_encoding_detected(None, "utf-8", codecs.BOM_UTF8 + body)

    def test_autodetect(self):
        def asciif(x):
            return "ascii"

        body = b"""<meta charset="utf-8">"""
        # body encoding takes precedence
        self._assert_encoding_detected(None, "utf-8", body, auto_detect_fun=asciif)
        # if no other encoding information is found, the auto-detected encoding is used.
        self._assert_encoding_detected(
            None, "ascii", b"no encoding info", auto_detect_fun=asciif
        )

    def test_default_encoding(self):
        # if no other method available, the default encoding of utf-8 is used
        self._assert_encoding_detected(None, "utf-8", b"no encoding info")
        # this can be overridden
        self._assert_encoding_detected(
            None, "ascii", b"no encoding info", default_encoding="ascii"
        )

    def test_empty_body(self):
        # an empty body also falls back to the default encoding of utf-8
        self._assert_encoding_detected(None, "utf-8", b"")