from __future__ import annotations

import codecs
import unittest
from typing import Any

from w3lib.encoding import (
    html_body_declared_encoding,
    html_to_unicode,
    http_content_type_encoding,
    read_bom,
    resolve_encoding,
    to_unicode,
)


class RequestEncodingTests(unittest.TestCase):
    utf8_fragments = [
        # Content-Type as meta http-equiv
        b"""<meta http-equiv="content-type" content="text/html;charset=UTF-8" />""",
        b"""\n<meta http-equiv="Content-Type"\ncontent="text/html; charset=utf-8">""",
        b"""<meta http-equiv="Content-Type" content="text/html" charset="utf-8">""",
        b"""<meta http-equiv=Content-Type content="text/html" charset='utf-8'>""",
        b"""<meta http-equiv="Content-Type" content\t=\n"text/html" charset\t="utf-8">""",
        b"""<meta content="text/html; charset=utf-8"\n http-equiv='Content-Type'>""",
        b""" bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""",
        # html5 meta charset
        b"""<meta charset="utf-8">""",
        b"""<meta charset =\n"utf-8">""",
        # xml encoding
        b"""<?xml version="1.0" encoding="utf-8"?>""",
    ]

    def test_bom(self):
        # CJK character for "water" (U+6C34)
        water_unicode = "\u6C34"
        # BOM + water character encoded
        utf16be = b"\xfe\xff\x6c\x34"
        utf16le = b"\xff\xfe\x34\x6c"
        utf32be = b"\x00\x00\xfe\xff\x00\x00\x6c\x34"
        utf32le = b"\xff\xfe\x00\x00\x34\x6c\x00\x00"
        for string in (utf16be, utf16le, utf32be, utf32le):
            bom_encoding, bom = read_bom(string)
            assert bom_encoding is not None
            assert bom is not None
            decoded = string[len(bom) :].decode(bom_encoding)
            self.assertEqual(water_unicode, decoded)
        # Body without BOM
        enc, bom = read_bom(b"foo")
        self.assertEqual(enc, None)
        self.assertEqual(bom, None)
        # Empty body
        enc, bom = read_bom(b"")
        self.assertEqual(enc, None)
        self.assertEqual(bom, None)

    def test_http_encoding_header(self):
        header_value = "Content-Type: text/html; charset=ISO-8859-4"
        extracted = http_content_type_encoding(header_value)
        self.assertEqual(extracted, "iso8859-4")
        self.assertEqual(None, http_content_type_encoding("something else"))

    def test_html_body_declared_encoding(self):
        for fragment in self.utf8_fragments:
            encoding = html_body_declared_encoding(fragment)
            self.assertEqual(encoding, "utf-8", fragment)
        self.assertEqual(None, html_body_declared_encoding(b"something else"))
        self.assertEqual(
            None,
            html_body_declared_encoding(
                b"""
            <head></head><body>
            this isn't searched
            <meta charset="utf-8">
        """
            ),
        )
        self.assertEqual(
            None,
            html_body_declared_encoding(
                b"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
            ),
        )

    def test_html_body_declared_encoding_unicode(self):
        # html_body_declared_encoding should also accept a str (unicode) body
        self.assertEqual(None, html_body_declared_encoding("something else"))

        for fragment in self.utf8_fragments:
            encoding = html_body_declared_encoding(fragment.decode("utf8"))
            self.assertEqual(encoding, "utf-8", fragment)

        self.assertEqual(
            None,
            html_body_declared_encoding(
                """
            <head></head><body>
            this isn't searched
            <meta charset="utf-8">
        """
            ),
        )
        self.assertEqual(
            None,
            html_body_declared_encoding(
                """<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
            ),
        )


class CodecsEncodingTestCase(unittest.TestCase):
    def test_resolve_encoding(self):
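        # resolve_encoding normalizes declared names and maps some encodings to
        # common supersets, e.g. latin1 -> cp1252 and gb_2312-80 -> gb18030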
        self.assertEqual(resolve_encoding("latin1"), "cp1252")
        self.assertEqual(resolve_encoding(" Latin-1"), "cp1252")
        self.assertEqual(resolve_encoding("gb_2312-80"), "gb18030")
        self.assertEqual(resolve_encoding("unknown encoding"), None)


class UnicodeDecodingTestCase(unittest.TestCase):
    def test_utf8(self):
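        # b"\xc2\xa3" is the UTF-8 encoding of the pound sign (U+00A3)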
        self.assertEqual(to_unicode(b"\xc2\xa3", "utf-8"), "\xa3")

    def test_invalid_utf8(self):
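        # the stray lead byte 0xc2 is replaced with U+FFFD; the valid
        # \xc2\xa3 sequence that follows still decodes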
        self.assertEqual(to_unicode(b"\xc2\xc2\xa3", "utf-8"), "\ufffd\xa3")


def ct(charset: str | None) -> str | None:
    # e.g. ct("utf-8") == "Content-Type: text/html; charset=utf-8"; ct(None) is None
    return ("Content-Type: text/html; charset=" + charset) if charset else None


def norm_encoding(enc: str) -> str:
    # normalize an encoding name to its canonical codec name, e.g. "UTF8" -> "utf-8"
    return codecs.lookup(enc).name


class HtmlConversionTests(unittest.TestCase):
    def test_unicode_body(self):
        unicode_string = "\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442"
        original_string = unicode_string.encode("cp1251")
        encoding, body_unicode = html_to_unicode(ct("cp1251"), original_string)
        # the decoded body should be a str matching the original text
        self.assertTrue(isinstance(body_unicode, str))
        self.assertEqual(body_unicode, unicode_string)

    def _assert_encoding(
        self,
        content_type: str | None,
        body: bytes,
        expected_encoding: str,
        expected_unicode: str | list[str],
    ) -> None:
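        """Assert that html_to_unicode() returns the expected encoding and
        decoded text (or one of the accepted texts) for *body*."""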
        assert not isinstance(body, str)
        encoding, body_unicode = html_to_unicode(ct(content_type), body)
        self.assertTrue(isinstance(body_unicode, str))
        self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))

        if isinstance(expected_unicode, str):
            self.assertEqual(body_unicode, expected_unicode)
        else:
            self.assertTrue(
                body_unicode in expected_unicode,
                f"{body_unicode} is not in {expected_unicode}",
            )

    def test_content_type_and_conversion(self):
        """Test content type header is interpreted and text converted as
        expected
        """
        self._assert_encoding("utf-8", b"\xc2\xa3", "utf-8", "\xa3")
        # the scrapy tests have something like this, but a str body is
        # rejected by _assert_encoding and a lone 0xa3 byte is not valid utf-8:
        # self._assert_encoding('', "\xa3", 'utf-8', "\xa3")
        # iso-8859-1 is overridden to cp1252
        self._assert_encoding("iso-8859-1", b"\xa3", "cp1252", "\xa3")
        self._assert_encoding("", b"\xc2\xa3", "utf-8", "\xa3")
        self._assert_encoding("none", b"\xc2\xa3", "utf-8", "\xa3")
        self._assert_encoding("gb2312", b"\xa8D", "gb18030", "\u2015")
        self._assert_encoding("gbk", b"\xa8D", "gb18030", "\u2015")
        self._assert_encoding("big5", b"\xf9\xda", "big5hkscs", "\u6052")

    def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
        # unlike scrapy, the BOM is stripped
        self._assert_encoding(
            "utf-8", b"\xef\xbb\xbfWORD\xe3\xabWORD2", "utf-8", "WORD\ufffdWORD2"
        )
        self._assert_encoding(
            None, b"\xef\xbb\xbfWORD\xe3\xabWORD2", "utf-8", "WORD\ufffdWORD2"
        )

    def test_utf8_unexpected_end_of_data_with_valid_utf8_BOM(self):
        # Python implementations handle unexpected end of UTF8 data
        # differently (see https://bugs.pypy.org/issue1536).
        # It is hard to fix this for PyPy in w3lib, so the test
        # is permissive.

        # unlike scrapy, the BOM is stripped
        self._assert_encoding(
            "utf-8",
            b"\xef\xbb\xbfWORD\xe3\xab",
            "utf-8",
            ["WORD\ufffd\ufffd", "WORD\ufffd"],
        )
        self._assert_encoding(
            None,
            b"\xef\xbb\xbfWORD\xe3\xab",
            "utf-8",
            ["WORD\ufffd\ufffd", "WORD\ufffd"],
        )

    def test_replace_wrong_encoding(self):
        """Test invalid chars are replaced properly"""
        encoding, body_unicode = html_to_unicode(ct("utf-8"), b"PREFIX\xe3\xabSUFFIX")
        # XXX: Policy for replacing invalid chars may suffer minor variations
        # but it should always contain the unicode replacement char ('\ufffd')
        assert "\ufffd" in body_unicode, repr(body_unicode)
        assert "PREFIX" in body_unicode, repr(body_unicode)
        assert "SUFFIX" in body_unicode, repr(body_unicode)

        # Do not destroy html tags due to encoding bugs
        encoding, body_unicode = html_to_unicode(ct("utf-8"), b"\xf0<span>value</span>")
        assert "<span>value</span>" in body_unicode, repr(body_unicode)

    def _assert_encoding_detected(
        self,
        content_type: str | None,
        expected_encoding: str,
        body: bytes,
        **kwargs: Any,
    ) -> None:
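        """Assert that html_to_unicode() detects *expected_encoding* for *body*;
        extra keyword arguments are passed through to html_to_unicode()."""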
        assert not isinstance(body, str)
        encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
        self.assertTrue(isinstance(body_unicode, str))
        self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))

    def test_BOM(self):
        # utf-16 cases already tested, as is the BOM detection function

        # BOM takes precedence, ahead of the http header
        bom_be_str = codecs.BOM_UTF16_BE + "hi".encode("utf-16-be")
        expected = "hi"
        self._assert_encoding("utf-8", bom_be_str, "utf-16-be", expected)

        # BOM is stripped when present
        bom_utf8_str = codecs.BOM_UTF8 + b"hi"
        self._assert_encoding("utf-8", bom_utf8_str, "utf-8", "hi")
        self._assert_encoding(None, bom_utf8_str, "utf-8", "hi")

    def test_utf16_32(self):
        # tools.ietf.org/html/rfc2781 section 4.3

        # use the BOM to determine the encoding, and strip it
        bom_be_str = codecs.BOM_UTF16_BE + "hi".encode("utf-16-be")
        self._assert_encoding("utf-16", bom_be_str, "utf-16-be", "hi")
        self._assert_encoding(None, bom_be_str, "utf-16-be", "hi")

        bom_le_str = codecs.BOM_UTF16_LE + "hi".encode("utf-16-le")
        self._assert_encoding("utf-16", bom_le_str, "utf-16-le", "hi")
        self._assert_encoding(None, bom_le_str, "utf-16-le", "hi")

        bom_be_str = codecs.BOM_UTF32_BE + "hi".encode("utf-32-be")
        self._assert_encoding("utf-32", bom_be_str, "utf-32-be", "hi")
        self._assert_encoding(None, bom_be_str, "utf-32-be", "hi")

        bom_le_str = codecs.BOM_UTF32_LE + "hi".encode("utf-32-le")
        self._assert_encoding("utf-32", bom_le_str, "utf-32-le", "hi")
        self._assert_encoding(None, bom_le_str, "utf-32-le", "hi")

        # if there is no BOM, big endian should be chosen
        self._assert_encoding("utf-16", "hi".encode("utf-16-be"), "utf-16-be", "hi")
        self._assert_encoding("utf-32", "hi".encode("utf-32-be"), "utf-32-be", "hi")

    def test_python_crash(self):
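        """Decoding a large buffer of random bytes as UTF-16/UTF-32 must not
        crash the interpreter."""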
        import random
        from io import BytesIO

        random.seed(42)
        buf = BytesIO()
        for _ in range(150000):
            buf.write(bytes([random.randint(0, 255)]))
        to_unicode(buf.getvalue(), "utf-16-le")
        to_unicode(buf.getvalue(), "utf-16-be")
        to_unicode(buf.getvalue(), "utf-32-le")
        to_unicode(buf.getvalue(), "utf-32-be")

    def test_html_encoding(self):
        # extracting the encoding from raw html is tested elsewhere
        body = b"""blah blah < meta   http-equiv="Content-Type"
            content="text/html; charset=iso-8859-1"> other stuff"""
        self._assert_encoding_detected(None, "cp1252", body)

        # header encoding takes precedence
        self._assert_encoding_detected("utf-8", "utf-8", body)
        # BOM encoding takes precedence
        self._assert_encoding_detected(None, "utf-8", codecs.BOM_UTF8 + body)

    def test_autodetect(self):
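        # stub auto-detection function that always reports ascii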
        def asciif(x):
            return "ascii"

        body = b"""<meta charset="utf-8">"""
        # body encoding takes precedence
        self._assert_encoding_detected(None, "utf-8", body, auto_detect_fun=asciif)
        # if no other encoding is found, the auto-detected encoding is used.
        self._assert_encoding_detected(
            None, "ascii", b"no encoding info", auto_detect_fun=asciif
        )

    def test_default_encoding(self):
        # if no other method is available, the default encoding of utf-8 is used
        self._assert_encoding_detected(None, "utf-8", b"no encoding info")
        # this can be overridden
        self._assert_encoding_detected(
            None, "ascii", b"no encoding info", default_encoding="ascii"
        )

    def test_empty_body(self):
        # an empty body falls back to the default encoding of utf-8
        self._assert_encoding_detected(None, "utf-8", b"")
