import unittest, codecs import six from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode, http_content_type_encoding, resolve_encoding, html_to_unicode) class RequestEncodingTests(unittest.TestCase): utf8_fragments = [ # Content-Type as meta http-equiv b"""""", b"""\n""", b"""""", b"""""", b"""""", b"""""", b""" bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""", # html5 meta charset b"""""", b"""""", # xml encoding b"""""", ] def test_bom(self): # cjk water character in unicode water_unicode = u'\u6C34' # BOM + water character encoded utf16be = b'\xfe\xff\x6c\x34' utf16le = b'\xff\xfe\x34\x6c' utf32be = b'\x00\x00\xfe\xff\x00\x00\x6c\x34' utf32le = b'\xff\xfe\x00\x00\x34\x6c\x00\x00' for string in (utf16be, utf16le, utf32be, utf32le): bom_encoding, bom = read_bom(string) decoded = string[len(bom):].decode(bom_encoding) self.assertEqual(water_unicode, decoded) # Body without BOM enc, bom = read_bom("foo") self.assertEqual(enc, None) self.assertEqual(bom, None) # Empty body enc, bom = read_bom("") self.assertEqual(enc, None) self.assertEqual(bom, None) def test_http_encoding_header(self): header_value = "Content-Type: text/html; charset=ISO-8859-4" extracted = http_content_type_encoding(header_value) self.assertEqual(extracted, "iso8859-4") self.assertEqual(None, http_content_type_encoding("something else")) def test_html_body_declared_encoding(self): for fragment in self.utf8_fragments: encoding = html_body_declared_encoding(fragment) self.assertEqual(encoding, 'utf-8', fragment) self.assertEqual(None, html_body_declared_encoding(b"something else")) self.assertEqual(None, html_body_declared_encoding(b""" this isn't searched """)) self.assertEqual(None, html_body_declared_encoding( b"""""")) def test_html_body_declared_encoding_unicode(self): # html_body_declared_encoding should work when unicode body is passed self.assertEqual(None, html_body_declared_encoding(u"something else")) for fragment in self.utf8_fragments: encoding = html_body_declared_encoding(fragment.decode('utf8')) self.assertEqual(encoding, 'utf-8', fragment) self.assertEqual(None, html_body_declared_encoding(u""" this isn't searched """)) self.assertEqual(None, html_body_declared_encoding( u"""""")) class CodecsEncodingTestCase(unittest.TestCase): def test_resolve_encoding(self): self.assertEqual(resolve_encoding('latin1'), 'cp1252') self.assertEqual(resolve_encoding(' Latin-1'), 'cp1252') self.assertEqual(resolve_encoding('gb_2312-80'), 'gb18030') self.assertEqual(resolve_encoding('unknown encoding'), None) class UnicodeDecodingTestCase(unittest.TestCase): def test_utf8(self): self.assertEqual(to_unicode(b'\xc2\xa3', 'utf-8'), u'\xa3') def test_invalid_utf8(self): self.assertEqual(to_unicode(b'\xc2\xc2\xa3', 'utf-8'), u'\ufffd\xa3') def ct(charset): return "Content-Type: text/html; charset=" + charset if charset else None def norm_encoding(enc): return codecs.lookup(enc).name class HtmlConversionTests(unittest.TestCase): def test_unicode_body(self): unicode_string = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442' original_string = unicode_string.encode('cp1251') encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string) # check body_as_unicode self.assertTrue(isinstance(body_unicode, six.text_type)) self.assertEqual(body_unicode, unicode_string) def _assert_encoding(self, content_type, body, expected_encoding, expected_unicode): assert not isinstance(body, six.text_type) encoding, body_unicode = html_to_unicode(ct(content_type), body) self.assertTrue(isinstance(body_unicode, six.text_type)) self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding)) if isinstance(expected_unicode, six.string_types): self.assertEqual(body_unicode, expected_unicode) else: self.assertTrue( body_unicode in expected_unicode, "%s is not in %s" % (body_unicode, expected_unicode) ) def test_content_type_and_conversion(self): """Test content type header is interpreted and text converted as expected """ self._assert_encoding('utf-8', b"\xc2\xa3", 'utf-8', u"\xa3") # something like this in the scrapy tests - but that's invalid? # self._assert_encoding('', "\xa3", 'utf-8', u"\xa3") # iso-8859-1 is overridden to cp1252 self._assert_encoding('iso-8859-1', b"\xa3", 'cp1252', u"\xa3") self._assert_encoding('', b"\xc2\xa3", 'utf-8', u"\xa3") self._assert_encoding('none', b"\xc2\xa3", 'utf-8', u"\xa3") self._assert_encoding('gb2312', b"\xa8D", 'gb18030', u"\u2015") self._assert_encoding('gbk', b"\xa8D", 'gb18030', u"\u2015") self._assert_encoding('big5', b"\xf9\xda", 'big5hkscs', u"\u6052") def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self): # unlike scrapy, the BOM is stripped self._assert_encoding('utf-8', b"\xef\xbb\xbfWORD\xe3\xabWORD2", 'utf-8', u'WORD\ufffdWORD2') self._assert_encoding(None, b"\xef\xbb\xbfWORD\xe3\xabWORD2", 'utf-8', u'WORD\ufffdWORD2') def test_utf8_unexpected_end_of_data_with_valid_utf8_BOM(self): # Python implementations handle unexpected end of UTF8 data # differently (see https://bugs.pypy.org/issue1536). # It is hard to fix this for PyPy in w3lib, so the test # is permissive. # unlike scrapy, the BOM is stripped self._assert_encoding('utf-8', b"\xef\xbb\xbfWORD\xe3\xab", 'utf-8', [u'WORD\ufffd\ufffd', u'WORD\ufffd']) self._assert_encoding(None, b"\xef\xbb\xbfWORD\xe3\xab", 'utf-8', [u'WORD\ufffd\ufffd', u'WORD\ufffd']) def test_replace_wrong_encoding(self): """Test invalid chars are replaced properly""" encoding, body_unicode = html_to_unicode(ct('utf-8'), b'PREFIX\xe3\xabSUFFIX') # XXX: Policy for replacing invalid chars may suffer minor variations # but it should always contain the unicode replacement char (u'\ufffd') assert u'\ufffd' in body_unicode, repr(body_unicode) assert u'PREFIX' in body_unicode, repr(body_unicode) assert u'SUFFIX' in body_unicode, repr(body_unicode) # Do not destroy html tags due to encoding bugs encoding, body_unicode = html_to_unicode(ct('utf-8'), b'\xf0value') assert u'value' in body_unicode, repr(body_unicode) def _assert_encoding_detected(self, content_type, expected_encoding, body, **kwargs): assert not isinstance(body, six.text_type) encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs) self.assertTrue(isinstance(body_unicode, six.text_type)) self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding)) def test_BOM(self): # utf-16 cases already tested, as is the BOM detection function # http header takes precedence, irrespective of BOM bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be') expected = u'\ufffd\ufffd\x00h\x00i' self._assert_encoding('utf-8', bom_be_str, 'utf-8', expected) # BOM is stripped when it agrees with the encoding, or used to # determine encoding bom_utf8_str = codecs.BOM_UTF8 + b'hi' self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', u"hi") self._assert_encoding(None, bom_utf8_str, 'utf-8', u"hi") def test_utf16_32(self): # tools.ietf.org/html/rfc2781 section 4.3 # USE BOM and strip it bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be') self._assert_encoding('utf-16', bom_be_str, 'utf-16-be', u"hi") self._assert_encoding(None, bom_be_str, 'utf-16-be', u"hi") bom_le_str = codecs.BOM_UTF16_LE + u"hi".encode('utf-16-le') self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', u"hi") self._assert_encoding(None, bom_le_str, 'utf-16-le', u"hi") bom_be_str = codecs.BOM_UTF32_BE + u"hi".encode('utf-32-be') self._assert_encoding('utf-32', bom_be_str, 'utf-32-be', u"hi") self._assert_encoding(None, bom_be_str, 'utf-32-be', u"hi") bom_le_str = codecs.BOM_UTF32_LE + u"hi".encode('utf-32-le') self._assert_encoding('utf-32', bom_le_str, 'utf-32-le', u"hi") self._assert_encoding(None, bom_le_str, 'utf-32-le', u"hi") # if there is no BOM, big endian should be chosen self._assert_encoding('utf-16', u"hi".encode('utf-16-be'), 'utf-16-be', u"hi") self._assert_encoding('utf-32', u"hi".encode('utf-32-be'), 'utf-32-be', u"hi") def test_python_crash(self): import random from io import BytesIO random.seed(42) buf = BytesIO() for i in range(150000): buf.write(bytes([random.randint(0, 255)])) to_unicode(buf.getvalue(), 'utf-16-le') to_unicode(buf.getvalue(), 'utf-16-be') to_unicode(buf.getvalue(), 'utf-32-le') to_unicode(buf.getvalue(), 'utf-32-be') def test_html_encoding(self): # extracting the encoding from raw html is tested elsewhere body = b"""blah blah < meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> other stuff""" self._assert_encoding_detected(None, 'cp1252', body) # header encoding takes precedence self._assert_encoding_detected('utf-8', 'utf-8', body) # BOM encoding takes precedence self._assert_encoding_detected(None, 'utf-8', codecs.BOM_UTF8 + body) def test_autodetect(self): asciif = lambda x: 'ascii' body = b"""""" # body encoding takes precedence self._assert_encoding_detected(None, 'utf-8', body, auto_detect_fun=asciif) # if no other encoding, the auto detect encoding is used. self._assert_encoding_detected(None, 'ascii', b"no encoding info", auto_detect_fun=asciif) def test_default_encoding(self): # if no other method available, the default encoding of utf-8 is used self._assert_encoding_detected(None, 'utf-8', b"no encoding info") # this can be overridden self._assert_encoding_detected(None, 'ascii', b"no encoding info", default_encoding='ascii') def test_empty_body(self): # if no other method available, the default encoding of utf-8 is used self._assert_encoding_detected(None, 'utf-8', b"")