from __future__ import unicode_literals

import pytest

from bleach import html5lib_shim

# NOTE(review): the entity references and HTML markup inside the string
# literals of this module were reconstructed after an extraction step had
# decoded/stripped them (e.g. ``&nbsp;`` had become a plain space and
# ``<a ...>tag</a>`` had become ``tag``). The values below follow the
# per-case comments; confirm them against the upstream bleach test suite.


def _render_fragment(parser, data):
    """Parse ``data`` as an HTML fragment and return its serialized form.

    Builds the walker and serializer just like we do in ``clean()`` so the
    tests below exercise the same parse -> walk -> serialize pipeline.

    :arg parser: a ``html5lib_shim.BleachHTMLParser`` instance
    :arg data: the HTML text to parse

    :returns: the serialized text

    """
    walker = html5lib_shim.getTreeWalker("etree")
    serializer = html5lib_shim.BleachHTMLSerializer(
        quote_attr_values="always",
        omit_optional_tags=False,
        escape_lt_in_attrs=True,
        resolve_entities=False,
        sanitize=False,
        alphabetical_attributes=False,
    )

    # Parse, walk, and then serialize the output
    dom = parser.parseFragment(data)
    return serializer.render(walker(dom))


@pytest.mark.parametrize(
    "data, expected",
    [
        # Strings without character entities pass through as is
        ("", ""),
        ("abc", "abc"),
        # Handles character entities--both named and numeric
        ("&nbsp;", "\xa0"),
        ("&#32;", " "),
        ("&#x20;", " "),
        # Handles ambiguous ampersand
        ("&xx;", "&xx;"),
        # Handles multiple entities in the same string
        ("this &amp; that &amp; that", "this & that & that"),
    ],
)
def test_convert_entities(data, expected):
    """convert_entities() replaces character entities with their characters."""
    assert html5lib_shim.convert_entities(data) == expected


@pytest.mark.parametrize(
    "data, expected",
    [
        ("", ""),
        ("text", "text"),
        # & in Characters is escaped
        ("&", "&amp;"),
        # FIXME(willkg): This happens because the BleachHTMLTokenizer is ignoring
        # character entities. What it should be doing is creating Entity tokens
        # for character entities.
        #
        # That was too hard at the time I was fixing it, so I fixed it in
        # BleachSanitizerFilter. When that gets fixed correctly in the
        # tokenizer, then this test case will get fixed.
        ("a &amp; b", "a &amp;amp; b"),  # should be 'a &amp; b'
        # & in HTML attribute values are escaped
        (
            '<a href="http://example.com?key=value&key2=value">tag</a>',
            '<a href="http://example.com?key=value&amp;key2=value">tag</a>',
        ),
        # & marking character entities in HTML attribute values aren't escaped
        (
            '<a href="http://example.com?key=value&amp;key2=value">tag</a>',
            '<a href="http://example.com?key=value&amp;key2=value">tag</a>',
        ),
        # & marking ambiguous character entities in attribute values are escaped
        # (&curren; is a character entity)
        (
            '<a href="http://example.com?key=value&current=value">tag</a>',
            '<a href="http://example.com?key=value&amp;current=value">tag</a>',
        ),
    ],
)
def test_serializer(data, expected):
    """BleachHTMLSerializer escapes bare ampersands but keeps entities."""
    # Build the parser just like we do in clean(); entities are left
    # unconsumed so the serializer is what decides how "&" is emitted.
    parser = html5lib_shim.BleachHTMLParser(
        tags=None, strip=True, consume_entities=False, namespaceHTMLElements=False
    )

    assert _render_fragment(parser, data) == expected


@pytest.mark.parametrize(
    "parser_args, data, expected",
    [
        # Make sure InputStreamWithMemory has charEncoding and changeEncoding
        ({}, '<meta charset="utf-8">', '<meta charset="utf-8">'),
        # Handle consume entities False--all entities are passed along and then
        # escaped when serialized
        (
            {"consume_entities": False},
            "text &amp;&gt;&quot;",
            "text &amp;amp;&amp;gt;&amp;quot;",
        ),
        # Handle consume entities True--all entities are consumed and converted
        # to their character equivalents and then &, <, and > are escaped when
        # serialized
        ({"consume_entities": True}, "text &amp;&gt;&quot;", 'text &amp;&gt;"'),
        # Test that "invalid-character-in-attribute-name" errors in tokenizing
        # result in attributes with invalid names getting dropped
        ({}, '<a href="http://example.com"">', '<a href="http://example.com"></a>'),
        ({}, "<a href='http://example.com''>", '<a href="http://example.com"></a>'),
        # Test that "expected-closing-tag-but-got-char" works when tags is None
        (
            {},
            "<script>do_something();</script",
            "&lt;script&gt;do_something();&lt;/script",
        ),
    ],
)
def test_bleach_html_parser(parser_args, data, expected):
    """BleachHTMLParser round-trips fragments under various parser options."""
    # Defaults for every case; each case's parser_args overrides them.
    args = {"tags": None, "strip": True, "consume_entities": True}
    args.update(parser_args)

    # Build the parser just like we do in clean()
    parser = html5lib_shim.BleachHTMLParser(**args)

    assert _render_fragment(parser, data) == expected