"""Tests for ``bleach.html5lib_shim``: entity conversion, serializing, parsing."""

import pytest

from bleach import html5lib_shim


# NOTE(review): the fixture strings below spell out character entity
# references and markup explicitly (``&amp;``, ``<a href=...>``). They must
# stay escaped exactly as written; a fixture whose entities have been decoded
# no longer exercises the code under test. Confirm against upstream bleach.


def _render_fragment(parser, data):
    """Parse ``data`` as a fragment with ``parser`` and return the serialized text.

    The walker and serializer are built the same way for every test so that
    only the parser configuration varies between test cases.

    :arg parser: a ``html5lib_shim.BleachHTMLParser`` instance
    :arg data: the HTML text to parse

    :returns: the serialized fragment as text

    """
    # Build a walker and serializer just like we do in clean()
    walker = html5lib_shim.getTreeWalker('etree')
    serializer = html5lib_shim.BleachHTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        escape_lt_in_attrs=True,
        resolve_entities=False,
        sanitize=False,
        alphabetical_attributes=False,
    )

    # Parse, walk, and then serialize the output
    dom = parser.parseFragment(data)
    return serializer.render(walker(dom))


@pytest.mark.parametrize('data, expected', [
    # Strings without character entities pass through as is
    ('', ''),
    ('abc', 'abc'),

    # Handles character entities--both named and numeric
    ('&nbsp;', u'\xa0'),
    ('&#32;', ' '),
    ('&#x20;', ' '),

    # Handles ambiguous ampersand
    ('&xx;', '&xx;'),

    # Handles multiple entities in the same string
    ('this &amp; that &amp; that', 'this & that & that'),
])
def test_convert_entities(data, expected):
    """``convert_entities`` turns ``data`` into ``expected``."""
    assert html5lib_shim.convert_entities(data) == expected


@pytest.mark.parametrize('data, expected', [
    ('', ''),
    ('text', 'text'),

    # & in Characters is escaped
    ('&', '&amp;'),

    # FIXME(willkg): This happens because the BleachHTMLTokenizer is ignoring
    # character entities. What it should be doing is creating Entity tokens
    # for character entities.
    #
    # That was too hard at the time I was fixing it, so I fixed it in
    # BleachSanitizerFilter. When that gets fixed correctly in the tokenizer,
    # then this test case will get fixed.
    ('a &amp; b', 'a &amp;amp; b'),  # should be 'a &amp; b'

    # & in HTML attribute values are escaped
    (
        '<a href="http://example.com?key=value&key2=value">tag</a>',
        '<a href="http://example.com?key=value&amp;key2=value">tag</a>'
    ),
    # & marking character entities in HTML attribute values aren't escaped
    (
        '<a href="http://example.com?key=value&amp;key2=value">tag</a>',
        '<a href="http://example.com?key=value&amp;key2=value">tag</a>'
    ),
    # & marking ambiguous character entities in attribute values are escaped
    # (&curren; is a character entity)
    (
        '<a href="http://example.com?key=value&current=value">tag</a>',
        '<a href="http://example.com?key=value&amp;current=value">tag</a>'
    ),
])
def test_serializer(data, expected):
    """Parsing ``data`` without consuming entities serializes to ``expected``."""
    # Build a parser just like we do in clean()
    parser = html5lib_shim.BleachHTMLParser(
        tags=None,
        strip=True,
        consume_entities=False,
        namespaceHTMLElements=False
    )

    assert _render_fragment(parser, data) == expected


@pytest.mark.parametrize('parser_args, data, expected', [
    # Make sure InputStreamWithMemory has charEncoding and changeEncoding
    (
        {},
        '<meta charset="utf-8">',
        '<meta charset="utf-8">'
    ),
    # Handle consume entities False--all entities are passed along and then
    # escaped when serialized
    (
        {'consume_entities': False},
        'text &amp;&gt;&quot;',
        'text &amp;amp;&amp;gt;&amp;quot;'
    ),
    # Handle consume entities True--all entities are consumed and converted
    # to their character equivalents and then &, <, and > are escaped when
    # serialized
    (
        {'consume_entities': True},
        'text &amp;&gt;&quot;',
        'text &amp;&gt;"'
    ),
    # Test that "invalid-character-in-attribute-name" errors in tokenizing
    # result in attributes with invalid names getting dropped
    (
        {},
        '<a href="http://example.com"">',
        '<a href="http://example.com"></a>'
    ),
    (
        {},
        "<a href='http://example.com''>",
        '<a href="http://example.com"></a>'
    )
])
def test_bleach_html_parser(parser_args, data, expected):
    """A parser built with ``parser_args`` round-trips ``data`` to ``expected``."""
    # Defaults for the parser; each test case overrides what it needs
    args = {
        'tags': None,
        'strip': True,
        'consume_entities': True
    }
    args.update(parser_args)

    # Build a parser just like we do in clean()
    parser = html5lib_shim.BleachHTMLParser(**args)

    assert _render_fragment(parser, data) == expected