# Tests for bleach.html5lib_shim: entity conversion, HTML serialization, and
# HTML parsing.
import pytest
from bleach import html5lib_shim
# NOTE(review): the entity literals in this table were restored after an
# HTML-decoding pass had collapsed them to plain characters -- confirm against
# the project's canonical copy of this test.
@pytest.mark.parametrize('data, expected', [
    # Strings without character entities pass through as is
    ('', ''),
    ('abc', 'abc'),

    # Handles character entities--both named and numeric
    ('&nbsp;', u'\xa0'),
    ('&#32;', ' '),
    ('&#x20;', ' '),

    # Handles ambiguous ampersand
    ('&xx;', '&xx;'),

    # Handles multiple entities in the same string
    ('this &amp; that &amp; that', 'this & that & that'),
])
def test_convert_entities(data, expected):
    """Check that ``convert_entities`` maps ``data`` to ``expected``.

    The cases cover text with no entities, named and numeric entities, an
    ambiguous ampersand (``&xx;`` is expected to come back unchanged), and
    several entities in one string.

    :arg data: the input text handed to ``html5lib_shim.convert_entities``
    :arg expected: the text the call is expected to return

    """
    assert html5lib_shim.convert_entities(data) == expected
# NOTE(review): the escaped forms (``&amp;``, ``&amp;amp;``, ``&current``) in
# this table were restored after an HTML-decoding pass had flattened them,
# which left the cases contradicting their own comments -- confirm against the
# project's canonical copy of this test.
@pytest.mark.parametrize('data, expected', [
    ('', ''),
    ('text', 'text'),

    # & in Characters is escaped
    ('&', '&amp;'),

    # FIXME(willkg): This happens because the BleachHTMLTokenizer is ignoring
    # character entities. What it should be doing is creating Entity tokens
    # for character entities.
    #
    # That was too hard at the time I was fixing it, so I fixed it in
    # BleachSanitizerFilter. When that gets fixed correctly in the tokenizer,
    # then this test case will get fixed.
    ('a &amp; b', 'a &amp;amp; b'),  # should be 'a &amp; b'

    # & in HTML attribute values are escaped
    (
        '<a href="http://example.com?key=value&key2=value">tag</a>',
        '<a href="http://example.com?key=value&amp;key2=value">tag</a>'
    ),

    # & marking character entities in HTML attribute values aren't escaped
    (
        '<a href="http://example.com?key=value&amp;key2=value">tag</a>',
        '<a href="http://example.com?key=value&amp;key2=value">tag</a>'
    ),

    # & marking ambiguous character entities in attribute values are escaped
    # (&curren; is a character entity)
    (
        '<a href="http://example.com?key=value&current=value">tag</a>',
        '<a href="http://example.com?key=value&amp;current=value">tag</a>'
    ),
])
def test_serializer(data, expected):
    """Parse ``data`` as a fragment, serialize it, and compare to ``expected``.

    The parser is built with ``consume_entities=False`` and the serializer
    with ``resolve_entities=False``, so the cases pin how ``&`` is handled in
    text and in attribute values when entities are passed through untouched.

    :arg data: the HTML fragment to parse
    :arg expected: the serialized output expected for ``data``

    """
    # Build a parser, walker, and serializer just like we do in clean()
    parser = html5lib_shim.BleachHTMLParser(
        tags=None,
        strip=True,
        consume_entities=False,
        namespaceHTMLElements=False
    )
    walker = html5lib_shim.getTreeWalker('etree')
    serializer = html5lib_shim.BleachHTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        escape_lt_in_attrs=True,
        resolve_entities=False,
        sanitize=False,
        alphabetical_attributes=False,
    )

    # Parse, walk, and then serialize the output
    dom = parser.parseFragment(data)
    serialized = serializer.render(walker(dom))

    assert serialized == expected
# NOTE(review): the entity literals in the two ``consume_entities`` cases were
# restored after an HTML-decoding pass had flattened them, leaving inputs with
# no entities at all -- confirm against the project's canonical copy of this
# test.
@pytest.mark.parametrize('parser_args, data, expected', [
    # Make sure InputStreamWithMemory has charEncoding and changeEncoding
    (
        {},
        '<meta charset="utf-8">',
        '<meta charset="utf-8">'
    ),

    # Handle consume entities False--all entities are passed along and then
    # escaped when serialized
    (
        {'consume_entities': False},
        'text &amp;&gt;&quot;',
        'text &amp;amp;&amp;gt;&amp;quot;'
    ),

    # Handle consume entities True--all entities are consumed and converted
    # to their character equivalents and then &, <, and > are escaped when
    # serialized
    (
        {'consume_entities': True},
        'text &amp;&gt;&quot;',
        'text &amp;&gt;"'
    ),

    # Test that "invalid-character-in-attribute-name" errors in tokenizing
    # result in attributes with invalid names getting dropped
    (
        {},
        '<a href="http://example.com"">',
        '<a href="http://example.com"></a>'
    ),
    (
        {},
        '<a href=\'http://example.com\'\'>',
        '<a href="http://example.com"></a>'
    ),
])
def test_bleach_html_parser(parser_args, data, expected):
    """Parse ``data`` with a configurable parser and check the serialization.

    :arg parser_args: keyword overrides merged over the default
        ``BleachHTMLParser`` arguments used by this test
    :arg data: the HTML fragment to parse
    :arg expected: the serialized output expected for ``data``

    """
    # Defaults for the parser; each case may override them via parser_args
    args = {
        'tags': None,
        'strip': True,
        'consume_entities': True
    }
    args.update(parser_args)

    # Build a parser, walker, and serializer just like we do in clean()
    parser = html5lib_shim.BleachHTMLParser(**args)
    walker = html5lib_shim.getTreeWalker('etree')
    serializer = html5lib_shim.BleachHTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        escape_lt_in_attrs=True,
        resolve_entities=False,
        sanitize=False,
        alphabetical_attributes=False,
    )

    # Parse, walk, and then serialize the output
    dom = parser.parseFragment(data)
    serialized = serializer.render(walker(dom))

    assert serialized == expected