import os import pytest from bleach import clean from bleach.html5lib_shim import Filter from bleach.sanitizer import ALLOWED_PROTOCOLS, Cleaner, NoCssSanitizerWarning from html5lib.constants import rcdataElements @pytest.mark.parametrize( "data", [ "a < b", "link http://link.com", "text", # Verify idempotentcy with character entity handling "text & ", "jim ¤t joe", "& &", "jim &xx; joe", # Link with querystring items '', ], ) def test_clean_idempotent(data): """Make sure that applying the filter twice doesn't change anything.""" assert clean(clean(data)) == clean(data) def test_clean_idempotent_img(): tags = {"img"} dirty = '' assert clean(clean(dirty, tags=tags), tags=tags) == clean(dirty, tags=tags) def test_only_text_is_cleaned(): some_text = "text" some_type = int no_type = None assert clean(some_text) == some_text with pytest.raises(TypeError) as e: clean(some_type) assert "argument cannot be of 'type' type" in str(e.value) with pytest.raises(TypeError) as e: clean(no_type) assert "NoneType" in str(e.value) def test_empty(): assert clean("") == "" def test_content_has_no_html(): assert clean("no html string") == "no html string" @pytest.mark.parametrize( "data, expected", [ ("an allowed tag", "an allowed tag"), ("another good tag", "another good tag"), ], ) def test_content_has_allowed_html(data, expected): assert clean(data) == expected def test_html_is_lowercased(): assert ( clean('foo') == 'foo' ) @pytest.mark.parametrize( "data, should_strip, expected", [ # Regular comment ("", True, ""), # Open comment with no close comment bit (""), (""), # Comment with text to the right ("text", True, "text"), ("text", True, "text"), ("text", False, "text"), ("text", False, "text"), # Comment with text to the left ("text", True, "text"), ("text", True, "text"), ("text", False, "text"), ("text", False, "text"), ], ) def test_comments(data, should_strip, expected): assert clean(data, strip_comments=should_strip) == expected def test_invalid_char_in_tag(): assert ( clean('') == '<script/xss src="http://xx.com/xss.js"></script>' ) assert ( clean('') == '<script/src="http://xx.com/xss.js"></script>' ) def test_unclosed_tag(): assert clean("a fixed tag") == "a fixed tag" assert ( clean("/script>") == "<<script>script>evil()<</script>/script>" ) assert ( clean("<script>evil()</script>") == "<<x>script>evil()<</x>/script>" ) assert ( clean(">evil()>") == "<script<script>>evil()</script</script>>" ) @pytest.mark.parametrize( "text, expected", [ ("an & entity", "an & entity"), ("an < entity", "an < entity"), ("tag < and entity", "tag < and entity"), ], ) def test_bare_entities_get_escaped_correctly(text, expected): assert clean(text) == expected @pytest.mark.parametrize( "text, expected", [ ("x", "<y>"), # this is an eof-in-attribute-name parser error ("foo', 'foo'), ('foo', 'foo'), # Things in attributes that aren't character entities get escaped ( 'foo', 'foo', ), ( 'foo', 'foo', ), ( 'foo', 'foo', ), # Things in text that aren't character entities get escaped ("&xx;", "&xx;"), ("&adp;", "&adp;"), ("&currdupe;", "&currdupe;"), # Test numeric entities ("'", "'"), (""", """), ("{", "{"), ("{", "{"), ("{", "{"), # Test non-numeric entities ("&#", "&#"), ("&#<", "&#<"), # html5lib tokenizer unescapes character entities, so these would become ' # and " which makes it possible to break out of html attributes. # # Verify that clean() doesn't unescape entities. ("'"", "'""), ], ) def test_character_entities_handling(text, expected): assert clean(text) == expected @pytest.mark.parametrize( "data, kwargs, expected", [ # All tags are allowed, so it strips nothing ( "a test with html tags", {}, "a test with html tags", ), # img tag is disallowed, so it's stripped ( 'a test with html tags', {}, "a test with html tags", ), # a tag is disallowed, so it's stripped ( '
link text
', {"tags": {"p"}}, "
link text
", ), # Test nested disallowed tag ( "
multiply nested text
", {"tags": {"p"}}, "
multiply nested text
", ), # (#271) ("
", "<script>safe()</script>"), # Test with braces ("", "<style>body{}</style>"), # Test nested disallow tags (#271) ("
ipt>", "pt>alert(1)ipt>"), ("pt>pt>alert(1)", "pt>pt>alert(1)"), ], ) def test_stripping_tags_is_safe(data, expected): """Test stripping tags shouldn't result in malicious content""" assert clean(data, strip=True) == expected def test_href_with_wrong_tag(): assert clean('no link') == "no link" def test_disallowed_attr(): IMG = {"img"} IMG_ATTR = ["src"] assert clean('test') == 'test' assert ( clean('', tags=IMG, attributes=IMG_ATTR) == '' ) assert ( clean('', tags=IMG, attributes=IMG_ATTR) == '' ) def test_unquoted_attr_values_are_quoted(): assert ( clean("myabbr") == 'myabbr' ) def test_unquoted_event_handler_attr_value(): assert ( clean('xx.com') == 'xx.com' ) def test_invalid_filter_attr(): IMG = {"img"} IMG_ATTR = { "img": lambda tag, name, val: name == "src" and val == "http://example.com/" } assert ( clean( '', tags=IMG, attributes=IMG_ATTR, ) == '' ) assert ( clean( '', tags=IMG, attributes=IMG_ATTR, ) == "" ) def test_poster_attribute(): """Poster attributes should not allow javascript.""" tags = {"video"} attrs = {"video": ["poster"]} test = '' assert clean(test, tags=tags, attributes=attrs) == "" ok = '' assert clean(ok, tags=tags, attributes=attrs) == ok def test_attributes_callable(): """Verify attributes can take a callable""" ATTRS = lambda tag, name, val: name == "title" TAGS = {"a"} text = 'example' assert clean(text, tags=TAGS, attributes=ATTRS) == 'example' def test_attributes_wildcard(): """Verify attributes[*] works""" ATTRS = { "*": ["id"], "img": ["src"], } TAGS = {"img", "em"} text = ( 'both can have ' ) assert ( clean(text, tags=TAGS, attributes=ATTRS) == 'both can have ' ) def test_attributes_wildcard_callable(): """Verify attributes[*] callable works""" ATTRS = {"*": lambda tag, name, val: name == "title"} TAGS = {"a"} assert ( clean('example', tags=TAGS, attributes=ATTRS) == 'example' ) def test_attributes_tag_callable(): """Verify attributes[tag] callable works""" def img_test(tag, name, val): return name == "src" and val.startswith("https") ATTRS = { "img": img_test, } TAGS = {"img"} text = 'foo baz' assert clean(text, tags=TAGS, attributes=ATTRS) == "foo baz" text = 'foo baz' assert ( clean(text, tags=TAGS, attributes=ATTRS) == 'foo baz' ) def test_attributes_tag_list(): """Verify attributes[tag] list works""" ATTRS = {"a": ["title"]} TAGS = {"a"} assert ( clean('example', tags=TAGS, attributes=ATTRS) == 'example' ) def test_attributes_list(): """Verify attributes list works""" ATTRS = ["title"] TAGS = {"a"} text = 'example' assert clean(text, tags=TAGS, attributes=ATTRS) == 'example' @pytest.mark.parametrize( "data, kwargs, expected", [ # invalid URI (urlparse raises a ValueError: Invalid IPv6 URL) # is not allowed by default ( 'text', {"protocols": ALLOWED_PROTOCOLS}, "text", ), # data protocol is not allowed by default ( 'foo', {"protocols": ALLOWED_PROTOCOLS}, "foo", ), # javascript: is not allowed by default ( "xss", {"protocols": ALLOWED_PROTOCOLS}, "xss", ), # File protocol is not allowed by default ( 'foo', {"protocols": ALLOWED_PROTOCOLS}, "foo", ), # Specified protocols are allowed ( 'allowed href', {"protocols": {"myprotocol"}}, 'allowed href', ), # Unspecified protocols are not allowed ( 'invalid href', {"protocols": {"myprotocol"}}, "invalid href", ), # Anchors are ok ( 'foo', {"protocols": set()}, 'foo', ), # Anchor that looks like a domain is ok ( 'foo', {"protocols": set()}, 'foo', ), # Allow implicit http/https if allowed ( 'valid', {"protocols": {"http"}}, 'valid', ), ( 'valid', {"protocols": {"https"}}, 'valid', ), ( 'valid', {"protocols": {"http"}}, 'valid', ), ( 'valid', {"protocols": {"http"}}, 'valid', ), ( 'valid', {"protocols": {"http"}}, 'valid', ), ( 'valid', {"protocols": {"http"}}, 'valid', ), ( 'valid', {"protocols": {"http"}}, 'valid', ), ( 'valid', {"protocols": {"http"}}, 'valid', ), pytest.param( *( 'valid', {"protocols": {"http"}}, 'valid', ), marks=pytest.mark.xfail, ), # Disallow implicit http/https if disallowed ('foo', {"protocols": set()}, "foo"), ('foo', {"protocols": set()}, "foo"), ('foo', {"protocols": set()}, "foo"), ('foo', {"protocols": set()}, "foo"), ('foo', {"protocols": set()}, "foo"), ('foo', {"protocols": set()}, "foo"), # Disallowed protocols with sneaky character entities ('alert', {}, "alert"), ('alert', {}, "alert"), # Checking the uri should change it at all ( 'foo', {}, 'foo', ), ], ) def test_uri_value_allowed_protocols(data, kwargs, expected): assert clean(data, **kwargs) == expected def test_svg_attr_val_allows_ref(): """Unescape values in svg attrs that allow url references""" # Local IRI, so keep it TAGS = {"svg", "rect"} ATTRS = { "rect": ["fill"], } text = '' assert ( clean(text, tags=TAGS, attributes=ATTRS) == '' ) # Non-local IRI, so drop it TAGS = {"svg", "rect"} ATTRS = { "rect": ["fill"], } text = '' assert clean(text, tags=TAGS, attributes=ATTRS) == "" @pytest.mark.parametrize( "text, expected", [ ( '', '', ), ( '', # NOTE(willkg): Bug in html5lib serializer drops the xlink part '', ), ], ) def test_svg_allow_local_href(text, expected): """Keep local hrefs for svg elements""" TAGS = {"svg", "pattern"} ATTRS = { "pattern": ["id", "href"], } assert clean(text, tags=TAGS, attributes=ATTRS) == expected @pytest.mark.parametrize( "text, expected", [ ( '', '', ), ( '', '', ), ], ) def test_svg_allow_local_href_nonlocal(text, expected): """Drop non-local hrefs for svg elements""" TAGS = {"svg", "pattern"} ATTRS = { "pattern": ["id", "href"], } assert clean(text, tags=TAGS, attributes=ATTRS) == expected @pytest.mark.parametrize( "data, expected", [ # Convert bell ("1\a23", "1?23"), # Convert backpsace ("1\b23", "1?23"), # Convert formfeed ("1\v23", "1?23"), # Convert vertical tab ("1\f23", "1?23"), # Convert a bunch of characters in a string ("import y\bose\bm\bi\bt\be\b", "import y?ose?m?i?t?e?"), ], ) def test_invisible_characters(data, expected): assert clean(data) == expected def test_nonexistent_namespace(): # Issue #352 involved this string kicking up a KeyError since the "c" # namespace didn't exist. After the fixes for Bleach 3.0, this no longer # goes through the HTML parser as a tag, so it doesn't tickle the bad # namespace code. assert clean("") == "<d {c}>" @pytest.mark.parametrize( "tag", [ "area", "base", "br", "embed", "hr", "img", "input", pytest.param( "keygen", marks=pytest.mark.xfail( reason="https://github.com/mozilla/bleach/issues/488" ), ), "link", "meta", "param", "source", pytest.param( "menuitem", marks=pytest.mark.xfail( reason="https://github.com/mozilla/bleach/issues/488" ), ), "track", pytest.param( "wbr", marks=pytest.mark.xfail( reason="https://github.com/mozilla/bleach/issues/488" ), ), ], ) def test_self_closing_tags_self_close(tag): assert clean(f"<{tag}>", tags={tag}) == f"<{tag}>" # tags that get content passed through (i.e. parsed with parseRCDataRawtext) _raw_tags = [ "title", "textarea", "script", "style", "noembed", "noframes", "iframe", "xmp", ] @pytest.mark.parametrize( "raw_tag, data, expected", [ ( raw_tag, f"", f"<img src=x onerror=alert(1) />", ) for raw_tag in _raw_tags ], ) def test_noscript_rawtag_(raw_tag, data, expected): # refs: bug 1615315 / GHSA-q65m-pv3f-wr5r assert clean(data, tags={"noscript", raw_tag}) == expected @pytest.mark.parametrize( "namespace_tag, rc_data_element_tag, data, expected", [ ( namespace_tag, rc_data_element_tag, ( f"<{namespace_tag}><{rc_data_element_tag}>" + "" ), ( f"<{namespace_tag}><{rc_data_element_tag}>" + "<img src=x onerror=alert(1)>" + f"" ), ) for namespace_tag in ["math", "svg"] # https://dev.w3.org/html5/html-author/#rcdata-elements # https://html.spec.whatwg.org/index.html#parsing-html-fragments # in html5lib: 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', and 'noscript' for rc_data_element_tag in rcdataElements ], ) def test_namespace_rc_data_element_strip_false( namespace_tag, rc_data_element_tag, data, expected ): # refs: bug 1621692 / GHSA-m6xf-fq7q-8743 # # browsers will pull the img out of the namespace and rc data tag resulting in XSS assert ( clean(data, tags={namespace_tag, rc_data_element_tag}, strip=False) == expected ) @pytest.mark.parametrize( "namespace_tag, end_tag, eject_tag, data, expected", [ # eject with style ( "math", "p", "style", "", ), ( "math", "br", "style", "
", "", ), ( "svg", "p", "style", "
", "
", ), ( "svg", "br", "style", "
", "
", ), # eject with title ( "math", "p", "title", "", ), ( "math", "br", "title", "
", ), ( "svg", "br", "title", "
", ), # eject with noscript ( "math", "p", "noscript", "", ), ( "math", "br", "noscript", "
", "", ), ( "svg", "p", "noscript", "
", "
", ), ( "svg", "br", "noscript", "
", "
", ), # eject with script ( "math", "p", "script", "", ), ( "math", "br", "script", "
", "", ), ( "svg", "p", "script", "
", "
", ), ( "svg", "br", "script", "
", "
", ), # eject with noembed ( "math", "p", "noembed", "", ), ( "math", "br", "noembed", "
", "", ), ( "svg", "p", "noembed", "
", "
", ), ( "svg", "br", "noembed", "
", "
", ), # eject with textarea ( "math", "p", "textarea", "", ), ( "math", "br", "textarea", "
", ), ( "svg", "br", "textarea", "
", ), # eject with noframes ( "math", "p", "noframes", "", ), ( "math", "br", "noframes", "
", "", ), ( "svg", "p", "noframes", "
", "
", ), ( "svg", "br", "noframes", "
", "
", ), # eject with iframe ( "math", "p", "iframe", "", ), ( "math", "br", "iframe", "
", ), ( "svg", "br", "iframe", "
", ), # eject with xmp ( "math", "p", "xmp", "", ), ( "math", "br", "xmp", "
", "", ), ( "svg", "p", "xmp", "
", "
", ), ( "svg", "br", "xmp", "
", "
", ), ], ) def test_html_comments_escaped(namespace_tag, end_tag, eject_tag, data, expected): # refs: bug 1689399 / GHSA-vv2x-vrpj-qqpq # # p and br can be just an end tag (e.g.
==
) # # In browsers: # # * img and other tags break out of the svg or math namespace (e.g. == ) # * style does not (e.g. == ) # * style and other tags without child elements does not (e.g. == ) # * the breaking tag ejects trailing elements (e.g. == ) # # the ejected elements can trigger XSS assert ( clean(data, tags={namespace_tag, end_tag, eject_tag}, strip_comments=False) == expected ) @pytest.mark.parametrize( "text, expected", [ ( "
Test!
Hello
", "Test!\nHello", ), ( # with an internal space and escaped character "
This is our description! &
nice!
", "This is our description! &\nnice!", ), ( # note: double-wrap causes an initial newline--this can't really be # handled under the current design "
This is our description! &
nice!
", "\nThis is our description! &\nnice!", ), ( # newlines are used to keep lists and other elements readable ( "
This is our description! &
1
" + "
a
b
c
nice!
" ), "\nThis is our description! &\n1\n\na\nb\nc\nnice!", ), ], ) def test_strip_respects_block_level_elements(text, expected): """ Insert a newline between block level elements https://github.com/mozilla/bleach/issues/369 """ assert clean(text, tags=set(), strip=True) == expected def get_ids_and_tests(): """Retrieves regression tests from data/ directory :returns: list of ``(id, filedata)`` tuples """ datadir = os.path.join(os.path.dirname(__file__), "data") tests = [ os.path.join(datadir, fn) for fn in os.listdir(datadir) if fn.endswith(".test") ] # Sort numerically which makes it easier to iterate through them tests.sort(key=lambda x: int(os.path.basename(x).split(".", 1)[0])) testcases = [] for fn in tests: with open(fn) as fp: data = fp.read() testcases.append((os.path.basename(fn), data)) return testcases _regression_ids_and_tests = get_ids_and_tests() _regression_ids = [item[0] for item in _regression_ids_and_tests] _regression_tests = [item[1] for item in _regression_ids_and_tests] @pytest.mark.parametrize("test_case", _regression_tests, ids=_regression_ids) def test_regressions(test_case): """Regression tests for clean so we can see if there are issues""" test_data, expected = test_case.split("\n--\n") # NOTE(willkg): This strips input and expected which makes it easier to # maintain the files. If there comes a time when the input needs whitespace # at the beginning or end, then we'll have to figure out something else. test_data = test_data.strip() expected = expected.strip() assert clean(test_data) == expected def test_preserves_attributes_order(): html = """Link""" cleaned_html = clean(html, tags={"a"}, attributes={"a": ["href", "target"]}) assert cleaned_html == html @pytest.mark.parametrize( "attr", ( ["style"], {"*": ["style"]}, ), ) def test_css_sanitizer_warning(attr): # If you have "style" in attributes, but don't set a css_sanitizer, it # should raise a warning. with pytest.warns(NoCssSanitizerWarning): clean("foo", attributes=attr) class TestCleaner: def test_basics(self): TAGS = {"span", "br"} ATTRS = {"span": ["style"]} cleaner = Cleaner(tags=TAGS, attributes=ATTRS) assert ( cleaner.clean('a
test') == 'a
test' ) def test_filters(self): # Create a Filter that changes all the attr values to "moo" class MooFilter(Filter): def __iter__(self): for token in Filter.__iter__(self): if token["type"] in ["StartTag", "EmptyTag"] and token["data"]: for attr, value in token["data"].items(): token["data"][attr] = "moo" yield token ATTRS = {"img": ["rel", "src"]} TAGS = {"img"} cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) dirty = 'this is cute! ' assert cleaner.clean(dirty) == 'this is cute! '