"""Tests for ``bleach.linkify``, ``Linker`` and ``LinkifyFilter``.

NOTE(review): this file had been collapsed onto a few lines and the HTML
markup inside its string literals had been stripped. The anchor tags and
entities below were reconstructed from linkify's documented output format
(``<a href="...">`` with ``rel="nofollow"`` under the default callbacks, no
``rel`` for ``mailto:`` links) -- confirm against the upstream test suite.
"""

import re
from urllib.parse import quote_plus

import pytest

from bleach import linkify, DEFAULT_CALLBACKS as DC
from bleach.linkifier import Linker, LinkifyFilter
from bleach.sanitizer import Cleaner


def test_empty():
    """An empty string is returned unchanged."""
    assert linkify("") == ""


def test_simple_link():
    """Bare URLs, with or without a protocol, are wrapped in anchors."""
    assert (
        linkify("a http://example.com link")
        == 'a <a href="http://example.com" rel="nofollow">http://example.com</a> link'
    )
    assert (
        linkify("a https://example.com link")
        == 'a <a href="https://example.com" rel="nofollow">https://example.com</a> link'
    )
    assert (
        linkify("a example.com link")
        == 'a <a href="http://example.com" rel="nofollow">example.com</a> link'
    )


def test_trailing_slash():
    """A trailing slash stays part of the link."""
    assert (
        linkify("http://examp.com/")
        == '<a href="http://examp.com/" rel="nofollow">http://examp.com/</a>'
    )
    assert (
        linkify("http://example.com/foo/")
        == '<a href="http://example.com/foo/" rel="nofollow">http://example.com/foo/</a>'
    )
    assert (
        linkify("http://example.com/foo/bar/")
        == '<a href="http://example.com/foo/bar/" rel="nofollow">http://example.com/foo/bar/</a>'
    )


def test_mangle_link():
    """We can muck with the href attribute of the link."""

    def filter_url(attrs, new=False):
        # Route every link that does not already point at the bouncer through it.
        if not attrs.get((None, "href"), "").startswith("http://bouncer"):
            quoted = quote_plus(attrs[(None, "href")])
            attrs[(None, "href")] = "http://bouncer/?u={!s}".format(quoted)
        return attrs

    assert (
        linkify("http://example.com", callbacks=DC + [filter_url])
        == '<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">http://example.com</a>'
    )


def test_mangle_text():
    """We can muck with the inner text of a link."""

    def ft(attrs, new=False):
        attrs["_text"] = "bar"
        return attrs

    assert (
        linkify('http://ex.mp <a href="http://ex.mp/foo">foo</a>', callbacks=[ft])
        == '<a href="http://ex.mp">bar</a> <a href="http://ex.mp/foo">bar</a>'
    )


@pytest.mark.parametrize(
    "data,parse_email,expected",
    [
        ("a james@example.com mailto", False, "a james@example.com mailto"),
        ("a james@example.com.au mailto", False, "a james@example.com.au mailto"),
        (
            "a james@example.com mailto",
            True,
            'a <a href="mailto:james@example.com">james@example.com</a> mailto',
        ),
        (
            "aussie james@example.com.au mailto",
            True,
            'aussie <a href="mailto:james@example.com.au">james@example.com.au</a> mailto',
        ),
        # This is kind of a pathological case. I guess we do our best here.
        (
            'email to <a href="james@example.com">james@example.com</a>',
            True,
            'email to <a href="james@example.com" rel="nofollow">james@example.com</a>',
        ),
        (
            "<br>jinkyun@example.com",
            True,
            '<br><a href="mailto:jinkyun@example.com">jinkyun@example.com</a>',
        ),
        # Mailto links at the end of a sentence.
        (
            "mailto james@example.com.au.",
            True,
            'mailto <a href="mailto:james@example.com.au">james@example.com.au</a>.',
        ),
        # Incorrect email
        ('"\\\n"@opa.ru', True, '"\\\n"@opa.ru'),
        # RFC6068 special characters
        (
            "gorby%kremvax@example.com",
            True,
            '<a href="mailto:gorby%25kremvax@example.com">gorby%kremvax@example.com</a>',
        ),
        (
            "unlikely?address@example.com",
            True,
            '<a href="mailto:unlikely%3Faddress@example.com">unlikely?address@example.com</a>',
        ),
    ],
)
def test_email_link(data, parse_email, expected):
    """Email addresses become mailto links only when parse_email is set."""
    assert linkify(data, parse_email=parse_email) == expected


# NOTE(review): the href values below assume the local part is percent-encoded
# (as in the RFC6068 cases above) -- confirm against the linkifier.
@pytest.mark.parametrize(
    "data, expected",
    [
        (
            '"james"@example.com',
            """<a href="mailto:%22james%22@example.com">"james"@example.com</a>""",
        ),
        (
            '"j\'ames"@example.com',
            """<a href="mailto:%22j%27ames%22@example.com">"j'ames"@example.com</a>""",
        ),
        (
            '"ja>mes"@example.com',
            """<a href="mailto:%22ja%3Emes%22@example.com">"ja&gt;mes"@example.com</a>""",
        ),
    ],
)
def test_email_link_escaping(data, expected):
    """Quotes and angle brackets in the local part are escaped in the link."""
    assert linkify(data, parse_email=True) == expected


def no_new_links(attrs, new=False):
    """Callback that rejects links linkify is about to create."""
    if new:
        return None
    return attrs


def no_old_links(attrs, new=False):
    """Callback that rejects links already present in the input."""
    if not new:
        return None
    return attrs


def noop(attrs, new=False):
    """Callback that passes the attributes through untouched."""
    return attrs


@pytest.mark.parametrize(
    "callback,expected",
    [
        (
            [noop],
            'a <a href="http://ex.mp">ex.mp</a> <a href="http://example.com">example</a>',
        ),
        ([no_new_links, noop], 'a ex.mp <a href="http://example.com">example</a>'),
        ([noop, no_new_links], 'a ex.mp <a href="http://example.com">example</a>'),
        ([no_old_links, noop], 'a <a href="http://ex.mp">ex.mp</a> example'),
        ([noop, no_old_links], 'a <a href="http://ex.mp">ex.mp</a> example'),
        ([no_old_links, no_new_links], "a ex.mp example"),
    ],
)
def test_prevent_links(callback, expected):
    """Returning None from any callback should remove links or prevent them
    from being created."""
    text = 'a ex.mp <a href="http://example.com">example</a>'
    assert linkify(text, callbacks=callback) == expected


def test_set_attrs():
    """We can set random attributes on links."""

    def set_attr(attrs, new=False):
        attrs[(None, "rev")] = "canonical"
        return attrs

    assert (
        linkify("ex.mp", callbacks=[set_attr])
        == '<a href="http://ex.mp" rev="canonical">ex.mp</a>'
    )


def test_only_proto_links():
    """Only create links if there's a protocol."""

    def only_proto(attrs, new=False):
        if new and not attrs["_text"].startswith(("http:", "https:")):
            return None
        return attrs

    in_text = 'a ex.mp http://ex.mp <a href="/foo">bar</a>'
    assert (
        linkify(in_text, callbacks=[only_proto])
        == 'a ex.mp <a href="http://ex.mp">http://ex.mp</a> <a href="/foo">bar</a>'
    )


def test_stop_email():
    """Returning None should prevent a link from being created."""

    def no_email(attrs, new=False):
        if attrs[(None, "href")].startswith("mailto:"):
            return None
        return attrs

    text = "do not link james@example.com"
    assert linkify(text, parse_email=True, callbacks=[no_email]) == text


@pytest.mark.parametrize(
    "data,expected",
    [
        # tlds
        ("example.com", '<a href="http://example.com" rel="nofollow">example.com</a>'),
        ("example.co", '<a href="http://example.co" rel="nofollow">example.co</a>'),
        (
            "example.co.uk",
            '<a href="http://example.co.uk" rel="nofollow">example.co.uk</a>',
        ),
        ("example.edu", '<a href="http://example.edu" rel="nofollow">example.edu</a>'),
        ("example.xxx", '<a href="http://example.xxx" rel="nofollow">example.xxx</a>'),
        ("bit.ly/fun", '<a href="http://bit.ly/fun" rel="nofollow">bit.ly/fun</a>'),
        # non-tlds
        ("example.yyy", "example.yyy"),
        ("brie", "brie"),
    ],
)
def test_tlds(data, expected):
    """Only names ending in a recognised TLD are linkified."""
    assert linkify(data) == expected


@pytest.mark.parametrize(
    "data,expected",
    [
        ("< unrelated", "&lt; unrelated"),
        # NOTE(review): this case lost all of its markup; both strings were
        # left empty rather than guessed -- restore from the upstream suite.
        ("", ""),
    ],
)
def test_escaping(data, expected):
    """Stray markup characters come out escaped."""
    assert linkify(data) == expected


def test_nofollow_off():
    """With no callbacks, rel="nofollow" is not added."""
    assert (
        linkify("example.com", callbacks=[])
        == '<a href="http://example.com">example.com</a>'
    )


def test_link_in_html():
    """URLs nested inside other elements are still linkified."""
    assert (
        linkify("<i>http://yy.com</i>")
        == '<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>'
    )
    assert (
        linkify("<em><strong>http://xx.com</strong></em>")
        == '<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com</a></strong></em>'
    )


def test_links_https():
    """https URLs keep their protocol in the href."""
    assert (
        linkify("https://yy.com")
        == '<a href="https://yy.com" rel="nofollow">https://yy.com</a>'
    )


def test_add_rel_nofollow():
    """Verify that rel="nofollow" is added to an existing link"""
    assert (
        linkify('<a href="http://yy.com">http://yy.com</a>')
        == '<a href="http://yy.com" rel="nofollow">http://yy.com</a>'
    )


def test_url_with_path():
    """The full path is part of the link."""
    assert (
        linkify("http://example.com/path/to/file")
        == '<a href="http://example.com/path/to/file" rel="nofollow">'
        "http://example.com/path/to/file</a>"
    )


def test_link_ftp():
    """ftp URLs are linkified."""
    assert (
        linkify("ftp://ftp.mozilla.org/some/file")
        == '<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
        "ftp://ftp.mozilla.org/some/file</a>"
    )


def test_link_query():
    """Query strings are part of the link, with or without a path slash."""
    assert (
        linkify("http://xx.com/?test=win")
        == '<a href="http://xx.com/?test=win" rel="nofollow">http://xx.com/?test=win</a>'
    )
    assert (
        linkify("xx.com/?test=win")
        == '<a href="http://xx.com/?test=win" rel="nofollow">xx.com/?test=win</a>'
    )
    assert (
        linkify("xx.com?test=win")
        == '<a href="http://xx.com?test=win" rel="nofollow">xx.com?test=win</a>'
    )


def test_link_fragment():
    """Fragments are part of the link."""
    assert (
        linkify("http://xx.com/path#frag")
        == '<a href="http://xx.com/path#frag" rel="nofollow">http://xx.com/path#frag</a>'
    )


def test_link_entities():
    """Ampersands in the URL are escaped in both the href and the text."""
    assert (
        linkify("http://xx.com/?a=1&b=2")
        == '<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">http://xx.com/?a=1&amp;b=2</a>'
    )
# NOTE(review): the HTML markup inside the string literals of the tests below
# had been stripped (tags removed, entities decoded). It was reconstructed from
# linkify's documented output format -- confirm against the upstream test suite.


def test_escaped_html():
    """If I pass in escaped HTML, it should probably come out escaped."""
    s = "&lt;em&gt;strong&lt;/em&gt;"
    assert linkify(s) == s


def test_link_http_complete():
    """A URL with credentials, path, query and fragment is linked whole."""
    assert (
        linkify("https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f")
        == '<a href="https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f" rel="nofollow">'
        "https://user:pass@ftp.mozilla.org/x/y.exe?a=b&amp;c=d&amp;e#f</a>"
    )


def test_non_url():
    """document.vulnerable should absolutely not be linkified."""
    s = "document.vulnerable"
    assert linkify(s) == s


def test_javascript_url():
    """javascript: urls should never be linkified."""
    s = "javascript:document.vulnerable"
    assert linkify(s) == s


def test_unsafe_url():
    """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
    assert (
        linkify('All your{"xx.yy.com/grover.png"}base are')
        == 'All your{"<a href="http://xx.yy.com/grover.png" rel="nofollow">xx.yy.com/grover.png</a>"}'
        "base are"
    )


def test_skip_tags():
    """Skip linkification in skip tags"""
    simple = "http://xx.com <pre>http://xx.com</pre>"
    linked = (
        '<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
        "<pre>http://xx.com</pre>"
    )
    all_linked = (
        '<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
        '<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
        "</a></pre>"
    )
    assert linkify(simple, skip_tags=["pre"]) == linked
    assert linkify(simple) == all_linked

    # Existing links are still processed by callbacks, skip tag or not.
    already_linked = '<pre><a href="http://xx.com">xx</a></pre>'
    nofollowed = '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>'
    assert linkify(already_linked) == nofollowed
    assert linkify(already_linked, skip_tags=["pre"]) == nofollowed

    # Skipping must end with the skip tag, even when it has nested children.
    assert linkify(
        "<pre><code>http://example.com</code></pre>http://example.com",
        skip_tags=["pre"],
    ) == (
        "<pre><code>http://example.com</code></pre>"
        '<a href="http://example.com" rel="nofollow">http://example.com</a>'
    )


def test_libgl():
    """libgl.so.1 should not be linkified."""
    s = "libgl.so.1"
    assert linkify(s) == s


@pytest.mark.parametrize(
    "url,periods",
    [
        ("example.com", "."),
        ("example.com", "..."),
        ("ex.com/foo", "."),
        ("ex.com/foo", "...."),
    ],
)
def test_end_of_sentence(url, periods):
    """example.com. should match."""
    out = '<a href="http://{0!s}" rel="nofollow">{0!s}</a>{1!s}'
    intxt = "{0!s}{1!s}"

    assert linkify(intxt.format(url, periods)) == out.format(url, periods)


def test_end_of_clause():
    """example.com/foo, shouldn't include the ,"""
    assert (
        linkify("ex.com/foo, bar")
        == '<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar'
    )


def test_sarcasm():
    """Jokes should crash."""
    assert linkify("Yeah right <sarcasm/>") == "Yeah right &lt;sarcasm/&gt;"


@pytest.mark.parametrize(
    "data,expected_data",
    [
        # expected_data is (text before, href minus protocol, link text, text after)
        ("(example.com)", ("(", "example.com", "example.com", ")")),
        ("(example.com/)", ("(", "example.com/", "example.com/", ")")),
        ("(example.com/foo)", ("(", "example.com/foo", "example.com/foo", ")")),
        ("(((example.com/))))", ("(((", "example.com/", "example.com/", "))))")),
        ("example.com/))", ("", "example.com/", "example.com/", "))")),
        (
            "(foo http://example.com/)",
            ("(foo ", "example.com/", "http://example.com/", ")"),
        ),
        (
            "(foo http://example.com)",
            ("(foo ", "example.com", "http://example.com", ")"),
        ),
        (
            "http://en.wikipedia.org/wiki/Test_(assessment)",
            (
                "",
                "en.wikipedia.org/wiki/Test_(assessment)",
                "http://en.wikipedia.org/wiki/Test_(assessment)",
                "",
            ),
        ),
        (
            "(http://en.wikipedia.org/wiki/Test_(assessment))",
            (
                "(",
                "en.wikipedia.org/wiki/Test_(assessment)",
                "http://en.wikipedia.org/wiki/Test_(assessment)",
                ")",
            ),
        ),
        (
            "((http://en.wikipedia.org/wiki/Test_(assessment))",
            (
                "((",
                "en.wikipedia.org/wiki/Test_(assessment",
                "http://en.wikipedia.org/wiki/Test_(assessment",
                "))",
            ),
        ),
        (
            "(http://en.wikipedia.org/wiki/Test_(assessment)))",
            (
                "(",
                "en.wikipedia.org/wiki/Test_(assessment))",
                "http://en.wikipedia.org/wiki/Test_(assessment))",
                ")",
            ),
        ),
        (
            "(http://en.wikipedia.org/wiki/)Test_(assessment",
            (
                "(",
                "en.wikipedia.org/wiki/)Test_(assessment",
                "http://en.wikipedia.org/wiki/)Test_(assessment",
                "",
            ),
        ),
        (
            "hello (http://www.mu.de/blah.html) world",
            ("hello (", "www.mu.de/blah.html", "http://www.mu.de/blah.html", ") world"),
        ),
        (
            "hello (http://www.mu.de/blah.html). world",
            (
                "hello (",
                "www.mu.de/blah.html",
                "http://www.mu.de/blah.html",
                "). world",
            ),
        ),
    ],
)
def test_wrapping_parentheses(data, expected_data):
    """URLs wrapped in parentheses should not include them."""
    out = '{0!s}<a href="http://{1!s}" rel="nofollow">{2!s}</a>{3!s}'

    assert linkify(data) == out.format(*expected_data)


def test_parentheses_with_removing():
    """Parentheses survive when a callback rejects the link."""
    expected = "(test.py)"
    assert linkify(expected, callbacks=[lambda *a: None]) == expected


@pytest.mark.parametrize(
    "data,expected_data",
    [
        # expected_data is (linked URL, trailing text left outside the link)
        # Test valid ports
        ("http://foo.com:8000", ("http://foo.com:8000", "")),
        ("http://foo.com:8000/", ("http://foo.com:8000/", "")),
        # Test non ports
        ("http://bar.com:xkcd", ("http://bar.com", ":xkcd")),
        ("http://foo.com:81/bar", ("http://foo.com:81/bar", "")),
        ("http://foo.com:", ("http://foo.com", ":")),
        # Test non-ascii ports
        ("http://foo.com:\u0663\u0669/", ("http://foo.com", ":\u0663\u0669/")),
        (
            "http://foo.com:\U0001d7e0\U0001d7d8/",
            ("http://foo.com", ":\U0001d7e0\U0001d7d8/"),
        ),
    ],
)
def test_ports(data, expected_data):
    """URLs can contain port numbers."""
    out = '<a href="{0}" rel="nofollow">{0}</a>{1}'
    assert linkify(data) == out.format(*expected_data)


def test_ignore_bad_protocols():
    """An unknown protocol prefix is left as plain text."""
    assert linkify("foohttp://bar") == "foohttp://bar"
    assert (
        linkify("fohttp://exampl.com")
        == 'fohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>'
    )


def test_link_emails_and_urls():
    """parse_email=True shouldn't prevent URLs from getting linkified."""
    assert linkify("http://example.com person@example.com", parse_email=True) == (
        '<a href="http://example.com" rel="nofollow">'
        'http://example.com</a> <a href="mailto:person@example.com">'
        "person@example.com</a>"
    )


def test_links_case_insensitive():
    """Protocols and domain names are case insensitive."""
    expect = '<a href="HTTP://EXAMPLE.COM" rel="nofollow">HTTP://EXAMPLE.COM</a>'
    assert linkify("HTTP://EXAMPLE.COM") == expect


def test_elements_inside_links():
    """Child elements of an existing link are preserved."""
    assert (
        linkify('<a href="#">hello<br></a>')
        == '<a href="#" rel="nofollow">hello<br></a>'
    )

    assert (
        linkify('<a href="#"><strong>bold</strong> hello<br></a>')
        == '<a href="#" rel="nofollow"><strong>bold</strong> hello<br></a>'
    )


def test_drop_link_tags():
    """Verify that dropping link tags *just* drops the tag and not the content"""
    html = (
        'first <a href="http://example.com/1/">second</a> third <a href="http://example.com/2/">'
        "fourth</a> fifth"
    )
    assert (
        linkify(html, callbacks=[lambda attrs, new: None])
        == "first second third fourth fifth"
    )


# NOTE(review): the escaped entity used in these inputs was inferred
# (``&lt;br&gt;``); the stripped text only showed whitespace -- confirm.
@pytest.mark.parametrize(
    "text, expected",
    [
        ("&lt;br&gt;", "&lt;br&gt;"),
        (
            "&lt;br&gt; http://example.com",
            '&lt;br&gt; <a href="http://example.com" rel="nofollow">http://example.com</a>',
        ),
        (
            "&lt;br&gt; <br> http://example.com",
            '&lt;br&gt; <br> <a href="http://example.com" rel="nofollow">http://example.com</a>',
        ),
    ],
)
def test_naughty_unescaping(text, expected):
    """Verify that linkify is not unescaping things it shouldn't be"""
    assert linkify(text) == expected


def test_hang():
    """This string would hang linkify. Issue #200"""
    assert (
        linkify("an@email.com<mailto:an@email.com>", parse_email=True)
        == '<a href="mailto:an@email.com">an@email.com</a>&lt;mailto:<a href="mailto:an@email.com">an@email.com</a>&gt;'  # noqa
    )


def test_hyphen_in_mail():
    """Test hyphens `-` in mails. Issue #300."""
    assert (
        linkify("ex@am-ple.com", parse_email=True)
        == '<a href="mailto:ex@am-ple.com">ex@am-ple.com</a>'
    )


def test_url_re_arg():
    """Verifies that a specified url_re is used"""
    fred_re = re.compile(r"""(fred\.com)""")

    linker = Linker(url_re=fred_re)
    assert (
        linker.linkify("a b c fred.com d e f")
        == 'a b c <a href="http://fred.com" rel="nofollow">fred.com</a> d e f'
    )

    # URLs the custom pattern does not match are left alone.
    assert (
        linker.linkify("a b c http://example.com d e f")
        == "a b c http://example.com d e f"
    )


def test_email_re_arg():
    """Verifies that a specified email_re is used"""
    fred_re = re.compile(r"""(fred@example\.com)""")

    linker = Linker(parse_email=True, email_re=fred_re)
    assert (
        linker.linkify("a b c fred@example.com d e f")
        == 'a b c <a href="mailto:fred@example.com">fred@example.com</a> d e f'
    )

    # Addresses the custom pattern does not match are left alone.
    assert (
        linker.linkify("a b c jim@example.com d e f")
        == "a b c jim@example.com d e f"
    )


def test_recognized_tags_arg():
    """Verifies that recognized_tags works"""
    # The html parser doesn't recognize "sarcasm" as a tag, so it escapes it
    linker = Linker(recognized_tags=["p"])
    assert (
        linker.linkify("<p>http://example.com/</p><sarcasm>")
        == '<p><a href="http://example.com/" rel="nofollow">http://example.com/</a></p>&lt;sarcasm&gt;'  # noqa
    )

    # The html parser recognizes "sarcasm" as a tag and fixes it
    linker = Linker(recognized_tags=["p", "sarcasm"])
    assert (
        linker.linkify("<p>http://example.com/</p><sarcasm>")
        == '<p><a href="http://example.com/" rel="nofollow">http://example.com/</a></p><sarcasm></sarcasm>'  # noqa
    )


@pytest.mark.parametrize(
    "data",
    [
        "<span>text & </span>",
        "a < b",
        "link http://link.com",
        "text<em>",
        "jim &current joe",
        # Link with querystring items
        '<a href="http://example.com?b=1&c=2">',
    ],
)
def test_linkify_idempotent(data):
    """Running linkify on its own output changes nothing."""
    assert linkify(linkify(data)) == linkify(data)


class TestLinkify:
    """Tests for how linkify treats existing anchors and non-text input."""

    def test_no_href_links(self):
        """An anchor without an href is left untouched."""
        s = '<a name="anchor">x</a>'
        assert linkify(s) == s

    def test_rel_already_there(self):
        """Make sure rel attribute is updated not replaced"""
        linked = 'Click <a href="http://example.com" rel="tooltip">' "here</a>."

        link_good = (
            'Click <a href="http://example.com" rel="tooltip nofollow">here</a>.'
        )

        assert linkify(linked) == link_good
        assert linkify(link_good) == link_good

    def test_only_text_is_linkified(self):
        """Non-string input raises TypeError."""
        some_text = "text"
        some_type = int
        no_type = None

        assert linkify(some_text) == some_text

        with pytest.raises(TypeError):
            linkify(some_type)

        with pytest.raises(TypeError):
            linkify(no_type)


@pytest.mark.parametrize(
    "text, expected",
    [
        ("abc", "abc"),
        ("example.com", '<a href="http://example.com" rel="nofollow">example.com</a>'),
        (
            "http://example.com?b=1&c=2",
            '<a href="http://example.com?b=1&amp;c=2" rel="nofollow">http://example.com?b=1&amp;c=2</a>',
        ),
        (
            "http://example.com?b=1&amp;c=2",
            '<a href="http://example.com?b=1&amp;c=2" rel="nofollow">http://example.com?b=1&amp;c=2</a>',
        ),
        (
            "link: https://example.com/watch#anchor",
            'link: <a href="https://example.com/watch#anchor" rel="nofollow">https://example.com/watch#anchor</a>',
        ),
    ],
)
def test_linkify_filter(text, expected):
    """LinkifyFilter linkifies text when used as a Cleaner filter."""
    cleaner = Cleaner(filters=[LinkifyFilter])
    assert cleaner.clean(text) == expected