import re

try:
    from urllib.parse import quote_plus
except ImportError:
    # Python 2 fallback: quote_plus lives directly in urllib there.
    from urllib import quote_plus

import pytest

from bleach import linkify, DEFAULT_CALLBACKS as DC
from bleach.linkifier import Linker


# NOTE(review): many expected values below are identical to their inputs, and
# several literals contain bare line breaks (written here as ``\n`` escapes)
# where one would expect tags.  This looks like markup/entities were stripped
# from the string literals at some point -- confirm the expectations against
# the upstream test suite before relying on them.


def test_empty():
    """An empty string is returned unchanged."""
    assert linkify('') == ''


def test_simple_link():
    """Exercise http, https and scheme-less URLs embedded in plain text."""
    assert (
        linkify('a http://example.com link') ==
        'a http://example.com link'
    )
    assert (
        linkify('a https://example.com link') ==
        'a https://example.com link'
    )
    assert (
        linkify('a example.com link') ==
        'a example.com link'
    )


def test_trailing_slash():
    """Exercise URLs that end in a slash at various path depths."""
    assert (
        linkify('http://examp.com/') ==
        'http://examp.com/'
    )
    assert (
        linkify('http://example.com/foo/') ==
        'http://example.com/foo/'
    )
    assert (
        linkify('http://example.com/foo/bar/') ==
        'http://example.com/foo/bar/'
    )


def test_mangle_link():
    """We can muck with the href attribute of the link."""
    def filter_url(attrs, new=False):
        # Only rewrite hrefs that do not already point at the bouncer.
        if not attrs.get((None, 'href'), '').startswith('http://bouncer'):
            quoted = quote_plus(attrs[(None, 'href')])
            attrs[(None, 'href')] = 'http://bouncer/?u={0!s}'.format(quoted)
        return attrs

    assert (
        linkify('http://example.com', DC + [filter_url]) ==
        'http://example.com'
    )


def test_mangle_text():
    """We can muck with the inner text of a link."""

    def ft(attrs, new=False):
        attrs['_text'] = 'bar'
        return attrs

    assert (
        linkify('http://ex.mp foo', [ft]) ==
        'bar bar'
    )


@pytest.mark.parametrize('data,parse_email,expected', [
    (
        'a james@example.com mailto',
        False,
        'a james@example.com mailto'
    ),
    (
        'a james@example.com.au mailto',
        False,
        'a james@example.com.au mailto'
    ),
    (
        'a james@example.com mailto',
        True,
        'a james@example.com mailto'
    ),
    (
        'aussie james@example.com.au mailto',
        True,
        'aussie james@example.com.au mailto'
    ),
    # This is kind of a pathological case. I guess we do our best here.
    (
        'email to james@example.com',
        True,
        'email to james@example.com'
    ),
    (
        '\njinkyun@example.com',
        True,
        '\njinkyun@example.com'
    ),
    # Mailto links at the end of a sentence.
    (
        'mailto james@example.com.au.',
        True,
        'mailto james@example.com.au.'
    ),
    # Incorrect email
    (
        '"\\\n"@opa.ru',
        True,
        '"\\\n"@opa.ru'
    ),
])
def test_email_link(data, parse_email, expected):
    """Exercise email handling with parse_email both off and on."""
    assert linkify(data, parse_email=parse_email) == expected


@pytest.mark.parametrize('data,expected', [
    (
        '"james"@example.com',
        '''"james"@example.com'''
    ),
    (
        '"j\'ames"@example.com',
        '''"j'ames"@example.com'''
    ),
    (
        '"ja>mes"@example.com',
        '''"ja>mes"@example.com'''
    ),
])
def test_email_link_escaping(data, expected):
    """Exercise email addresses whose local part contains quote/angle chars."""
    assert linkify(data, parse_email=True) == expected


def no_new_links(attrs, new=False):
    """Callback: return None when ``new`` is true, otherwise pass attrs on."""
    if new:
        return None
    return attrs


def no_old_links(attrs, new=False):
    """Callback: return None when ``new`` is false, otherwise pass attrs on."""
    if not new:
        return None
    return attrs


def noop(attrs, new=False):
    """Callback: return attrs untouched."""
    return attrs


@pytest.mark.parametrize('callback,expected', [
    (
        [noop],
        'a ex.mp example'
    ),
    (
        [no_new_links, noop],
        'a ex.mp example'
    ),
    (
        [noop, no_new_links],
        'a ex.mp example'
    ),
    (
        [no_old_links, noop],
        'a ex.mp example'
    ),
    (
        [noop, no_old_links],
        'a ex.mp example'
    ),
    (
        [no_old_links, no_new_links],
        'a ex.mp example'
    )
])
def test_prevent_links(callback, expected):
    """Returning None from any callback should remove links or prevent them
    from being created."""
    text = 'a ex.mp example'
    assert linkify(text, callback) == expected


def test_set_attrs():
    """We can set random attributes on links."""

    def set_attr(attrs, new=False):
        attrs[(None, u'rev')] = u'canonical'
        return attrs

    assert (
        linkify('ex.mp', [set_attr]) ==
        'ex.mp'
    )


def test_only_proto_links():
    """Only create links if there's a protocol."""
    def only_proto(attrs, new=False):
        if new and not attrs['_text'].startswith(('http:', 'https:')):
            return None
        return attrs

    in_text = 'a ex.mp http://ex.mp bar'
    assert (
        linkify(in_text, [only_proto]) ==
        'a ex.mp http://ex.mp bar'
    )


def test_stop_email():
    """Returning None should prevent a link from being created."""
    def no_email(attrs, new=False):
        if attrs[(None, 'href')].startswith('mailto:'):
            return None
        return attrs

    text = 'do not link james@example.com'

    assert linkify(text, parse_email=True, callbacks=[no_email]) == text


@pytest.mark.parametrize('data,expected', [
    # tlds
    ('example.com', 'example.com'),
    ('example.co', 'example.co'),
    ('example.co.uk', 'example.co.uk'),
    ('example.edu', 'example.edu'),
    ('example.xxx', 'example.xxx'),
    ('bit.ly/fun', 'bit.ly/fun'),

    # non-tlds
    ('example.yyy', 'example.yyy'),
    ('brie', 'brie'),
])
def test_tlds(data, expected):
    """Exercise names ending in recognised and unrecognised TLDs."""
    assert linkify(data) == expected


def test_escaping():
    """Exercise text containing a bare ``<``."""
    assert linkify('< unrelated') == '< unrelated'


def test_nofollow_off():
    """Exercise linkify with an explicitly empty callback list."""
    assert linkify('example.com', []) == 'example.com'


def test_link_in_html():
    """Exercise URLs that appear inside surrounding HTML."""
    assert (
        linkify('http://yy.com') ==
        'http://yy.com'
    )
    assert (
        linkify('http://xx.com') ==
        'http://xx.com'
    )


def test_links_https():
    """Exercise an https URL."""
    assert (
        linkify('https://yy.com') ==
        'https://yy.com'
    )


def test_add_rel_nofollow():
    """Verify that rel="nofollow" is added to an existing link"""
    assert (
        linkify('http://yy.com') ==
        'http://yy.com'
    )


def test_url_with_path():
    """Exercise a URL with a multi-segment path."""
    assert (
        linkify('http://example.com/path/to/file') ==
        ''
        'http://example.com/path/to/file'
    )


def test_link_ftp():
    """Exercise an ftp:// URL."""
    assert (
        linkify('ftp://ftp.mozilla.org/some/file') ==
        ''
        'ftp://ftp.mozilla.org/some/file'
    )


def test_link_query():
    """Exercise URLs carrying a query string, with and without a scheme."""
    assert (
        linkify('http://xx.com/?test=win') ==
        'http://xx.com/?test=win'
    )
    assert (
        linkify('xx.com/?test=win') ==
        'xx.com/?test=win'
    )
    assert (
        linkify('xx.com?test=win') ==
        'xx.com?test=win'
    )


def test_link_fragment():
    """Exercise a URL carrying a fragment."""
    assert (
        linkify('http://xx.com/path#frag') ==
        'http://xx.com/path#frag'
    )


def test_link_entities():
    """Exercise a URL whose query string contains an ampersand."""
    assert (
        linkify('http://xx.com/?a=1&b=2') ==
        'http://xx.com/?a=1&b=2'
    )


def test_escaped_html():
    """If I pass in escaped HTML, it should probably come out escaped."""
    s = 'strong'
    assert linkify(s) == s


def test_link_http_complete():
    """Exercise a URL with credentials, path, query string and fragment."""
    assert (
        linkify('https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f') ==
        ''
        'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'
    )


def test_non_url():
    """document.vulnerable should absolutely not be linkified."""
    s = 'document.vulnerable'
    assert linkify(s) == s


def test_javascript_url():
    """javascript: urls should never be linkified."""
    s = 'javascript:document.vulnerable'
    assert linkify(s) == s


def test_unsafe_url():
    """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
    assert (
        linkify('All your{"xx.yy.com/grover.png"}base are') ==
        'All your{"xx.yy.com/grover.png"}'
        'base are'
    )


def test_skip_tags():
    """Skip linkification in skip tags"""
    simple = 'http://xx.com\n\nhttp://xx.com\n\n'
    linked = ('http://xx.com '
              '\n\nhttp://xx.com\n\n')
    all_linked = ('http://xx.com '
                  '\n\nhttp://xx.com'
                  '\n\n')
    assert linkify(simple, skip_tags=['pre']) == linked
    assert linkify(simple) == all_linked

    already_linked = '\n\nxx\n\n'
    nofollowed = '\n\nxx\n\n'
    assert linkify(already_linked) == nofollowed
    assert linkify(already_linked, skip_tags=['pre']) == nofollowed

    assert (
        linkify('\n\nhttp://example.com\n\nhttp://example.com',
                skip_tags=['pre']) ==
        (
            '\n\nhttp://example.com\n\n'
            'http://example.com'
        )
    )


def test_libgl():
    """libgl.so.1 should not be linkified."""
    s = 'libgl.so.1'
    assert linkify(s) == s


@pytest.mark.parametrize('url,periods', [
    ('example.com', '.'),
    ('example.com', '...'),
    ('ex.com/foo', '.'),
    ('ex.com/foo', '....'),
])
def test_end_of_sentence(url, periods):
    """example.com. should match."""
    out = '{0!s}{1!s}'
    intxt = '{0!s}{1!s}'

    assert linkify(intxt.format(url, periods)) == out.format(url, periods)


def test_end_of_clause():
    """example.com/foo, shouldn't include the ,"""
    assert (
        linkify('ex.com/foo, bar') ==
        'ex.com/foo, bar'
    )


@pytest.mark.xfail(reason='html5lib >= 0.99999999: changed API')
def test_sarcasm():
    """Jokes should crash."""
    assert linkify('Yeah right ') == 'Yeah right <sarcasm/>'


@pytest.mark.parametrize('data,expected_data', [
    (
        '(example.com)',
        ('(', 'example.com', 'example.com', ')')
    ),
    (
        '(example.com/)',
        ('(', 'example.com/', 'example.com/', ')')
    ),
    (
        '(example.com/foo)',
        ('(', 'example.com/foo', 'example.com/foo', ')')
    ),
    (
        '(((example.com/))))',
        ('(((', 'example.com/', 'example.com/', '))))')
    ),
    (
        'example.com/))',
        ('', 'example.com/', 'example.com/', '))')
    ),
    (
        '(foo http://example.com/)',
        ('(foo ', 'example.com/', 'http://example.com/', ')')
    ),
    (
        '(foo http://example.com)',
        ('(foo ', 'example.com', 'http://example.com', ')')
    ),
    (
        'http://en.wikipedia.org/wiki/Test_(assessment)',
        ('',
         'en.wikipedia.org/wiki/Test_(assessment)',
         'http://en.wikipedia.org/wiki/Test_(assessment)',
         '')
    ),
    (
        '(http://en.wikipedia.org/wiki/Test_(assessment))',
        ('(',
         'en.wikipedia.org/wiki/Test_(assessment)',
         'http://en.wikipedia.org/wiki/Test_(assessment)',
         ')')
    ),
    (
        '((http://en.wikipedia.org/wiki/Test_(assessment))',
        ('((',
         'en.wikipedia.org/wiki/Test_(assessment',
         'http://en.wikipedia.org/wiki/Test_(assessment',
         '))')
    ),
    (
        '(http://en.wikipedia.org/wiki/Test_(assessment)))',
        ('(',
         'en.wikipedia.org/wiki/Test_(assessment))',
         'http://en.wikipedia.org/wiki/Test_(assessment))',
         ')')
    ),
    (
        '(http://en.wikipedia.org/wiki/)Test_(assessment',
        ('(',
         'en.wikipedia.org/wiki/)Test_(assessment',
         'http://en.wikipedia.org/wiki/)Test_(assessment',
         '')
    ),
    (
        'hello (http://www.mu.de/blah.html) world',
        ('hello (',
         'www.mu.de/blah.html',
         'http://www.mu.de/blah.html',
         ') world')
    ),
    (
        'hello (http://www.mu.de/blah.html). world',
        ('hello (',
         'www.mu.de/blah.html',
         'http://www.mu.de/blah.html',
         '). world')
    )
])
def test_wrapping_parentheses(data, expected_data):
    """URLs wrapped in parentheses should not include them."""
    # Index 1 of each expected_data tuple is not referenced by this template.
    out = '{0!s}{2!s}{3!s}'

    assert linkify(data) == out.format(*expected_data)


def test_parentheses_with_removing():
    """Exercise parenthesised text with a callback that always returns None."""
    expected = '(test.py)'
    assert linkify(expected, callbacks=[lambda *a: None]) == expected


@pytest.mark.parametrize('data,expected_data', [
    # Test valid ports
    ('http://foo.com:8000', ('http://foo.com:8000', '')),
    ('http://foo.com:8000/', ('http://foo.com:8000/', '')),

    # Test non ports
    ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')),
    ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')),
    ('http://foo.com:', ('http://foo.com', ':')),

    # Test non-ascii ports
    ('http://foo.com:\u0663\u0669/', ('http://foo.com', ':\u0663\u0669/')),
    ('http://foo.com:\U0001d7e0\U0001d7d8/',
     ('http://foo.com', ':\U0001d7e0\U0001d7d8/')),
])
def test_ports(data, expected_data):
    """URLs can contain port numbers."""
    out = '{0}{1}'
    assert linkify(data) == out.format(*expected_data)


def test_ignore_bad_protocols():
    """Exercise text where 'http://' is glued onto a preceding word."""
    assert (
        linkify('foohttp://bar') ==
        'foohttp://bar'
    )
    assert (
        linkify('fohttp://exampl.com') ==
        'fohttp://exampl.com'
    )


def test_link_emails_and_urls():
    """parse_email=True shouldn't prevent URLs from getting linkified."""
    assert (
        linkify('http://example.com person@example.com', parse_email=True) ==
        (
            ''
            'http://example.com '
            'person@example.com'
        )
    )


def test_links_case_insensitive():
    """Protocols and domain names are case insensitive."""
    expect = 'HTTP://EXAMPLE.COM'
    assert linkify('HTTP://EXAMPLE.COM') == expect


def test_elements_inside_links():
    """Exercise link text that is followed by / wrapped around other elements."""
    assert (
        linkify('hello\n') ==
        'hello\n'
    )

    assert (
        linkify('bold hello\n') ==
        'bold hello\n'
    )


def test_drop_link_tags():
    """Verify that dropping link tags *just* drops the tag and not the content"""
    html = (
        'first second third '
        'fourth fifth'
    )
    assert (
        linkify(html, callbacks=[lambda attrs, new: None]) ==
        'first second third fourth fifth'
    )


@pytest.mark.parametrize('text, expected', [
    (u' ', u' '),
    (
        u' http://example.com',
        u' http://example.com'
    ),
    (
        u' \nhttp://example.com',
        u' \nhttp://example.com'
    )
])
def test_naughty_unescaping(text, expected):
    """Verify that linkify is not unescaping things it shouldn't be"""
    assert linkify(text) == expected


def test_hang():
    """This string would hang linkify. Issue #200"""
    assert (
        linkify("an@email.com", parse_email=True) ==
        'an@email.com'
    )


def test_url_re_arg():
    """Verifies that a specified url_re is used"""
    fred_re = re.compile(r"""(fred\.com)""")

    linker = Linker(url_re=fred_re)
    assert (
        linker.linkify('a b c fred.com d e f') ==
        'a b c fred.com d e f'
    )

    assert (
        linker.linkify('a b c http://example.com d e f') ==
        'a b c http://example.com d e f'
    )


def test_email_re_arg():
    """Verifies that a specified email_re is used"""
    fred_re = re.compile(r"""(fred@example\.com)""")

    linker = Linker(parse_email=True, email_re=fred_re)
    assert (
        linker.linkify('a b c fred@example.com d e f') ==
        'a b c fred@example.com d e f'
    )

    assert (
        linker.linkify('a b c jim@example.com d e f') ==
        'a b c jim@example.com d e f'
    )


def test_linkify_idempotent():
    """Running linkify on its own output gives the same result again."""
    dirty = 'invalid & < extra http://link.com'
    assert linkify(linkify(dirty)) == linkify(dirty)


class TestLinkify:
    """Class-grouped linkify tests."""

    def test_no_href_links(self):
        """Exercise input that should come back unchanged."""
        s = 'x'
        assert linkify(s) == s

    def test_rel_already_there(self):
        """Make sure rel attribute is updated not replaced"""
        linked = ('Click '
                  'here.')

        link_good = 'Click here.'

        assert linkify(linked) == link_good
        assert linkify(link_good) == link_good