Dummy

# -*- coding: utf-8 -*- import unittest import six from w3lib.html import (replace_entities, replace_tags, remove_comments, remove_tags_with_content, replace_escape_chars, remove_tags, unquote_markup, get_base_url, get_meta_refresh) class RemoveEntitiesTest(unittest.TestCase): def test_returns_unicode(self): # make sure it always return uncode assert isinstance(replace_entities(b'no entities'), six.text_type) assert isinstance(replace_entities(b'Price: £100!'), six.text_type) assert isinstance(replace_entities(u'no entities'), six.text_type) assert isinstance(replace_entities(u'Price: £100!'), six.text_type) def test_regular(self): # regular conversions self.assertEqual(replace_entities(u'As low as £100!'), u'As low as \xa3100!') self.assertEqual(replace_entities(b'As low as £100!'), u'As low as \xa3100!') self.assertEqual(replace_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold ½oz solid crucifix pendant'), u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant') def test_keep_entities(self): # keep some entities self.assertEqual(replace_entities(b'Low < High & Medium £ six', keep=['lt', 'amp']), u'Low < High & Medium \xa3 six') self.assertEqual(replace_entities(u'Low < High & Medium £ six', keep=[u'lt', u'amp']), u'Low < High & Medium \xa3 six') def test_illegal_entities(self): self.assertEqual(replace_entities('a < b &illegal; c � six', remove_illegal=False), u'a < b &illegal; c � six') self.assertEqual(replace_entities('a < b &illegal; c � six', remove_illegal=True), u'a < b c six') self.assertEqual(replace_entities('x≤y'), u'x\u2264y') self.assertEqual(replace_entities('xy'), u'xy') self.assertEqual(replace_entities('xy', remove_illegal=False), u'xy') def test_browser_hack(self): # check browser hack for numeric character references in the 80-9F range self.assertEqual(replace_entities('x™y', encoding='cp1252'), u'x\u2122y') self.assertEqual(replace_entities('x™y', encoding='cp1252'), u'x\u2122y') def test_missing_semicolon(self): for entity, result in ( ('<<!', '<some tag'), u'This text contains some tag') self.assertEqual(replace_tags(b'This text is very important', ' '), u'This text is very im port ant') def test_replace_tags_multiline(self): self.assertEqual(replace_tags(b'Click here'), u'Click here') class RemoveCommentsTest(unittest.TestCase): def test_returns_unicode(self): # make sure it always return unicode assert isinstance(remove_comments(b'without comments'), six.text_type) assert isinstance(remove_comments(b''), six.text_type) assert isinstance(remove_comments(u'without comments'), six.text_type) assert isinstance(remove_comments(u''), six.text_type) def test_no_comments(self): # text without comments self.assertEqual(remove_comments(u'text without comments'), u'text without comments') def test_remove_comments(self): # text with comments self.assertEqual(remove_comments(u''), u'') self.assertEqual(remove_comments(u'Hello'), u'Hello') self.assertEqual(remove_comments(u'Hello'), u'Hello') self.assertEqual(remove_comments(b"test whatever"), u'test whatever') self.assertEqual(remove_comments(b"test whatever"), u'test whatever') self.assertEqual(remove_comments(b"test """ self.assertEqual(get_meta_refresh(body, baseurl), (None, None)) def test_html_comments_with_uncommented_meta_refresh(self): # html comments must not interfere with uncommented meta refresh header baseurl = 'http://example.com' body = """-->""" self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://example.com/')) def test_float_refresh_intervals(self): # float refresh intervals baseurl = 'http://example.com' body = """""" self.assertEqual(get_meta_refresh(body, baseurl), (0.1, 'http://example.com/index.html')) body = """""" self.assertEqual(get_meta_refresh(body, baseurl), (3.1, 'http://example.com/index.html')) def test_tag_name(self): baseurl = 'http://example.org' body = """ Dummy blahablsdfsal& """ self.assertEqual(get_meta_refresh(body, baseurl), (None, None)) def test_leading_newline_in_url(self): baseurl = 'http://example.org' body = """ Dummy """ self.assertEqual(get_meta_refresh(body, baseurl), (0.0, 'http://www.example.org/index.php')) def test_inside_noscript(self): baseurl = 'http://example.org' body = """ """ self.assertEqual(get_meta_refresh(body, baseurl), (None, None)) self.assertEqual(get_meta_refresh(body, baseurl, ignore_tags=()), (0.0, "http://example.org/javascript_required")) def test_inside_script(self): baseurl = 'http://example.org' body = """ """ self.assertEqual(get_meta_refresh(body, baseurl), (None, None)) self.assertEqual(get_meta_refresh(body, baseurl, ignore_tags=()), (0.0, "http://example.org/foobar_required"))