# -*- coding: utf-8 -*-
"""Unit tests for the text clean-up helpers in ``scrapy.utils.markup``.

The module targets Python 2: every helper is expected to return a
``unicode`` object, and several fixtures are deliberately byte strings.
"""
import unittest

from scrapy.utils.markup import (
    remove_comments,
    remove_entities,
    remove_tags,
    remove_tags_with_content,
    replace_escape_chars,
    replace_tags,
    unquote_markup,
)


class UtilsMarkupTest(unittest.TestCase):
    """Exercise each helper imported from ``scrapy.utils.markup``.

    NOTE(review): the tags and entity references inside the fixtures were
    restored so that every input is consistent with its expected value;
    confirm the exact spellings against the upstream test-suite.
    """

    def test_remove_entities(self):
        """Entities are decoded, optionally kept, or dropped when illegal."""
        # the result must always be unicode, even for byte-string input
        self.assertTrue(isinstance(remove_entities('no entities'), unicode))
        self.assertTrue(
            isinstance(remove_entities('Price: &pound;100!'), unicode))

        # regular conversions: numeric and named references
        self.assertEqual(remove_entities(u'As low as &#163;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('As low as &pound;100!'),
                         u'As low as \xa3100!')
        # bare ampersands in a query string must be left untouched
        self.assertEqual(
            remove_entities(
                'redirectTo=search&searchtext=MR0221Y&aff=buyat'
                '&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL'
                '-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
            u'redirectTo=search&searchtext=MR0221Y&aff=buyat'
            u'&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL'
            u'-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')

        # entities listed in ``keep`` stay encoded
        self.assertEqual(
            remove_entities('<b>Low &lt; High &amp; Medium &pound; six</b>',
                            keep=['lt', 'amp']),
            u'<b>Low &lt; High &amp; Medium \xa3 six</b>')

        # illegal entities are preserved or removed depending on the flag;
        # removal leaves the surrounding spaces in place (hence the doubles)
        self.assertEqual(
            remove_entities('a &lt; b &illegal; c &#12345678; six',
                            remove_illegal=False),
            u'a < b &illegal; c &#12345678; six')
        self.assertEqual(
            remove_entities('a &lt; b &illegal; c &#12345678; six',
                            remove_illegal=True),
            u'a < b  c  six')
        self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')

        # browser hack for numeric character references in the 80-9F range
        self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'),
                         u'x\u2122y')

        # a raw cp1252 byte and two references all map to the same character
        self.assertEqual(
            remove_entities('x\x99&#153;&#8482;y', encoding='cp1252'),
            u'x\u2122\u2122\u2122y')

    def test_replace_tags(self):
        """Tags are dropped, or swapped for the given replacement token."""
        # the result must always be unicode
        self.assertTrue(isinstance(replace_tags('no entities'), unicode))

        self.assertEqual(
            replace_tags(u'This text contains <a>some tag</a>'),
            u'This text contains some tag')
        self.assertEqual(
            replace_tags('This text is very im<b>port</b>ant', ' '),
            u'This text is very im port ant')

        # a tag whose attributes span several lines
        self.assertEqual(
            replace_tags('Click <a class="one"\r\n href="url">here</a>'),
            u'Click here')

    def test_remove_comments(self):
        """HTML comments are removed; other text is left alone."""
        # the result must always be unicode
        self.assertTrue(
            isinstance(remove_comments('without comments'), unicode))
        self.assertTrue(
            isinstance(remove_comments('<!-- with comments -->'), unicode))

        # text without comments
        self.assertEqual(remove_comments(u'text without comments'),
                         u'text without comments')

        # text with comments
        self.assertEqual(remove_comments(u'<!--text with comments-->'), u'')
        self.assertEqual(remove_comments(u'Hello<!--World-->'), u'Hello')

    def test_remove_tags(self):
        """Tags are removed but their content is kept."""
        # the result must always be unicode
        self.assertTrue(isinstance(remove_tags('no tags'), unicode))
        self.assertTrue(
            isinstance(remove_tags('no tags', which_ones=('p',)), unicode))
        self.assertTrue(isinstance(remove_tags('<p>one tag</p>'), unicode))
        self.assertTrue(
            isinstance(remove_tags('<p>one tag</p>', which_ones=('p',)),
                       unicode))
        self.assertTrue(
            isinstance(remove_tags('<a>link</a>', which_ones=('b',)),
                       unicode))

        # text without tags
        self.assertEqual(remove_tags(u'no tags'), u'no tags')
        self.assertEqual(remove_tags(u'no tags', which_ones=('p', 'b')),
                         u'no tags')

        # text with tags: only the tags named in ``which_ones`` are affected
        self.assertEqual(remove_tags(u'<p>one p tag</p>'), u'one p tag')
        self.assertEqual(
            remove_tags(u'<p>one p tag</p>', which_ones=('b',)),
            u'<p>one p tag</p>')
        self.assertEqual(
            remove_tags(u'<b>not will removed</b><i>i will removed</i>',
                        which_ones=('i',)),
            u'<b>not will removed</b>i will removed')

        # text with tags and attributes
        self.assertEqual(
            remove_tags(u'<p align="center" class="one">texty</p>'),
            u'texty')
        self.assertEqual(
            remove_tags(u'<p align="center" class="one">texty</p>',
                        which_ones=('b',)),
            u'<p align="center" class="one">texty</p>')

    def test_remove_tags_with_content(self):
        """Selected tags are removed together with everything inside them."""
        # the result must always be unicode
        self.assertTrue(
            isinstance(remove_tags_with_content('no tags'), unicode))
        self.assertTrue(
            isinstance(
                remove_tags_with_content('no tags', which_ones=('p',)),
                unicode))
        self.assertTrue(
            isinstance(
                remove_tags_with_content('<p>one tag</p>', which_ones=('p',)),
                unicode))
        self.assertTrue(
            isinstance(
                remove_tags_with_content('<a>link</a>', which_ones=('b',)),
                unicode))

        # text without tags
        self.assertEqual(remove_tags_with_content(u'no tags'), u'no tags')
        self.assertEqual(
            remove_tags_with_content(u'no tags', which_ones=('p', 'b')),
            u'no tags')

        # text with tags: nothing is removed unless ``which_ones`` names it
        self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>'),
                         u'<p>one p tag</p>')
        self.assertEqual(
            remove_tags_with_content(u'<p>one p tag</p>', which_ones=('p',)),
            u'')
        self.assertEqual(
            remove_tags_with_content(
                u'<b>not will removed</b><i>i will removed</i>',
                which_ones=('i',)),
            u'<b>not will removed</b>')

    def test_replace_escape_chars(self):
        """Escape characters are replaced by ``replace_by``."""
        # the result must always be unicode
        self.assertTrue(isinstance(replace_escape_chars('no ec'), unicode))
        self.assertTrue(
            isinstance(replace_escape_chars('no ec', replace_by='str'),
                       unicode))
        self.assertTrue(
            isinstance(
                replace_escape_chars('no ec', which_ones=('\n', '\t')),
                unicode))

        # text without escape chars
        self.assertEqual(replace_escape_chars(u'no ec'), u'no ec')
        self.assertEqual(
            replace_escape_chars(u'no ec', which_ones=('\n',)), u'no ec')

        # text with escape chars
        self.assertEqual(replace_escape_chars(u'escape\n\n'), u'escape')
        self.assertEqual(
            replace_escape_chars(u'escape\n', which_ones=('\t',)),
            u'escape\n')
        self.assertEqual(
            replace_escape_chars(u'escape\tchars\n', which_ones=('\t',)),
            'escapechars\n')
        self.assertEqual(
            replace_escape_chars(u'escape\tchars\n', replace_by=' '),
            'escape chars ')
        self.assertEqual(
            replace_escape_chars(u'escape\tchars\n', replace_by=u'\xa3'),
            u'escape\xa3chars\xa3')
        # a UTF-8 encoded byte-string replacement yields the same result
        self.assertEqual(
            replace_escape_chars(u'escape\tchars\n', replace_by='\xc2\xa3'),
            u'escape\xa3chars\xa3')

    def test_unquote_markup(self):
        """Entities are decoded outside CDATA; CDATA content is kept raw."""
        sample_txt1 = (
            u'<node1>hi, this is sample text with entities: &amp; &copy;\n'
            u'<![CDATA[although this is inside a cdata! &amp; &quot;]]>'
            u'</node1>'
        )
        sample_txt2 = (
            u'<node2>blah&amp;blah<![CDATA[blahblahblah!&pound;]]>'
            u'moreblah&lt;&gt;</node2>'
        )
        # ends in an unterminated tag on purpose
        sample_txt3 = (
            u'something&pound;&amp;more<node3>'
            u'<![CDATA[things, stuff, and such]]>what&quot;ever</node3><node4'
        )

        expected1 = (
            u'<node1>hi, this is sample text with entities: & \xa9\n'
            u'although this is inside a cdata! &amp; &quot;</node1>'
        )
        expected2 = u'<node2>blah&blahblahblahblah!&pound;moreblah<></node2>'

        # NOTE(review): these two type checks mirror the other tests in this
        # class; confirm them against the upstream test-suite.
        self.assertTrue(
            isinstance(unquote_markup(sample_txt1.encode('latin-1')),
                       unicode))
        self.assertTrue(isinstance(unquote_markup(sample_txt2), unicode))

        self.assertEqual(unquote_markup(sample_txt1), expected1)
        self.assertEqual(unquote_markup(sample_txt2), expected2)
        self.assertEqual(unquote_markup(sample_txt1 + sample_txt2),
                         expected1 + expected2)
        self.assertEqual(
            unquote_markup(sample_txt3),
            u'something\xa3&more<node3>things, stuff, and such'
            u'what"ever</node3><node4')