# -*- coding: utf-8 -*-
import unittest
from scrapy.utils.markup import remove_entities, replace_tags, remove_comments
from scrapy.utils.markup import remove_tags_with_content, replace_escape_chars, remove_tags
from scrapy.utils.markup import unquote_markup
class UtilsMarkupTest(unittest.TestCase):
    """Tests for the text helpers imported from ``scrapy.utils.markup``.

    Each test first checks that the helper returns a ``unicode`` object and
    then compares its output against the expected text for a handful of
    representative inputs.
    """

    def test_remove_entities(self):
        """Entity references are converted to the characters they name."""
        # make sure it always returns unicode
        self.assertTrue(isinstance(remove_entities('no entities'), unicode))
        self.assertTrue(isinstance(remove_entities('Price: &pound;100!'), unicode))

        # regular conversions: numeric and named references are converted,
        # while bare ampersands that do not start a reference are left alone
        self.assertEqual(remove_entities(u'As low as &#163;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('As low as &pound;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
                         u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')

        # keep some entities: the ones named in ``keep`` must survive verbatim
        self.assertEqual(remove_entities('<b>Low &lt; High &amp; Medium &pound; six</b>', keep=['lt', 'amp']),
                         u'<b>Low &lt; High &amp; Medium \xa3 six</b>')

        # illegal entities (unknown name, out-of-range number) are either
        # left untouched or dropped, depending on ``remove_illegal``
        self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
                         u'a < b &illegal; c &#12345678; six')
        # dropping a reference keeps the spaces on both sides of it, hence
        # the double spaces in the expected text
        self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
                         u'a < b  c  six')
        self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')

        # check browser hack for numeric character references in the 80-9F range
        self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')

        # encoding: a raw cp1252 byte, an 80-9F reference and the real code
        # point must all end up as the same character
        self.assertEqual(remove_entities('x\x99&#153;&#8482;y', encoding='cp1252'),
                         u'x\u2122\u2122\u2122y')

    def test_replace_tags(self):
        """Tags are replaced by the given token (nothing by default)."""
        # make sure it always returns unicode
        self.assertTrue(isinstance(replace_tags('no entities'), unicode))

        self.assertEqual(replace_tags(u'This text contains <a>some tag</a>'),
                         u'This text contains some tag')
        self.assertEqual(replace_tags('This text is very im<b>port</b>ant', ' '),
                         u'This text is very im port ant')

        # multiline tags
        self.assertEqual(replace_tags('Click <a class="one"\r\n href="url">here</a>'),
                         u'Click here')

    def test_remove_comments(self):
        """HTML comments are stripped; everything else is left as is."""
        # make sure it always returns unicode
        self.assertTrue(isinstance(remove_comments('without comments'), unicode))
        self.assertTrue(isinstance(remove_comments('<!-- with comments -->'), unicode))

        # text without comments
        self.assertEqual(remove_comments(u'text without comments'), u'text without comments')

        # text with comments
        self.assertEqual(remove_comments(u'<!--text with comments-->'), u'')
        self.assertEqual(remove_comments(u'Hello<!--World-->'), u'Hello')

    def test_remove_tags(self):
        """Tags are removed but their content is kept.

        Without ``which_ones`` every tag is removed; with it, only the
        listed tags are.
        """
        # make sure it always returns unicode
        self.assertTrue(isinstance(remove_tags('no tags'), unicode))
        self.assertTrue(isinstance(remove_tags('no tags', which_ones=('p',)), unicode))
        self.assertTrue(isinstance(remove_tags('<p>one tag</p>'), unicode))
        self.assertTrue(isinstance(remove_tags('<p>one tag</p>', which_ones=('p',)), unicode))
        self.assertTrue(isinstance(remove_tags('<a>link</a>', which_ones=('b',)), unicode))

        # text without tags
        self.assertEqual(remove_tags(u'no tags'), u'no tags')
        self.assertEqual(remove_tags(u'no tags', which_ones=('p', 'b',)), u'no tags')

        # text with tags
        self.assertEqual(remove_tags(u'<p>one p tag</p>'), u'one p tag')
        self.assertEqual(remove_tags(u'<p>one p tag</p>', which_ones=('b',)), u'<p>one p tag</p>')
        self.assertEqual(remove_tags(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
                         u'<b>not will removed</b>i will removed')

        # text with tags and attributes
        self.assertEqual(remove_tags(u'<p align="center" class="one">texty</p>'), u'texty')
        self.assertEqual(remove_tags(u'<p align="center" class="one">texty</p>', which_ones=('b',)),
                         u'<p align="center" class="one">texty</p>')

    def test_remove_tags_with_content(self):
        """Listed tags are removed together with their content.

        Without ``which_ones`` the text is returned unchanged.
        """
        # make sure it always returns unicode
        self.assertTrue(isinstance(remove_tags_with_content('no tags'), unicode))
        self.assertTrue(isinstance(remove_tags_with_content('no tags', which_ones=('p',)), unicode))
        self.assertTrue(isinstance(remove_tags_with_content('<p>one tag</p>', which_ones=('p',)), unicode))
        self.assertTrue(isinstance(remove_tags_with_content('<a>link</a>', which_ones=('b',)), unicode))

        # text without tags
        self.assertEqual(remove_tags_with_content(u'no tags'), u'no tags')
        self.assertEqual(remove_tags_with_content(u'no tags', which_ones=('p', 'b',)), u'no tags')

        # text with tags
        self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>'), u'<p>one p tag</p>')
        self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>', which_ones=('p',)), u'')
        self.assertEqual(remove_tags_with_content(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
                         u'<b>not will removed</b>')

    def test_replace_escape_chars(self):
        """Escape characters are replaced by ``replace_by`` (nothing by default)."""
        # make sure it always returns unicode
        self.assertTrue(isinstance(replace_escape_chars('no ec'), unicode))
        self.assertTrue(isinstance(replace_escape_chars('no ec', replace_by='str'), unicode))
        self.assertTrue(isinstance(replace_escape_chars('no ec', which_ones=('\n', '\t',)), unicode))

        # text without escape chars
        self.assertEqual(replace_escape_chars(u'no ec'), u'no ec')
        self.assertEqual(replace_escape_chars(u'no ec', which_ones=('\n',)), u'no ec')

        # text with escape chars
        self.assertEqual(replace_escape_chars(u'escape\n\n'), u'escape')
        self.assertEqual(replace_escape_chars(u'escape\n', which_ones=('\t',)), u'escape\n')
        self.assertEqual(replace_escape_chars(u'escape\tchars\n', which_ones=('\t',)), 'escapechars\n')
        self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=' '), 'escape chars ')
        self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=u'\xa3'), u'escape\xa3chars\xa3')
        # a byte-string replacement (here the UTF-8 bytes of the pound sign)
        # is expected to come back as the corresponding unicode character
        self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by='\xc2\xa3'), u'escape\xa3chars\xa3')

    def test_unquote_markup(self):
        """Entities outside CDATA are converted; CDATA content is kept raw.

        The CDATA markers themselves are dropped and the text they wrapped
        is expected in the output exactly as written, entities included.
        """
        sample_txt1 = u"""<node1>hi, this is sample text with entities: &amp; &copy;
<![CDATA[although this is inside a cdata! &amp; &quot;]]></node1>"""
        sample_txt2 = u'<node2>blah&amp;blah<![CDATA[blahblahblah!&pound;]]>moreblah&lt;&gt;</node2>'
        sample_txt3 = u'something&pound;&amp;more<node3><![CDATA[things, stuff, and such]]>what&quot;ever</node3><node4'

        # make sure it always returns unicode
        self.assertTrue(isinstance(unquote_markup(sample_txt1.encode('latin-1')), unicode))
        self.assertTrue(isinstance(unquote_markup(sample_txt2), unicode))

        self.assertEqual(unquote_markup(sample_txt1), u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1>""")

        self.assertEqual(unquote_markup(sample_txt2), u'<node2>blah&blahblahblahblah!&pound;moreblah<></node2>')

        self.assertEqual(unquote_markup(sample_txt1 + sample_txt2), u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1><node2>blah&blahblahblahblah!&pound;moreblah<></node2>""")

        self.assertEqual(unquote_markup(sample_txt3), u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4')
|