File: test_utils_markup.py

package info (click to toggle)
python-scrapy 0.8-3
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 2,904 kB
  • ctags: 2,981
  • sloc: python: 15,349; xml: 199; makefile: 68; sql: 64; sh: 34
file content (144 lines) | stat: -rw-r--r-- 7,969 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# -*- coding: utf-8 -*-
import unittest

from scrapy.utils.markup import remove_entities, replace_tags, remove_comments
from scrapy.utils.markup import remove_tags_with_content, replace_escape_chars, remove_tags
from scrapy.utils.markup import unquote_markup

class UtilsMarkupTest(unittest.TestCase):

    def test_remove_entities(self):
        # make sure it always return uncode
        assert isinstance(remove_entities('no entities'), unicode)
        assert isinstance(remove_entities('Price: £100!'),  unicode)

        # regular conversions
        self.assertEqual(remove_entities(u'As low as £100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('As low as £100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold ½oz solid crucifix pendant'),
                         u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
        # keep some entities
        self.assertEqual(remove_entities('<b>Low &lt; High &amp; Medium &pound; six</b>', keep=['lt', 'amp']),
                         u'<b>Low &lt; High &amp; Medium \xa3 six</b>')

        # illegal entities
        self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
                         u'a < b &illegal; c &#12345678; six')
        self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
                         u'a < b  c  six')
        self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')

        # check browser hack for numeric character references in the 80-9F range
        self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')

        # encoding
        self.assertEqual(remove_entities('x\x99&#153;&#8482;y', encoding='cp1252'), \
                         u'x\u2122\u2122\u2122y')

    def test_replace_tags(self):
        # make sure it always return uncode
        assert isinstance(replace_tags('no entities'), unicode)

        self.assertEqual(replace_tags(u'This text contains <a>some tag</a>'),
                         u'This text contains some tag')

        self.assertEqual(replace_tags('This text is very im<b>port</b>ant', ' '),
                         u'This text is very im port ant')

        # multiline tags
        self.assertEqual(replace_tags('Click <a class="one"\r\n href="url">here</a>'),
                         u'Click here')

    def test_remove_comments(self):
        # make sure it always return unicode
        assert isinstance(remove_comments('without comments'), unicode)
        assert isinstance(remove_comments('<!-- with comments -->'), unicode)

        # text without comments 
        self.assertEqual(remove_comments(u'text without comments'), u'text without comments')

        # text with comments
        self.assertEqual(remove_comments(u'<!--text with comments-->'), u'')
        self.assertEqual(remove_comments(u'Hello<!--World-->'),u'Hello')

    def test_remove_tags(self):
        # make sure it always return unicode
        assert isinstance(remove_tags('no tags'), unicode)
        assert isinstance(remove_tags('no tags', which_ones=('p',)), unicode)
        assert isinstance(remove_tags('<p>one tag</p>'), unicode)
        assert isinstance(remove_tags('<p>one tag</p>', which_ones=('p')), unicode)
        assert isinstance(remove_tags('<a>link</a>', which_ones=('b',)), unicode)

        # text without tags
        self.assertEqual(remove_tags(u'no tags'), u'no tags')
        self.assertEqual(remove_tags(u'no tags', which_ones=('p','b',)), u'no tags')

        # text with tags
        self.assertEqual(remove_tags(u'<p>one p tag</p>'), u'one p tag')
        self.assertEqual(remove_tags(u'<p>one p tag</p>', which_ones=('b',)), u'<p>one p tag</p>')

        self.assertEqual(remove_tags(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
                         u'<b>not will removed</b>i will removed')

        # text with tags and attributes
        self.assertEqual(remove_tags(u'<p align="center" class="one">texty</p>'), u'texty')
        self.assertEqual(remove_tags(u'<p align="center" class="one">texty</p>', which_ones=('b',)),
                         u'<p align="center" class="one">texty</p>')

    def test_remove_tags_with_content(self):
        # make sure it always return unicode
        assert isinstance(remove_tags_with_content('no tags'), unicode)
        assert isinstance(remove_tags_with_content('no tags', which_ones=('p',)), unicode)
        assert isinstance(remove_tags_with_content('<p>one tag</p>', which_ones=('p',)), unicode)
        assert isinstance(remove_tags_with_content('<a>link</a>', which_ones=('b',)), unicode)

        # text without tags
        self.assertEqual(remove_tags_with_content(u'no tags'), u'no tags')
        self.assertEqual(remove_tags_with_content(u'no tags', which_ones=('p','b',)), u'no tags')

        # text with tags
        self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>'), u'<p>one p tag</p>')
        self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>', which_ones=('p',)), u'')

        self.assertEqual(remove_tags_with_content(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
                         u'<b>not will removed</b>')

    def test_replace_escape_chars(self):
        # make sure it always return unicode
        assert isinstance(replace_escape_chars('no ec'), unicode)
        assert isinstance(replace_escape_chars('no ec', replace_by='str'), unicode)
        assert isinstance(replace_escape_chars('no ec', which_ones=('\n','\t',)), unicode)

        # text without escape chars
        self.assertEqual(replace_escape_chars(u'no ec'), u'no ec')
        self.assertEqual(replace_escape_chars(u'no ec', which_ones=('\n',)), u'no ec')

        # text with escape chars
        self.assertEqual(replace_escape_chars(u'escape\n\n'), u'escape')
        self.assertEqual(replace_escape_chars(u'escape\n', which_ones=('\t',)), u'escape\n')
        self.assertEqual(replace_escape_chars(u'escape\tchars\n', which_ones=('\t')), 'escapechars\n')
        self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=' '), 'escape chars ')
        self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by=u'\xa3'), u'escape\xa3chars\xa3')
        self.assertEqual(replace_escape_chars(u'escape\tchars\n', replace_by='\xc2\xa3'), u'escape\xa3chars\xa3')

    def test_unquote_markup(self):
        sample_txt1 = u"""<node1>hi, this is sample text with entities: &amp; &copy;
<![CDATA[although this is inside a cdata! &amp; &quot;]]></node1>"""
        sample_txt2 = u'<node2>blah&amp;blah<![CDATA[blahblahblah!&pound;]]>moreblah&lt;&gt;</node2>'
        sample_txt3 = u'something&pound;&amp;more<node3><![CDATA[things, stuff, and such]]>what&quot;ever</node3><node4'

        # make sure it always return unicode
        assert isinstance(unquote_markup(sample_txt1.encode('latin-1')), unicode)
        assert isinstance(unquote_markup(sample_txt2), unicode)

        self.assertEqual(unquote_markup(sample_txt1), u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1>""")

        self.assertEqual(unquote_markup(sample_txt2), u'<node2>blah&blahblahblahblah!&pound;moreblah<></node2>')

        self.assertEqual(unquote_markup(sample_txt1 + sample_txt2), u"""<node1>hi, this is sample text with entities: & \xa9
although this is inside a cdata! &amp; &quot;</node1><node2>blah&blahblahblahblah!&pound;moreblah<></node2>""")

        self.assertEqual(unquote_markup(sample_txt3), u'something\xa3&more<node3>things, stuff, and suchwhat"ever</node3><node4')