File: adapt.py

package info (click to toggle)
html5-parser 0.4.12%2Bds-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,748 kB
  • sloc: ansic: 32,397; python: 1,732; makefile: 6
file content (107 lines) | stat: -rw-r--r-- 4,326 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import importlib
import sys

from html5_parser import parse

from . import SVG, XHTML, XLINK, TestCase

# Text of the trailing comment node; it deliberately contains ``--`` and ends
# with ``-`` so the tests can check how each tree builder reports it.
COMMENT = ' A -- comment -'
# Doctype declaration placed at the top of the fixture document.
DOCTYPE = '<!DOCTYPE html>'
# Fixture document shared by all tests. The literal carries the placeholder
# words DOCTYPE and COMMENT, which are substituted with the constants above.
# Neither substituted value contains the other placeholder, so the order of
# the two replacements does not affect the result.
HTML = (
    '''
DOCTYPE
<html lang="en" xml:lang="en">
<head><script>a < & " b</script><title>title</title></head>
<body>
<p>A <span>test</span> of text and tail
<p><svg viewbox="v"><image xlink:href="h">
<p xml:lang="de">
</body>
<!--COMMENT-->
</html>
'''
    .replace('DOCTYPE', DOCTYPE)
    .replace('COMMENT', COMMENT)
)


class AdaptTest(TestCase):
    """Check the trees produced by the different ``treebuilder`` options.

    Every test parses the module-level ``HTML`` fixture and inspects the
    resulting tree: root tag, attributes, script text, paragraph text and
    tail, the SVG/XLink attributes and the trailing comment.

    ``self.ae`` is used for all equality checks (presumably an
    ``assertEqual`` alias provided by the package ``TestCase`` -- confirm
    there).
    """

    def test_etree(self):
        """Parse with ``treebuilder='etree'`` and inspect the ElementTree."""
        from xml.etree.ElementTree import tostring
        root = parse(HTML, treebuilder='etree', namespace_elements=True)
        self.ae(root.tag, '{%s}html' % XHTML)
        # Prefix map for the namespaced find() lookups below.
        ns = {'h': XHTML, 's': SVG, 'x': XLINK}
        # xml:lang is expected to show up as the plain attribute "xml_lang".
        self.ae(root.attrib, {'lang': 'en', 'xml_lang': 'en'})
        self.ae(root.find('./h:head/h:script', ns).text, 'a < & " b')
        self.ae(root.find('./h:body', ns)[-1].attrib, {'xml_lang': 'de'})
        self.ae(
            tostring(root.find('h:body/h:p', ns), method='text').decode('ascii'),
            'A test of text and tail\n')
        svg = root.find('./h:body/h:p/s:svg', ns)
        # The source markup spells the attribute "viewbox"; the parsed tree
        # is expected to carry the SVG spelling "viewBox".
        self.ae(svg.attrib, {'viewBox': 'v'})
        img = svg[0]
        # xlink:href is expected as a namespace-qualified attribute name.
        self.ae(img.attrib, {'{%s}href' % XLINK: 'h'})
        # NOTE(review): the comment serialization is only checked on
        # Python 3; the reason is not visible in this file.
        if sys.version_info.major > 2:
            self.assertIn('<!--' + COMMENT + '-->', tostring(root).decode('ascii'))

    def test_dom(self):
        """Parse with ``treebuilder='dom'`` and inspect the DOM tree."""
        root = parse(HTML, treebuilder='dom', namespace_elements=True)
        doc = root.ownerDocument
        self.ae(doc.doctype, DOCTYPE)
        self.ae(root.tagName, 'html')
        self.ae(
            dict(root.attributes.itemsNS()),
            {
                ('xmlns', 'xmlns'): 'http://www.w3.org/1999/xhtml',
                ('xmlns', 'xlink'): 'http://www.w3.org/1999/xlink',
                (None, 'xml_lang'): 'en',
                (None, 'lang'): 'en',
            })
        script = doc.getElementsByTagName('script')[0]
        self.ae(script.firstChild.nodeValue, 'a < & " b')
        p = doc.getElementsByTagName('p')[0]
        self.ae(p.toxml(), '<p>A <span>test</span> of text and tail\n</p>')
        p = doc.getElementsByTagName('p')[-1]
        self.ae(dict(p.attributes.itemsNS()), {(None, 'xml_lang'): 'de'})
        svg = doc.getElementsByTagName('svg')[0]
        self.ae(
            dict(svg.attributes.itemsNS()),
            {
                (None, 'viewBox'): 'v',
                ('xmlns', 'xmlns'): 'http://www.w3.org/2000/svg',
            })
        self.ae(dict(svg.firstChild.attributes.itemsNS()), {(XLINK, 'href'): 'h'})
        # The DOM comment is expected to contain an em dash (U+2014) wherever
        # the source comment had "--".
        self.ae(root.lastChild.nodeValue, COMMENT.replace('--', '\u2014'))

    def test_soup(self):
        """Run the soup checks against every importable BeautifulSoup module.

        Tries to import ``bs4`` and ``BeautifulSoup``; the test is skipped
        when neither is available. Each module found is installed with
        ``set_soup_module`` before ``do_soup_test`` runs.
        """
        from html5_parser.soup import set_soup_module
        soups = []
        for soup in 'bs4 BeautifulSoup'.split():
            try:
                soups.append((soup, importlib.import_module(soup)))
            except ImportError:
                # A missing flavour is fine; only test what is installed.
                pass
        if not soups:
            self.skipTest('No BeautifulSoup module found')
        try:
            for soup_name, soup in soups:
                set_soup_module(soup)
                self.do_soup_test(soup_name)
        finally:
            # Reset the override even when an assertion above fails, so a
            # failing run cannot leave a soup module installed for later
            # tests.
            set_soup_module(None)

    def do_soup_test(self, soup_name):
        """Parse with ``treebuilder='soup'`` and inspect the resulting soup.

        ``soup_name`` is the name of the module currently installed
        (``'bs4'`` or ``'BeautifulSoup'``); it selects whether the doctype
        serialization is checked.
        """
        root = parse(HTML, treebuilder='soup')
        soup = root.parent
        if soup_name != 'BeautifulSoup':
            # In BS 4+, Doctype instances only store their contents. They get
            # formatted as `<!DOCTYPE {}>` when the whole soup is serialized.
            parsed_doctype = str(soup).split('\n', 1)[0]
            self.ae(DOCTYPE, parsed_doctype)
        self.ae(root.name, 'html')
        # Unlike the etree/dom builders, prefixed attribute names are
        # expected verbatim here ("xml:lang", "xlink:href").
        self.ae(dict(root.attrs), {'xml:lang': 'en', 'lang': 'en'})
        self.ae(dict(root.body.contents[-1].attrs), {'xml:lang': 'de'})
        self.ae(root.head.script.string, 'a < & " b')
        self.ae(str(root.find('p')), '<p>A <span>test</span> of text and tail\n</p>')
        svg = root.find('svg')
        self.ae(dict(svg.attrs), {'viewBox': 'v'})
        self.ae(dict(svg.contents[0].attrs), {'xlink:href': 'h'})
        self.ae(COMMENT, root.contents[-1].string)