File: html5lib.py

package info (click to toggle)
python-django-compressor 2.0-1~bpo8%2B1
  • links: PTS, VCS
  • area: main
  • in suites: jessie-backports
  • size: 896 kB
  • sloc: python: 3,917; makefile: 152
file content (59 lines) | stat: -rw-r--r-- 1,919 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from __future__ import absolute_import
from django.core.exceptions import ImproperlyConfigured
from django.utils.encoding import smart_text

from compressor.exceptions import ParserError
from compressor.parser import ParserBase
from compressor.utils.decorators import cached_property


class Html5LibParser(ParserBase):
    """Parser backend built on html5lib with its "etree" tree builder.

    html5lib is imported when an instance is created (not when this module
    is imported) and the module object is kept on ``self.html5lib`` for the
    other methods to use.
    """

    def __init__(self, content):
        """Initialise the base parser with ``content`` and import html5lib.

        An ``ImportError`` from the html5lib import propagates unchanged.
        """
        super(Html5LibParser, self).__init__(content)
        # Imported here so that merely importing this module does not
        # require html5lib to be installed.
        import html5lib
        self.html5lib = html5lib

    def _serialize(self, elem):
        """Serialize ``elem`` with html5lib's serializer for etree trees.

        Attribute values are quoted, optional tags are kept and void
        elements get a trailing solidus.
        """
        return self.html5lib.serialize(
            elem, tree="etree", quote_attr_values=True,
            omit_optional_tags=False, use_trailing_solidus=True,
        )

    def _find(self, *names):
        """Yield every node obtained by iterating ``self.html`` whose tag
        is one of ``names``.

        NOTE(review): iterating the parsed fragment presumably yields only
        its top-level nodes, so nested elements are not searched -- confirm.
        """
        for elem in self.html:
            if elem.tag in names:
                yield elem

    @cached_property
    def html(self):
        """Parse ``self.content`` as an HTML fragment into an etree tree.

        Raises:
            ImproperlyConfigured: if parsing raises ``ImportError``.
            ParserError: if parsing raises any other ``Exception``.
        """
        try:
            return self.html5lib.parseFragment(self.content, treebuilder="etree")
        except ImportError as err:
            raise ImproperlyConfigured("Error while importing html5lib: %s" % err)
        except Exception as err:
            raise ParserError("Error while initializing Parser: %s" % err)

    def css_elems(self):
        """Return an iterator over the XHTML ``link`` and ``style`` elements."""
        return self._find('{http://www.w3.org/1999/xhtml}link',
                          '{http://www.w3.org/1999/xhtml}style')

    def js_elems(self):
        """Return an iterator over the XHTML ``script`` elements."""
        return self._find('{http://www.w3.org/1999/xhtml}script')

    def elem_attribs(self, elem):
        """Return the attribute mapping of ``elem``."""
        return elem.attrib

    def elem_content(self, elem):
        """Return the text of ``elem`` as a text string.

        ElementTree stores ``None`` in ``text`` for an element that has no
        text. Passing that straight to ``smart_text`` would produce the
        literal string ``"None"``, so an empty string is returned instead.
        """
        text = elem.text
        if text is None:
            # Still routed through smart_text so the result is a text
            # (unicode) string on every supported Python version.
            return smart_text('')
        return smart_text(text)

    def elem_name(self, elem):
        """Return the tag name of ``elem`` without its ``{namespace}`` prefix.

        A tag that contains no ``}`` is returned unchanged.
        """
        if '}' in elem.tag:
            return elem.tag.split('}')[1]
        return elem.tag

    def elem_str(self, elem):
        """Return ``elem`` serialized to HTML as a text string."""
        # This method serializes HTML in a way that does not pass all tests.
        # However, this method is only called in tests anyway, so it doesn't
        # really matter.
        return smart_text(self._serialize(elem))