File: HTMLSerializer.py

package info (click to toggle)
firefox-esr 78.15.0esr-1~deb10u1
links: PTS, VCS
area: main
in suites: buster
size: 3,301,296 kB
sloc: cpp: 5,665,905; javascript: 4,798,386; ansic: 2,878,233; python: 977,004; asm: 270,347; xml: 181,456; java: 111,756; sh: 72,926; makefile: 21,819; perl: 13,380; cs: 4,725; yacc: 4,565; objc: 3,026; pascal: 1,787; lex: 1,720; ada: 1,681; exp: 505; php: 436; lisp: 260; awk: 152; ruby: 103; csh: 80; sed: 53; sql: 45
file content (277 lines) | stat: -rw-r--r-- 10,924 bytes
parent folder | download | duplicates (12)
#!/usr/bin/python
# CSS Test Source Manipulation Library
# Initial code by fantasai, joint copyright 2010 W3C and Microsoft
# additions by peter.linss@hp.com copyright 2013 Hewlett-Packard
# Licensed under BSD 3-Clause: <http://www.w3.org/Consortium/Legal/2008/03-bsd-license>

import lxml
from lxml import etree
import htmlentitydefs
import copy


class HTMLSerializer(object):
    """Serialize an lxml element tree as HTML or XHTML markup text.

    The public entry points are serializeHTML() and serializeXHTML(); both
    return the markup accumulated in self.mOutput.  The remaining methods
    are helpers that append to self.mOutput as they walk the tree.
    """

    gXMLns = 'http://www.w3.org/XML/1998/namespace'
    gXMLNSns = 'http://www.w3.org/2000/xmlns/'
    gHTMLns = 'http://www.w3.org/1999/xhtml'

    # Namespace URI -> prefix, used when no mapping is inherited from a
    # parent element.  The XML namespace is bound to the reserved prefix
    # 'xml' (it was previously, incorrectly, mapped to 'xmlns').
    gDefaultNamespaces = {'http://www.w3.org/XML/1998/namespace': 'xml',
                          'http://www.w3.org/2000/xmlns/': 'xmlns',
                          'http://www.w3.org/1999/xlink': 'xlink'}

    # HTML elements serialized without content or an end tag.
    gVoidElements = frozenset((
        'base',
        'command',
        'event-source',
        'link',
        'meta',
        'hr',
        'br',
        'img',
        'embed',
        'param',
        'area',
        'col',
        'input',
        'source'
    ))

    # HTML elements whose text is written unescaped in HTML mode.
    gCDataElements = frozenset((
        'style',
        'script'
    ))

    # Code points always written as character references.  Each range() is
    # wrapped in list() so the concatenation works whether range() returns
    # a list or a lazy sequence.
    gInvisibleChars = frozenset(
        # ASCII control chars
        list(range(0x0, 0x9)) + list(range(0xB, 0xD)) + list(range(0xE, 0x20)) +
        # Other control chars
        # fixed-width spaces, zero-width marks, bidi marks
        list(range(0x2000, 0x2010)) +
        # LS, PS, bidi control codes
        list(range(0x2028, 0x2030)) +
        # nbsp, mathsp, ideosp, WJ, interlinear
        [0x00A0, 0x205F, 0x3000, 0x2060, 0xFFF9, 0xFFFA, 0xFFFB]
    )

    # Code points escaped in text content: the invisible set plus & < >.
    gXMLEscapes = gInvisibleChars | frozenset((ord('&'), ord('<'), ord('>')))

    # The only named entities used in XHTML mode.
    gXMLEntityNames = {'"': 'quot', '&': 'amp', "'": 'apos', '<': 'lt', '>': 'gt'}

    # Doctype name -> declaration text.
    # NOTE(review): the 'svg11' key emits the SVG 1.1 *Basic* doctype;
    # confirm that this is intended.
    gDocTypes = {
        'html': '<!DOCTYPE html>',
        'html4':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">',
        'html4-transitional':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">',
        'html4-frameset':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd">',
        'svg11':
            '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1 Basic//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd">',
        'svg11-tiny':
            '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1 Tiny//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-tiny.dtd">',
        'xhtml10':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">',
        'xhtml10-transitional':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
        'xhtml10-frameset':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">',
        'xhtml11':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">',
        'xhtml-basic11':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML Basic 1.1//EN" "http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd">'
    }

    # Doctype substitutions applied to a document's own doctype when the
    # output flavour differs from the source flavour.
    _gXHTMLDocTypeFor = {
        'html': 'xhtml10',
        'html4': 'xhtml10',
        'html4-transitional': 'xhtml10-transitional',
        'html4-frameset': 'xhtml10-frameset',
    }
    _gHTMLDocTypeFor = {
        'xhtml10': 'html4',
        'xhtml10-transitional': 'html4-transitional',
        'xhtml10-frameset': 'html4-frameset',
        'xhtml11': 'html4',
    }

    def __init__(self):
        """Create a serializer with empty output, in HTML mode."""
        self._reset()

    def _reset(self, xhtml = False):
        """Clear the output buffer and select HTML (default) or XHTML mode."""
        self.mOutput = u''
        self.mXHTML = xhtml

    def _output(self, *args):
        """Append the text form of every argument to self.mOutput."""
        # u'%s' formatting converts each argument to text without naming
        # the unicode() builtin; the pieces are joined once per call.
        self.mOutput += u''.join([u'%s' % (arg,) for arg in args])

    def _entityReference(self, char):
        """Return the character reference for a single character.

        XHTML mode uses only the names in gXMLEntityNames; HTML mode uses
        any name known to htmlentitydefs.  Characters without a name get a
        hexadecimal numeric reference.
        """
        if self.mXHTML:
            name = self.gXMLEntityNames.get(char)
        else:
            name = htmlentitydefs.codepoint2name.get(ord(char))
        return u'&%s;' % name if name else u'&#x%X;' % ord(char)

    def _escape(self, text, escapeChars):
        """Return text with each character in escapeChars escaped.

        text        -- the string to escape
        escapeChars -- container of integer code points to escape

        The input is returned unmodified when nothing needs escaping.
        The text is rewritten in a single pass, so the '&' introduced by
        one replacement is never escaped again by a later one (replacing
        character by character would turn '<&' into '&amp;lt;&amp;' when
        '<' happened to be handled before '&').
        """
        references = {}
        for char in text:
            if char not in references and ord(char) in escapeChars:
                references[char] = self._entityReference(char)
        if not references:
            return text
        return u''.join([references.get(char, char) for char in text])

    def _escapeXML(self, text):
        """Escape markup-significant and invisible characters in text."""
        return self._escape(text, self.gXMLEscapes)

    def _escapeInvisible(self, text):
        """Escape only the invisible characters in text."""
        return self._escape(text, self.gInvisibleChars)

    def _qualifiedName(self, qName, namespacePrefixes):
        """Return 'prefix:local' if qName's namespace has a non-empty
        prefix in namespacePrefixes, otherwise just the local name.

        Unknown namespaces fall back to the bare local name instead of
        raising KeyError.
        """
        prefix = namespacePrefixes.get(qName.namespace) if qName.namespace else None
        if prefix:
            return prefix + ':' + qName.localname
        return qName.localname

    def _serializeElement(self, element, namespacePrefixes):
        """Write element, its attributes, children and tail text.

        element           -- the element node to serialize
        namespacePrefixes -- namespace URI -> prefix mapping inherited from
                             the parent, or a false value to start from
                             gDefaultNamespaces
        """
        qName = etree.QName(element)
        attrs = element.attrib.items()  # in tree order

        if not namespacePrefixes:
            namespacePrefixes = self.gDefaultNamespaces

        if self.mXHTML:
            # Work on a copy so declarations found on this element do not
            # leak into siblings or into the shared defaults.
            namespacePrefixes = copy.copy(namespacePrefixes)
            for attr, value in attrs:
                attrQName = etree.QName(attr)
                if 'xmlns' == attrQName.localname:
                    # default namespace declaration
                    namespacePrefixes[value] = ''
                elif self.gXMLNSns == attrQName.namespace:
                    # xmlns:prefix="uri" declaration
                    namespacePrefixes[value] = attrQName.localname

        # Element names are only prefixed in XHTML output.  The same name
        # is used for the start and end tag so the two always match.
        if self.mXHTML:
            tagName = self._qualifiedName(qName, namespacePrefixes)
        else:
            tagName = qName.localname
        self._output('<', tagName)

        for attr, value in attrs:
            attrQName = etree.QName(attr)
            if (attrQName.namespace == self.gXMLns) and ('lang' == attrQName.localname):
                attr = 'xml:lang' if self.mXHTML else 'lang'
            else:
                attr = self._qualifiedName(attrQName, namespacePrefixes)

            self._output(' ', attr, '=')
            value = value.replace('&', '&amp;')
            if self.mXHTML:
                value = value.replace('<', '&lt;')

            # Prefer single quotes when that avoids escaping double quotes.
            if ('"' in value) and ("'" not in value):
                self._output("'", self._escapeInvisible(value), "'")
            else:
                self._output('"', self._escapeInvisible(value.replace('"', '&quot;')), '"')

        isHTML = (qName.namespace == self.gHTMLns)
        if isHTML and (qName.localname in self.gVoidElements):
            # Void elements have no content and no end tag.
            self._output(' />' if self.mXHTML else '>')
        else:
            self._output('>')

            if element.text is not None:
                if isHTML and (qName.localname in self.gCDataElements) and not self.mXHTML:
                    # script/style content is raw text in HTML
                    self._output(element.text)
                else:
                    self._output(self._escapeXML(element.text))

            for child in element:
                self._serializeNode(child, namespacePrefixes)

            self._output('</', tagName, '>')

        if element.tail is not None:
            self._output(self._escapeXML(element.tail))

    def _serializeEntity(self, entity):
        """Write an entity node's text as-is, followed by its escaped tail."""
        self._output(entity.text)
        if entity.tail is not None:
            self._output(self._escapeXML(entity.tail))

    def _serializePI(self, pi):
        """Write a processing instruction and its tail (XHTML mode only).

        Raises Exception in HTML mode.
        """
        if not self.mXHTML:
            raise Exception("Processing Instructions can't be converted to HTML")
        self._output('<?', pi.target, ' ', pi.text, '?>')
        if pi.tail is not None:
            self._output(self._escapeXML(pi.tail))

    def _serializeComment(self, comment):
        """Write a comment node (text unescaped) and its escaped tail."""
        self._output('<!--', comment.text, '-->') # XXX escape comment?
        if comment.tail is not None:
            self._output(self._escapeXML(comment.tail))

    def _serializeNode(self, node, namespacePrefixes = None):
        """Dispatch node to the serializer matching its node type."""
        if isinstance(node, etree._Entity):
            self._serializeEntity(node)
        elif isinstance(node, etree._ProcessingInstruction):
            self._serializePI(node)
        elif isinstance(node, etree._Comment):
            self._serializeComment(node)
        else:
            self._serializeElement(node, namespacePrefixes)

    def _serializeTree(self, tree):
        """Write the root element together with its preceding and
        following siblings, in document order."""
        root = tree.getroot()
        # itersiblings(preceding=True) is reversed here to restore
        # document order.
        for node in reversed(list(root.itersiblings(preceding = True))):
            self._serializeNode(node)
        self._serializeNode(root)
        for node in root.itersiblings():
            self._serializeNode(node)

    def _serializeDoctype(self, tree, doctype, default):
        """Write a doctype declaration followed by a newline.

        tree    -- the tree being serialized; its docinfo.doctype is used
                   when no explicit doctype is requested
        doctype -- key into gDocTypes, or a false value to derive the
                   doctype from the tree
        default -- key into gDocTypes used when the tree has no doctype

        A tree doctype that matches a known declaration is translated to
        its HTML or XHTML counterpart for the current output mode; an
        unrecognized one is written unchanged.  Raises KeyError if doctype
        or default is not a key of gDocTypes.
        """
        if doctype:
            self._output(self.gDocTypes[doctype], '\n')
            return

        docinfo = getattr(tree, 'docinfo', None)
        declared = docinfo.doctype if docinfo else None
        if not declared:
            self._output(self.gDocTypes[default], '\n')
            return

        # Identify the declared doctype, ignoring case.
        wanted = declared.lower()
        name = None
        for key in self.gDocTypes:
            if self.gDocTypes[key].lower() == wanted:
                name = key
                break

        conversions = self._gXHTMLDocTypeFor if self.mXHTML else self._gHTMLDocTypeFor
        name = conversions.get(name, name)

        if name:
            self._output(self.gDocTypes[name], '\n')
        else:
            self._output(declared, '\n')

    def serializeHTML(self, tree, doctype = None):
        """Return tree serialized as HTML.

        doctype optionally names the gDocTypes entry to emit; otherwise
        the tree's own doctype is used, falling back to 'html'.
        """
        self._reset()
        self._serializeDoctype(tree, doctype, 'html')
        self._serializeTree(tree)
        return self.mOutput

    def serializeXHTML(self, tree, doctype = None):
        """Return tree serialized as XHTML.

        doctype optionally names the gDocTypes entry to emit; otherwise
        the tree's own doctype is used, falling back to 'xhtml11'.
        """
        self._reset(True)
        # XXX '<!xml ...' ??
        self._serializeDoctype(tree, doctype, 'xhtml11')
        self._serializeTree(tree)
        return self.mOutput