File: specextract.py

package info (click to toggle)

firefox-esr 78.15.0esr-1~deb11u1

links: PTS, VCS
area: main
in suites: bullseye
size: 3,301,156 kB
sloc: cpp: 5,665,905; javascript: 4,798,386; ansic: 2,878,233; python: 977,004; asm: 270,347; xml: 181,456; java: 111,756; sh: 72,926; makefile: 21,819; perl: 13,380; cs: 4,725; yacc: 4,565; objc: 3,026; pascal: 1,787; lex: 1,720; ada: 1,681; exp: 505; php: 436; lisp: 260; awk: 152; ruby: 103; csh: 80; sed: 53; sql: 45

file content (68 lines) | stat: -rw-r--r-- 2,704 bytes

parent folder | download | duplicates (8)

import html5lib
import html5lib.treebuilders.dom
import re

# Expected use:
#   curl --compressed https://html.spec.whatwg.org/multipage/canvas.html >current-work
#   python specextract.py
#
# Generates current-work-canvas.xhtml, for use by gentest.py to create the annotated spec document

def _make_absolute(url):
    """Return *url* as an absolute URL rooted at the WHATWG multipage spec.

    URLs that already carry a scheme (``http:``, ``mailto:``, ...) and
    fragment-only references (``#foo``) are returned unchanged, as is the
    empty string.  Protocol-relative URLs (``//host/path``) get ``https:``
    prepended, site-absolute paths are joined to the spec host, and
    everything else is treated as relative to the ``/multipage/`` directory.
    """
    if not url or re.match(r'(\w+:|#)', url):
        return url
    if url.startswith('//'):
        # Protocol-relative: only the scheme is missing, not the host.
        return 'https:' + url
    if url.startswith('/'):
        return 'https://html.spec.whatwg.org' + url
    return 'https://html.spec.whatwg.org/multipage/' + url


def _absolutize_attribute(doc, tag_name, attr_name):
    """Rewrite *attr_name* on every *tag_name* element of *doc* to an absolute URL.

    Elements that do not have the attribute (e.g. inline ``<script>`` or an
    ``<a>`` without ``href``) are left untouched rather than gaining an
    empty attribute.
    """
    for element in doc.getElementsByTagName(tag_name):
        if element.hasAttribute(attr_name):
            element.setAttribute(attr_name, _make_absolute(element.getAttribute(attr_name)))


def extract():
    """Cut the canvas section out of 'current-work' and save it as XHTML.

    Reads the HTML file 'current-work' from the current directory, drops the
    <script> children of <head>, keeps only the <header> and the nodes from
    the 'the-canvas-element' heading up to the following <nav>, appends a
    copyright paragraph to the header, makes script/iframe/img/a URLs
    absolute, and writes the result to 'current-work-canvas.xhtml'.

    Raises IndexError if the document has no <head>, no <header>, or no
    <h4 id="the-canvas-element">.
    """
    parser = html5lib.html5parser.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    # Read bytes so that the transport_encoding hint is what decodes the input,
    # and close the file as soon as parsing is done.
    with open('current-work', 'rb') as source:
        doc = parser.parse(source, transport_encoding='utf-8')

    head = doc.getElementsByTagName('head')[0]
    # Iterate over a snapshot: childNodes is live, so removing while iterating
    # it would skip siblings.  nodeName (unlike tagName) exists on text and
    # comment nodes too.
    for n in list(head.childNodes):
        if n.nodeName == 'script':
            head.removeChild(n)

    header = doc.getElementsByTagName('header')[0]
    #thecanvas = doc.getElementById('the-canvas') # doesn't work (?!)
    thecanvas = [ n for n in doc.getElementsByTagName('h4') if n.getAttribute('id') == 'the-canvas-element' ][0]

    # Add copyright from https://html.spec.whatwg.org/multipage/acknowledgements.html#acknowledgments
    copy = doc.createElement('p')
    copy.setAttribute('class', 'copyright')
    copy.appendChild(doc.createTextNode(u'Parts of this specification are \xA9 Copyright 2004-2014 Apple Inc., Mozilla Foundation, and Opera Software ASA. You are granted a license to use, reproduce and create derivative works of this document.'))
    header.appendChild(copy)

    # Keep the header, the canvas heading, and every following sibling up to
    # (not including) the next <nav>; stop cleanly if there is no <nav>.
    keep = [header, thecanvas]
    node = thecanvas.nextSibling
    while node is not None and node.nodeName != 'nav':
        keep.append(node)
        node = node.nextSibling
    p = thecanvas.parentNode
    for n in p.childNodes[:]:
        if n not in keep:
            p.removeChild(n)

    # Trim the header down to its first three and last four children (the
    # last of which is the copyright paragraph appended above).  The slice is
    # a copy, so removing while looping is safe.
    for n in header.childNodes[3:-4]:
        header.removeChild(n)

    # Fix relative URLs
    _absolutize_attribute(doc, 'script', 'src')
    _absolutize_attribute(doc, 'iframe', 'src')
    _absolutize_attribute(doc, 'img', 'src')
    _absolutize_attribute(doc, 'a', 'href')

    # Convert to XHTML, because it's quicker to re-parse than HTML5
    doc.documentElement.setAttribute('xmlns', 'http://www.w3.org/1999/xhtml')
    # Remove the DOCTYPE, but only if the first child really is one so the
    # root element can never be dropped by mistake.
    first = doc.firstChild
    if first is not None and first.nodeType == first.DOCUMENT_TYPE_NODE:
        doc.removeChild(first)

    # toxml(encoding=...) yields encoded bytes, so the file must be binary.
    with open('current-work-canvas.xhtml', 'wb') as out:
        out.write(doc.toxml(encoding = 'UTF-8'))

# Run the extraction as soon as the file is executed; note that this also
# happens on import, because the call is not guarded by __name__.
extract()