File: shtml2html.py

package info (click to toggle)
slurm-wlm 25.11.2-2
links: PTS, VCS
area: main
in suites: forky, sid
size: 46,824 kB
sloc: ansic: 551,323; sh: 10,228; javascript: 6,528; makefile: 4,277; perl: 3,717; python: 559; pascal: 131
file content (146 lines) | stat: -rwxr-xr-x 4,002 bytes
parent folder | download | duplicates (2)
#!/usr/bin/env python3

import re
import sys
import os
import codecs

# pypandoc is optional.  Record whether it could be imported so the markdown
# include handler can fall back to wrapping the raw text in <pre> tags.
try:
    import pypandoc
except ImportError:
    no_pandoc = 1
    print("WARNING: pypandoc not found, will wrap markdown in <pre> tags instead")
else:
    no_pandoc = 0

# Base URL that is prefixed to the output file name in the canonical <link>.
canonical_url = "https://slurm.schedmd.com/"

# <!--#include virtual="FILE"-->
# group(1) is the whole directive, group(2) is FILE.
include_pat = r'(<!--\s*#include\s*virtual\s*=\s*"([^"]+)"\s*-->)'
include_regex = re.compile(include_pat)

# <!--#include markdown="FILE"-->
# group(1) is the whole directive, group(2) is FILE.
markdown_pat = r'(<!--\s*#include\s*markdown\s*=\s*"([^"]+)"\s*-->)'
markdown_regex = re.compile(markdown_pat)

# <!--#canonical--> placeholder.
canonical_pat = r"(<!--\s*#canonical\s*-->)"
canonical_regex = re.compile(canonical_pat)

# <!--#pagetitle--> placeholder.
page_title_pat = r"(<!--\s*#pagetitle\s*-->)"
page_title_regex = re.compile(page_title_pat)

# href="TARGET#FRAGMENT" attribute.
# group(1) is everything up to and including the opening quote, group(2) is
# TARGET, group(3) is the optional "#FRAGMENT" (None when absent) and
# group(4) is the closing quote.
url_pat = r'(\s+href\s*=\s*")([^"#]+)(#[^"]+)?(")'
url_regex = re.compile(url_pat)

# An <h1> heading on a single line, optionally wrapped in <a name="top">.
# The named group "title" captures the leading run of title characters.
# The optional closing anchor is written as a non-capturing group: a bracket
# expression such as [</a>] would match a single character, not the tag.
first_header_pat = r'<[hH]1>\s*(<a name="top">)?\s*(?P<title>[a-zA-Z0-9_ ()\'/-]+)[:]*.*\s*(?:</a>)?\s*</[hH]1>'
first_header_regex = re.compile(first_header_pat)

# @SLURM_VERSION@ placeholder.
version_pat = r"(@SLURM_VERSION@)"
version_regex = re.compile(version_pat)

# Per-file state read by the substitution callbacks below through module
# globals; the conversion loop assigns these for every input file.
title = ""
dirname = ""
newfilename = ""


def include_virtual(matchobj):
    """Expand an ``#include virtual`` directive into the included file's text.

    ``matchobj`` is a regex match whose group(2) holds the path named by the
    directive and whose group(0) is the directive itself.  The path is
    resolved relative to the module-level ``dirname`` when that is non-empty.

    Returns the file contents, or the directive text unchanged when the file
    does not exist.
    """
    included = matchobj.group(2)
    filename = dirname + "/" + included if dirname else included

    if not os.access(filename, os.F_OK):
        # Leave the directive in place so the missing include stays visible.
        return matchobj.group(0)

    # Decode as UTF-8, matching how the .shtml input and .html output are
    # handled, and close the handle deterministically.
    with open(filename, "r", encoding="utf-8") as included_file:
        return included_file.read()


def include_markdown(matchobj):
    """Expand an ``#include markdown`` directive into HTML.

    ``matchobj`` is a regex match whose group(2) holds the path named by the
    directive and whose group(0) is the directive itself.  The path is
    resolved relative to the module-level ``dirname`` when that is non-empty.

    Returns the file converted to HTML by pypandoc, or, when ``no_pandoc`` is
    set, the raw file text wrapped in ``<pre>`` tags.  When the file does not
    exist the directive text is returned unchanged.
    """
    included = matchobj.group(2)
    filename = dirname + "/" + included if dirname else included

    if not os.access(filename, os.F_OK):
        # Leave the directive in place so the missing include stays visible.
        return matchobj.group(0)

    if no_pandoc:
        # Decode as UTF-8, matching how the .shtml input and .html output are
        # handled, and close the handle deterministically.
        with open(filename, "r", encoding="utf-8") as markdown_file:
            return "<pre>\n" + markdown_file.read() + "</pre>"

    return pypandoc.convert_file(filename, "html", format="md")


def canonical_rewrite(matchobj):
    """Build the canonical ``<link>`` element for the current output file.

    The match object is not inspected; the href is the module-level
    ``canonical_url`` followed by the module-level ``newfilename``.
    """
    href = canonical_url + newfilename
    return '<link rel="canonical" href="' + href + '">'


def page_title_rewrite(matchobj):
    """Build the ``<title>`` element for the current page.

    The match object is not inspected; the text comes from the module-level
    ``title``.
    """
    pieces = ("<title>Slurm Workload Manager - ", title, "</title>")
    return "".join(pieces)


def url_rewrite(matchobj):
    """Point an href at the ``.html`` version of a local ``.shtml`` page.

    ``matchobj`` carries four groups: the attribute text up to the opening
    quote, the link target, an optional ``#fragment`` (``None`` when absent)
    and the closing quote.  The target is rewritten only when it ends in
    ``.shtml`` and the file exists, looked up relative to the module-level
    ``dirname`` when that is non-empty.  Otherwise the matched text is
    returned untouched.
    """
    prefix, location, fragment, closing_quote = matchobj.group(1, 2, 3, 4)
    localpath = dirname + "/" + location if dirname else location

    is_local_shtml = location.endswith(".shtml") and os.access(localpath, os.F_OK)
    if not is_local_shtml:
        return matchobj.group(0)

    # Swap the six-character ".shtml" suffix for ".html", keeping any fragment.
    newname = location[:-6] + ".html"
    if fragment is not None:
        newname += fragment
    return prefix + newname + closing_quote


def version_rewrite(matchobj):
    """Return the replacement text for a version placeholder.

    The match object is not inspected; the result is always the module-level
    ``version`` value.
    """
    return version


# Usage: shtml2html.py VERSION FILE.shtml [FILE.shtml ...]
if len(sys.argv) < 2:
    sys.exit("usage: " + sys.argv[0] + " <version> <file.shtml> [<file.shtml> ...]")

# Text returned by version_rewrite() for each version placeholder.
version = sys.argv[1]

# Only files on the command line with the .shtml extension are converted;
# anything else is silently skipped.
files = [f for f in sys.argv[2:] if f.endswith(".shtml")]

for filename in files:
    # dirname and newfilename are module globals read by the rewrite
    # callbacks.  The output file is named after the input's base name and is
    # opened without a directory component.
    dirname, basefilename = os.path.split(filename)
    newfilename = basefilename[:-6] + ".html"
    print("Converting", filename, "->", newfilename)

    # Read the input once.  newline="" keeps line endings untranslated.
    with open(filename, "r", encoding="utf-8", newline="") as shtml:
        lines = shtml.readlines()

    # First pass: take the page title from the first <h1> line.  Reset it
    # beforehand so a page without a heading does not inherit the title of
    # the previously converted file.
    title = ""
    for line in lines:
        result = first_header_regex.match(line)
        if result:
            title = result.group("title")
            break

    # Second pass: apply every substitution to each line and write it out.
    with open(newfilename, "w", encoding="utf-8", newline="") as html:
        for line in lines:
            line = include_regex.sub(include_virtual, line)
            line = markdown_regex.sub(include_markdown, line)
            line = page_title_regex.sub(page_title_rewrite, line)
            line = version_regex.sub(version_rewrite, line)
            line = canonical_regex.sub(canonical_rewrite, line)
            line = url_regex.sub(url_rewrite, line)
            html.write(line)