#!/usr/bin/env python3
# doc_postprocess.py [-h|--help] <pattern>...
# Post-process the Doxygen-generated HTML files matching the given filename patterns.
import os
import sys
import re
import glob
# Substitutions with regular expressions are somewhat slow in Python 3.9.5.
# Use str.replace() rather than re.sub() where possible.
# [search string, compiled regular expression or None, substitution string, count]
# Substitutions applied to lines that contain '<a class="el"'.
# Each entry is [search string, compiled regex or None, replacement, count]:
# the search string is a cheap substring pre-filter, the regex (when present)
# performs the substitution, and count limits the number of replacements
# (0 is meant as "no limit").
# The input is HTML, where '&' and '<' in text are escaped, so the patterns
# spell them as the character references '&amp;' and '&lt;'. '&#160;' is the
# numeric reference for a no-break space.
class_el_patterns = [
    # return value
    [' &amp; ', re.compile(r' &amp; *'), '&amp;&#160;', 1],
    [' * ', re.compile(r' \* *'), '*&#160;', 1],
    # parameters
    [' &amp;', None, '&amp;', 0],
    ['&amp;', re.compile(r'&amp;\b'), '&amp; ', 0],
    [' *', None, '*', 0],
    ['*', re.compile(r'\*\b'), '* ', 0],
    # templates
    ['template&lt;', re.compile(r'\btemplate&lt;'), 'template &lt;', 1],
]
# Substitutions applied to lines that contain '<td class="md"' or
# '<td class="mdname"'.
# Each entry is [search string, compiled regex or None, replacement, count]:
# the search string is a cheap substring pre-filter, the regex (when present)
# performs the substitution, and count limits the number of replacements
# (0 is meant as "no limit").
# The input is HTML, so '&', '<' and the no-break space are matched in their
# escaped forms '&amp;', '&lt;' and '&nbsp;'; the no-break space is written
# back as the numeric reference '&#160;'.
# NOTE(review): which blanks are no-break spaces was reconstructed; confirm
# against the project's reference copy of this table.
class_md_patterns = [
    # left parenthesis
    ['(&nbsp;', re.compile(r'\(&nbsp; *'), '(', 1],
    # return value
    [' &amp; ', None, '&amp; ', 0],
    [' * ', None, '* ', 0],
    # parameters
    [' &amp;&nbsp;', re.compile(r' &amp;&nbsp; *'), '&amp;&#160;', 0],
    [' *&nbsp;', re.compile(r' \*&nbsp; *'), '*&#160;', 0],
    # templates
    ['template&lt;', re.compile(r'\btemplate&lt;'), 'template &lt;', 1],
]
# Substitutions applied to every line that matched neither the 'el' link
# test nor the 'md'/'mdname' cell test.
# Each entry is [search string, compiled regex or None, replacement, count].
# '<' is escaped in HTML text, so the template opener is matched as
# 'template&lt;'.
else_patterns = [
    # template decls: insert a space after 'template' at the start of a line,
    # optionally preceded by a heading tag such as <h2>.
    ['template&lt;', re.compile(r'^(<h\d>|)template&lt;'), '\\1template &lt;', 1],
]
# Substitutions applied to every line, after the line-type specific ones.
# Each entry is [search string, compiled regex or None, replacement, count];
# count 0 is meant as "no limit".
all_lines_patterns = [
    # For some reason, some versions of Doxygen output the full path to
    # referenced tag files. This is bad since it breaks doc_install.py,
    # and also because it leaks local path names into source tarballs.
    # Thus, strip the directory prefix here.
    [' doxygen="', re.compile(r' doxygen="[^":]*/([^":]+\.tag):'), ' doxygen="\\1:', 0],
    # Rewrite named character references as numeric ones. An entry whose
    # search string equals its replacement would be a no-op, so both sides
    # must be spelled as references, never as the literal characters.
    ['&copy;', None, '&#169;', 0],
    ['&mdash;', None, '&#8212;', 0],
    ['&ndash;', None, '&#8211;', 0],
    # A no-break space also swallows the ordinary spaces around it.
    ['&nbsp;', re.compile(r' *&nbsp; *'), '&#160;', 0],
]
def _apply_substitutions(line, substitutions):
    """Return line after applying every entry of a substitution table, in order.

    Each entry is [search string, compiled regex or None, replacement, count].
    A count of 0 means "replace every occurrence" for both kinds of entry.
    """
    for search, regex, replacement, count in substitutions:
        # Cheap substring test first: regular expression substitutions are
        # comparatively slow and most lines match nothing.
        if search not in line:
            continue
        if regex:
            line = regex.sub(replacement, line, count=count)
        else:
            # str.replace() treats count=0 as "replace nothing", whereas
            # re.sub() treats it as "replace all". Map 0 to -1 so that
            # plain-string entries with count 0 are actually applied.
            line = line.replace(search, replacement, count if count else -1)
    return line


def _postprocess_line(line):
    """Return line after the line-type specific and the all-lines substitutions."""
    if '<a class="el"' in line:
        line = _apply_substitutions(line, class_el_patterns)
    elif ('<td class="md"' in line) or ('<td class="mdname"' in line):
        line = _apply_substitutions(line, class_md_patterns)
    else:
        line = _apply_substitutions(line, else_patterns)
    return _apply_substitutions(line, all_lines_patterns)


def doc_postprocess(patterns):
    """Post-process, in place, every file matching the given glob patterns.

    patterns may be a list or tuple of glob patterns, a single pattern, or
    None (treated as no patterns). Each matching file is read completely,
    every line is run through the module-level substitution tables
    (class_el_patterns, class_md_patterns or else_patterns depending on the
    line, then all_lines_patterns), and the result is written back to the
    same path.

    Returns 0.
    """
    if not isinstance(patterns, (list, tuple)):
        patterns = [] if patterns is None else [patterns]

    filepaths = []
    for pattern in patterns:
        filepaths += glob.glob(pattern)

    for filepath in filepaths:
        # Assume that the file is UTF-8 encoded.
        # With errors='surrogateescape', bytes in the range 0x80..0xff that are
        # not valid UTF-8 are decoded to the lone surrogates U+DC80..U+DCFF and
        # turned back into the original bytes when the file is rewritten.
        with open(filepath, mode='r', encoding='utf-8', errors='surrogateescape') as file:
            # Read the whole file into a buffer, a list with one line per element.
            buf = file.readlines()

        buf = [_postprocess_line(line) for line in buf]

        with open(filepath, mode='w', encoding='utf-8', errors='surrogateescape') as file:
            # Write the whole buffer back into the file.
            file.writelines(buf)

    return 0
# ----- Main -----
# Command-line entry point: every positional argument is a filename pattern.
if __name__ == '__main__':
    import argparse

    arg_parser = argparse.ArgumentParser(
        description='Post-process the Doxygen-generated HTML files matching pattern.')
    arg_parser.add_argument('patterns', nargs='*', metavar='pattern', help='filename pattern')
    options = arg_parser.parse_args()

    # Echo the patterns that were received before processing them.
    print(options.patterns)

    # The return value of doc_postprocess() becomes the process exit status.
    exit_status = doc_postprocess(options.patterns)
    sys.exit(exit_status)