"""Collect HTML element IDs from Sphinx HTML output and report removed ones.

The ``collect`` command stores the IDs found in a documentation build; the
``check`` command compares two such archives and lists IDs that disappeared.
"""
from compression import gzip
import concurrent.futures
from pathlib import Path
import html.parser
import functools
import argparse
import json
import sys
import re
IGNORED_ID_RE = re.compile(
r"""
index-\d+
| id\d+
| [_a-z]+_\d+
""",
re.VERBOSE,
)
class IDGatherer(html.parser.HTMLParser):
def __init__(self, ids):
super().__init__()
self.__ids = ids
def handle_starttag(self, tag, attrs):
for name, value in attrs:
if name == 'id':
if not IGNORED_ID_RE.fullmatch(value):
self.__ids.add(value)
def get_ids_from_file(path):
    """Return the set of non-ignored HTML IDs found in the file at *path*.

    The file is opened as UTF-8 text and handed to an ``IDGatherer`` in
    pieces of up to 4096 characters.
    """
    found = set()
    parser = IDGatherer(found)
    with path.open(encoding='utf-8') as stream:
        # ``read`` returns '' at end of file, which ends the iteration.
        for piece in iter(lambda: stream.read(4096), ''):
            parser.feed(piece)
    return found
def gather_ids(htmldir, *, verbose_print):
    """Collect the page-specific HTML IDs of every page under *htmldir*.

    Pages located in a ``_static`` or ``whatsnew`` directory are skipped.
    IDs that occur on every remaining page are treated as common and removed
    from the result.  Note that with a single page every ID is therefore
    "common" and that page's list is empty.

    Parameters:
        htmldir: path object of the HTML output directory; it must contain
            an ``objects.inv`` file.
        verbose_print: callable used to report progress messages.

    Returns:
        A dict mapping each page's path relative to *htmldir* (as ``str``)
        to the sorted list of its IDs.  Empty if there are no pages.

    Raises:
        ValueError: if *htmldir* has no ``objects.inv`` file.
    """
    if not htmldir.joinpath('objects.inv').exists():
        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')

    pages = {}
    for path in htmldir.glob('**/*.html'):
        relative_path = path.relative_to(htmldir)
        if '_static' in relative_path.parts:
            continue
        if 'whatsnew' in relative_path.parts:
            continue
        pages[relative_path] = path
    if not pages:
        # set.intersection() below needs at least one set to work with.
        return {}

    # sys._is_gil_enabled is a function: it has to be *called*, otherwise
    # the test is always true and the thread pool is never selected.
    if sys._is_gil_enabled():
        pool = concurrent.futures.ProcessPoolExecutor()
    else:
        pool = concurrent.futures.ThreadPoolExecutor()

    ids_by_page = {}
    # The context manager shuts the pool down even if a task fails.
    with pool:
        tasks = {
            relative_path: pool.submit(get_ids_from_file, path=path)
            for relative_path, path in pages.items()
        }
        for relative_path, future in tasks.items():
            verbose_print(relative_path)
            ids = future.result()
            ids_by_page[str(relative_path)] = ids
            verbose_print(f' - {len(ids)} ids found')

    common = set.intersection(*ids_by_page.values())
    verbose_print(f'Filtering out {len(common)} common ids')
    return {
        key: sorted(page_ids - common)
        for key, page_ids in ids_by_page.items()
    }
def do_check(baseline, checked, excluded, *, verbose_print):
    """Report IDs present in *baseline* but absent from *checked*.

    Both *baseline* and *checked* map page names to collections of IDs.
    Pages are visited in sorted order.  A baseline page that is absent from
    *checked* is reported as missing.  Otherwise each removed ID is printed
    as ``page: id`` unless it fully matches ``IGNORED_ID_RE`` or the
    ``(page, id)`` pair is in *excluded*.  Every report group is followed by
    an empty line.

    *verbose_print* is accepted but not used by this function.

    Returns True when nothing was reported, False otherwise.
    """
    all_present = True
    for page in sorted(baseline):
        if page not in checked:
            all_present = False
            print(f'{page}: (page missing)')
            print()
            continue

        removed = set(baseline[page]).difference(checked[page])
        reportable = sorted(
            html_id
            for html_id in removed
            if IGNORED_ID_RE.fullmatch(html_id) is None
            and (page, html_id) not in excluded
        )
        if not reportable:
            continue

        all_present = False
        for html_id in reportable:
            print(f'{page}: {html_id}')
        print()
    return all_present
def _build_parser():
    """Create the argument parser with the ``collect`` and ``check`` commands."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='print out more information',
    )
    subparsers = parser.add_subparsers(dest='command', required=True)

    collect = subparsers.add_parser(
        'collect', help='collect IDs from a set of HTML files'
    )
    collect.add_argument(
        'htmldir', type=Path, help='directory with HTML documentation'
    )
    collect.add_argument(
        '-o',
        '--outfile',
        help='File to save the result in; default <htmldir>/html-ids.json.gz',
    )

    check = subparsers.add_parser('check', help='check two archives of IDs')
    check.add_argument(
        'baseline_file', type=Path, help='file with baseline IDs'
    )
    check.add_argument('checked_file', type=Path, help='file with checked IDs')
    check.add_argument(
        '-x',
        '--exclude-file',
        type=Path,
        help='file with IDs to exclude from the check',
    )
    return parser


def _load_ids(path):
    """Return the ``ids_by_page`` mapping stored in the gzipped JSON *path*."""
    with gzip.open(path) as zfile:
        return json.load(zfile)['ids_by_page']


def _read_exclusions(path):
    """Return the set of ``(page, id)`` pairs listed in the file at *path*.

    Each entry is a ``page: id`` line.  Blank lines, lines starting with
    ``#`` and lines without a colon are ignored.
    """
    excluded = set()
    with open(path, encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            name, sep, excluded_id = line.partition(':')
            if sep:
                excluded.add((name.strip(), excluded_id.strip()))
    return excluded


def main(argv):
    """Run the command-line tool.

    *argv* is the full argument vector; ``argv[0]`` (the program name) is
    skipped.  ``collect`` gathers the IDs of an HTML directory and writes
    them as gzipped JSON; ``check`` compares two such archives and prints
    the IDs that are missing from the second one.

    Invalid command-line arguments make argparse raise ``SystemExit``.
    """
    args = _build_parser().parse_args(argv[1:])

    if args.verbose:
        verbose_print = functools.partial(print, file=sys.stderr)
    else:
        # Parameter names are underscored so they do not shadow ``args``.
        def verbose_print(*_args, **_kwargs):
            """Do nothing."""

    if args.command == 'collect':
        ids = gather_ids(args.htmldir, verbose_print=verbose_print)
        outfile = args.outfile
        if outfile is None:
            outfile = args.htmldir / 'html-ids.json.gz'
        with gzip.open(outfile, 'wt', encoding='utf-8') as zfile:
            json.dump({'ids_by_page': ids}, zfile)

    if args.command == 'check':
        baseline = _load_ids(args.baseline_file)
        checked = _load_ids(args.checked_file)
        excluded = set()
        if args.exclude_file:
            excluded = _read_exclusions(args.exclude_file)

        if do_check(baseline, checked, excluded, verbose_print=verbose_print):
            verbose_print('All OK')
        else:
            # Flush the report on stdout first so that it appears before
            # the explanation written to stderr.
            sys.stdout.flush()
            print(
                'ERROR: Removed IDs found',
                'The above HTML IDs were removed from the documentation, '
                + 'resulting in broken links. Please add them back.',
                sep='\n',
                file=sys.stderr,
            )
            if args.exclude_file:
                # Part of the same error message, so it goes to stderr too.
                print(
                    f'Alternatively, add them to {args.exclude_file}.',
                    file=sys.stderr,
                )
            # NOTE(review): the process still exits with status 0 here;
            # confirm whether callers rely on that before changing it.
# Script entry point: hand the full argument vector to main(), which skips
# the program name itself.
if __name__ == '__main__':
    main(argv=sys.argv)