File: check-html-ids.py

package info (click to toggle)
python3.14 3.14.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky
  • size: 152,200 kB
  • sloc: python: 757,783; ansic: 718,195; xml: 31,250; sh: 5,982; cpp: 4,093; makefile: 2,007; objc: 787; lisp: 502; javascript: 136; asm: 75; csh: 12
file content (181 lines) | stat: -rw-r--r-- 5,608 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
from compression import gzip
import concurrent.futures
from pathlib import Path
import html.parser
import functools
import argparse
import json
import sys
import re


# Ids that match this pattern are auto-numbered by the documentation build
# and are expected to differ between builds, so they are never reported.
IGNORED_ID_RE = re.compile(
    r'index-\d+'      # e.g. "index-3": numbered index anchors
    r'|id\d+'         # e.g. "id7": generic fallback ids
    r'|[_a-z]+_\d+',  # e.g. "foo_3": numbered lowercase anchors
)


class IDGatherer(html.parser.HTMLParser):
    """HTML parser that records ``id`` attribute values from start tags.

    Every id that does not match IGNORED_ID_RE is added to the set
    supplied at construction time; the caller owns that set.
    """

    def __init__(self, ids):
        super().__init__()
        self.__ids = ids

    def handle_starttag(self, tag, attrs):
        # *attrs* is a list of (name, value) pairs for the tag's attributes.
        self.__ids.update(
            value
            for name, value in attrs
            if name == 'id' and not IGNORED_ID_RE.fullmatch(value)
        )


def get_ids_from_file(path):
    """Return the set of non-ignored HTML ids found in the file at *path*."""
    found = set()
    parser = IDGatherer(found)
    with path.open(encoding='utf-8') as stream:
        # Feed the parser in fixed-size chunks; iteration stops at EOF
        # (read() returns the empty-string sentinel).
        for chunk in iter(lambda: stream.read(4096), ''):
            parser.feed(chunk)
    return found


def gather_ids(htmldir, *, verbose_print):
    """Collect HTML ids from every page under *htmldir*.

    Returns a dict mapping each page's relative path (as a string) to a
    sorted list of its ids, with ids that appear on every page filtered
    out.  Pages under ``_static`` and ``whatsnew`` are skipped.

    Raises ValueError if *htmldir* does not look like a Sphinx HTML
    output directory (no ``objects.inv`` file).
    """
    if not htmldir.joinpath('objects.inv').exists():
        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')

    # Parsing is CPU-bound, so real parallelism needs processes while the
    # GIL is enabled; on a free-threaded build, threads are cheaper.
    # BUG FIX: sys._is_gil_enabled is a function; the original tested the
    # function object itself (always truthy) instead of calling it, so a
    # ProcessPoolExecutor was always chosen.
    if sys._is_gil_enabled():
        executor_factory = concurrent.futures.ProcessPoolExecutor
    else:
        executor_factory = concurrent.futures.ThreadPoolExecutor

    ids_by_page = {}
    # BUG FIX: use the executor as a context manager so its workers are
    # always shut down; the original never called shutdown().
    with executor_factory() as pool:
        tasks = {}
        for path in htmldir.glob('**/*.html'):
            relative_path = path.relative_to(htmldir)
            if '_static' in relative_path.parts:
                continue
            if 'whatsnew' in relative_path.parts:
                continue
            tasks[relative_path] = pool.submit(get_ids_from_file, path=path)

        for relative_path, future in tasks.items():
            verbose_print(relative_path)
            ids = future.result()
            ids_by_page[str(relative_path)] = ids
            verbose_print(f'    - {len(ids)} ids found')

    if not ids_by_page:
        # No pages found: set.intersection() with no arguments would raise
        # TypeError, so return the empty mapping directly.
        return ids_by_page

    common = set.intersection(*ids_by_page.values())
    verbose_print(f'Filtering out {len(common)} common ids')
    for key, page_ids in ids_by_page.items():
        ids_by_page[key] = sorted(page_ids - common)

    return ids_by_page


def do_check(baseline, checked, excluded, *, verbose_print):
    """Report ids from *baseline* that are absent from *checked*.

    *baseline* and *checked* map page names to iterables of ids;
    *excluded* is a set of ``(page, id)`` pairs to ignore.  Each missing
    page or id is printed to stdout.  Returns True when nothing is
    missing.  (*verbose_print* is accepted for signature symmetry with
    the other stages and is currently unused.)
    """
    ok = True
    for page, baseline_ids in sorted(baseline.items()):
        if page not in checked:
            ok = False
            print(f'{page}: (page missing)')
            print()
            continue
        gone = set(baseline_ids) - set(checked[page])
        if gone:
            # Drop auto-generated ids and explicitly excluded pairs.
            gone = {
                ident
                for ident in gone
                if not IGNORED_ID_RE.fullmatch(ident)
                and (page, ident) not in excluded
            }
        if gone:
            ok = False
            for ident in sorted(gone):
                print(f'{page}: {ident}')
            print()
    return ok


def main(argv):
    """Command-line entry point.

    Subcommands:
      collect HTMLDIR   gather ids from Sphinx HTML output and save them
                        as gzipped JSON (default: HTMLDIR/html-ids.json.gz)
      check BASE NEW    compare two saved archives and report ids that
                        were removed; -x FILE excludes "page: id" lines
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='print out more information',
    )
    subparsers = parser.add_subparsers(dest='command', required=True)

    collect = subparsers.add_parser(
        'collect', help='collect IDs from a set of HTML files'
    )
    collect.add_argument(
        'htmldir', type=Path, help='directory with HTML documentation'
    )
    collect.add_argument(
        '-o',
        '--outfile',
        help='File to save the result in; default <htmldir>/html-ids.json.gz',
    )

    check = subparsers.add_parser('check', help='check two archives of IDs')
    check.add_argument(
        'baseline_file', type=Path, help='file with baseline IDs'
    )
    check.add_argument('checked_file', type=Path, help='file with checked IDs')
    check.add_argument(
        '-x',
        '--exclude-file',
        type=Path,
        help='file with IDs to exclude from the check',
    )

    args = parser.parse_args(argv[1:])

    # Progress/diagnostic output goes to stderr so stdout stays reserved
    # for the check results themselves.
    if args.verbose:
        verbose_print = functools.partial(print, file=sys.stderr)
    else:

        def verbose_print(*args, **kwargs):
            """do nothing"""

    if args.command == 'collect':
        ids = gather_ids(args.htmldir, verbose_print=verbose_print)
        if args.outfile is None:
            args.outfile = args.htmldir / 'html-ids.json.gz'
        with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile:
            json.dump({'ids_by_page': ids}, zfile)

    if args.command == 'check':
        with gzip.open(args.baseline_file) as zfile:
            baseline = json.load(zfile)['ids_by_page']
        with gzip.open(args.checked_file) as zfile:
            checked = json.load(zfile)['ids_by_page']
        excluded = set()
        if args.exclude_file:
            # Exclusion file format: one "page.html: some-id" per line;
            # blank lines and lines starting with '#' are ignored.
            with open(args.exclude_file, encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if line and not line.startswith('#'):
                        name, sep, excluded_id = line.partition(':')
                        if sep:
                            excluded.add((name.strip(), excluded_id.strip()))
        if do_check(baseline, checked, excluded, verbose_print=verbose_print):
            verbose_print('All OK')
        else:
            sys.stdout.flush()
            print(
                'ERROR: Removed IDs found',
                'The above HTML IDs were removed from the documentation, '
                + 'resulting in broken links. Please add them back.',
                sep='\n',
                file=sys.stderr,
            )
            if args.exclude_file:
                # BUG FIX: this hint continues the error message above, so it
                # belongs on stderr too (it previously went to stdout).
                print(
                    f'Alternatively, add them to {args.exclude_file}.',
                    file=sys.stderr,
                )


# Run the CLI only when executed as a script, so the module can be
# imported (e.g. by tests) without side effects.
if __name__ == '__main__':
    main(sys.argv)