Skip to content

Commit 9b22261

Browse files
authored
GH-145000: Add a tool to record/check removed HTML IDs (#145001)
1 parent f8ce51a commit 9b22261

File tree

3 files changed

+190
-0
lines changed

3 files changed

+190
-0
lines changed

Doc/.ruff.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ ignore = [
3232
"E501", # Ignore line length errors (we use auto-formatting)
3333
]
3434

35+
[lint.per-file-ignores]
36+
"tools/check-html-ids.py" = ["I001"] # Unsorted imports
37+
3538
[format]
3639
preview = true
3740
quote-style = "preserve"

Doc/Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,3 +336,9 @@ autobuild-stable-html:
336336
exit 1;; \
337337
esac
338338
@$(MAKE) autobuild-dev-html
339+
340+
# Collect HTML IDs to a JSON document
341+
.PHONY: html-ids
342+
html-ids:
343+
$(PYTHON) tools/check-html-ids.py collect build/html \
344+
-o build/html/html-ids.json.gz

Doc/tools/check-html-ids.py

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
from compression import gzip
2+
import concurrent.futures
3+
from pathlib import Path
4+
import html.parser
5+
import functools
6+
import argparse
7+
import json
8+
import sys
9+
import re
10+
11+
12+
# IDs that are skipped both when collecting and when checking.  The pattern
# is applied with ``fullmatch``, so the *whole* ID must have one of these
# forms: ``index-<digits>``, ``id<digits>``, or a run of lowercase letters
# and underscores followed by ``_<digits>``.
# NOTE(review): these look like auto-numbered IDs produced by the doc build
# rather than hand-written anchors -- confirm.
IGNORED_ID_RE = re.compile(
    r"""
index-\d+
| id\d+
| [_a-z]+_\d+
""",
    re.VERBOSE,
)
20+
21+
22+
class IDGatherer(html.parser.HTMLParser):
    """HTML parser that records the ``id`` attribute of every start tag.

    Collected IDs are added to the set given to the constructor.  IDs that
    fully match ``IGNORED_ID_RE`` are not recorded.
    """

    def __init__(self, ids):
        """Create a gatherer that adds the IDs it finds to the set *ids*."""
        super().__init__()
        self.__ids = ids

    def handle_starttag(self, tag, attrs):
        """Record the non-ignored ``id`` values among *attrs*."""
        for name, value in attrs:
            if name != 'id':
                continue
            # A valueless attribute (``<p id>``) is reported with a value of
            # None, which would make ``fullmatch`` raise TypeError; an empty
            # ID cannot be a link target either, so skip both.
            if not value:
                continue
            if not IGNORED_ID_RE.fullmatch(value):
                self.__ids.add(value)
32+
33+
34+
def get_ids_from_file(path):
    """Return the set of HTML IDs found in the file at *path*.

    The file is read as UTF-8 text and handed to an ``IDGatherer`` in
    4096-character pieces, so the whole document is never held in memory
    at once.
    """
    found = set()
    parser = IDGatherer(found)
    with path.open(encoding='utf-8') as stream:
        while True:
            piece = stream.read(4096)
            if not piece:
                # An empty read means end of file.
                break
            parser.feed(piece)
    return found
41+
42+
43+
def gather_ids(htmldir, *, verbose_print):
    """Collect the HTML IDs of the pages below *htmldir*.

    *htmldir* is a ``pathlib.Path``; it must contain an ``objects.inv``
    file, otherwise ``ValueError`` is raised.  Every ``*.html`` file below
    it is scanned, except files that have a ``_static`` or ``whatsnew``
    path component.  *verbose_print* is called with progress messages.

    Return a dict mapping each page's path (relative to *htmldir*, as a
    string) to the sorted list of its IDs, with the IDs that occur on every
    page removed.  The dict is empty if no pages were found.
    """
    if not htmldir.joinpath('objects.inv').exists():
        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')

    # sys._is_gil_enabled is a *function* (new in Python 3.13): it has to be
    # called, since the function object itself is always truthy.  Where it
    # does not exist, the interpreter has a GIL.
    is_gil_enabled = getattr(sys, '_is_gil_enabled', None)
    if is_gil_enabled is None or is_gil_enabled():
        pool = concurrent.futures.ProcessPoolExecutor()
    else:
        pool = concurrent.futures.ThreadPoolExecutor()

    ids_by_page = {}
    # The context manager shuts the pool down, also when a task fails.
    with pool:
        tasks = {}
        for path in htmldir.glob('**/*.html'):
            relative_path = path.relative_to(htmldir)
            if '_static' in relative_path.parts:
                continue
            if 'whatsnew' in relative_path.parts:
                continue
            tasks[relative_path] = pool.submit(get_ids_from_file, path=path)

        for relative_path, future in tasks.items():
            verbose_print(relative_path)
            ids = future.result()
            ids_by_page[str(relative_path)] = ids
            verbose_print(f' - {len(ids)} ids found')

    # set.intersection() needs at least one argument, so a directory without
    # any pages has to be special-cased.
    if ids_by_page:
        common = set.intersection(*ids_by_page.values())
    else:
        common = set()
    verbose_print(f'Filtering out {len(common)} common ids')
    for key, page_ids in ids_by_page.items():
        ids_by_page[key] = sorted(page_ids - common)

    return ids_by_page
73+
74+
75+
def do_check(baseline, checked, excluded, *, verbose_print):
    """Report IDs that are in *baseline* but no longer in *checked*.

    *baseline* and *checked* map page names to collections of IDs.
    *excluded* is a set of ``(page name, ID)`` pairs that may be missing
    without being reported; IDs that fully match ``IGNORED_ID_RE`` are not
    reported either.  A page that is absent from *checked* is reported as
    a whole.  Reports are printed to standard output, one
    ``page: id`` line per problem, with a blank line after each page.

    Return True if nothing was reported, False otherwise.
    *verbose_print* is accepted for symmetry with the other commands and
    is currently unused.
    """
    all_present = True
    for page in sorted(baseline):
        if page not in checked:
            all_present = False
            print(f'{page}: (page missing)')
            print()
            continue

        gone = set(baseline[page]).difference(checked[page])
        reportable = sorted(
            candidate
            for candidate in gone
            if not IGNORED_ID_RE.fullmatch(candidate)
            and (page, candidate) not in excluded
        )
        if not reportable:
            continue

        all_present = False
        for removed_id in reportable:
            print(f'{page}: {removed_id}')
        print()
    return all_present
99+
100+
101+
def _load_ids_by_page(path):
    """Return the ``ids_by_page`` mapping stored in the gzipped JSON *path*."""
    with gzip.open(path, 'rt', encoding='utf-8') as zfile:
        return json.load(zfile)['ids_by_page']


def _load_excluded(path):
    """Return the set of ``(page name, ID)`` pairs listed in the file *path*.

    Each entry is a ``page: id`` line.  Blank lines, lines starting with
    ``#`` and lines without a colon are skipped.
    """
    excluded = set()
    with open(path, encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            name, sep, excluded_id = line.partition(':')
            if sep:
                excluded.add((name.strip(), excluded_id.strip()))
    return excluded


def main(argv):
    """Run the command-line interface.

    *argv* is the complete argument vector, program name included (as in
    ``sys.argv``).  Two subcommands are available:

    * ``collect HTMLDIR [-o OUTFILE]`` gathers the IDs of a built HTML tree
      and writes them to a gzipped JSON file;
    * ``check BASELINE_FILE CHECKED_FILE [-x EXCLUDE_FILE]`` reports IDs
      that are in the baseline archive but not in the checked one.

    Return the process exit status: 0 on success, 1 if ``check`` found
    removed IDs.  Invalid arguments make argparse raise ``SystemExit``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='print out more information',
    )
    subparsers = parser.add_subparsers(dest='command', required=True)

    collect = subparsers.add_parser(
        'collect', help='collect IDs from a set of HTML files'
    )
    collect.add_argument(
        'htmldir', type=Path, help='directory with HTML documentation'
    )
    collect.add_argument(
        '-o',
        '--outfile',
        help='File to save the result in; default <htmldir>/html-ids.json.gz',
    )

    check = subparsers.add_parser('check', help='check two archives of IDs')
    check.add_argument(
        'baseline_file', type=Path, help='file with baseline IDs'
    )
    check.add_argument('checked_file', type=Path, help='file with checked IDs')
    check.add_argument(
        '-x',
        '--exclude-file',
        type=Path,
        help='file with IDs to exclude from the check',
    )

    args = parser.parse_args(argv[1:])

    if args.verbose:
        # Progress goes to stderr so that it never mixes with the report.
        verbose_print = functools.partial(print, file=sys.stderr)
    else:

        def verbose_print(*_args, **_kwargs):
            """do nothing"""

    if args.command == 'collect':
        ids = gather_ids(args.htmldir, verbose_print=verbose_print)
        if args.outfile is None:
            args.outfile = args.htmldir / 'html-ids.json.gz'
        with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile:
            json.dump({'ids_by_page': ids}, zfile)
        return 0

    # ``required=True`` above guarantees that the command is 'check' here.
    baseline = _load_ids_by_page(args.baseline_file)
    checked = _load_ids_by_page(args.checked_file)
    excluded = set()
    if args.exclude_file:
        excluded = _load_excluded(args.exclude_file)

    if do_check(baseline, checked, excluded, verbose_print=verbose_print):
        verbose_print('All OK')
        return 0

    # Flush the report on stdout first, so that the explanation on stderr
    # follows it when both streams end up in the same log.
    sys.stdout.flush()
    print(
        'ERROR: Removed IDs found',
        'The above HTML IDs were removed from the documentation, '
        + 'resulting in broken links. Please add them back.',
        sep='\n',
        file=sys.stderr,
    )
    if args.exclude_file:
        print(
            f'Alternatively, add them to {args.exclude_file}.',
            file=sys.stderr,
        )
    # A failed check must be visible to the caller (make, CI, ...).
    return 1


if __name__ == '__main__':
    sys.exit(main(sys.argv))

0 commit comments

Comments
 (0)