GH-145000: Add a tool to record/check removed HTML IDs (#145001)

encukou · web-flow · commit 9b22261a86b5 · 2026-02-25T13:37:59.000+01:00
diff --git a/Doc/.ruff.toml b/Doc/.ruff.toml
@@ -32,6 +32,9 @@ ignore = [
     "E501",  # Ignore line length errors (we use auto-formatting)
 ]
 
+[lint.per-file-ignores]
+"tools/check-html-ids.py" = ["I001"]  # Unsorted imports
+
 [format]
 preview = true
 quote-style = "preserve"
diff --git a/Doc/Makefile b/Doc/Makefile
@@ -336,3 +336,9 @@ autobuild-stable-html:
 		exit 1;; \
 	esac
 	@$(MAKE) autobuild-dev-html
+
+# Collect HTML IDs to a JSON document
+.PHONY: html-ids
+html-ids:
+	$(PYTHON) tools/check-html-ids.py collect build/html \
+		-o build/html/html-ids.json.gz
diff --git a/Doc/tools/check-html-ids.py b/Doc/tools/check-html-ids.py
@@ -0,0 +1,181 @@
+from compression import gzip
+import concurrent.futures
+from pathlib import Path
+import html.parser
+import functools
+import argparse
+import json
+import sys
+import re
+
+
+# IDs that are neither collected nor reported as removed.  The pattern is
+# always applied with `fullmatch()`, so an alternative has to cover the
+# whole ID: `index-<digits>`, `id<digits>`, or lowercase letters/underscores
+# followed by `_<digits>`.
+# NOTE(review): presumably these are auto-numbered IDs that are not stable
+# between documentation builds -- confirm.
+IGNORED_ID_RE = re.compile(
+    r"""
+    index-\d+
+    | id\d+
+    | [_a-z]+_\d+
+""",
+    re.VERBOSE,
+)
+
+
+class IDGatherer(html.parser.HTMLParser):
+    """HTML parser that records the ``id`` attribute of every start tag.
+
+    Found IDs are added to the set passed to the constructor.  IDs that
+    fully match ``IGNORED_ID_RE`` are skipped.
+    """
+
+    def __init__(self, ids):
+        """Create a parser that adds the IDs it finds to the set *ids*."""
+        super().__init__()
+        self.__ids = ids
+
+    def handle_starttag(self, tag, attrs):
+        """Record the non-ignored ``id`` attribute values of a start tag."""
+        for name, value in attrs:
+            # A valueless attribute (`<div id>`) is reported with a value of
+            # None and `id=""` with an empty string.  Neither can be a link
+            # target, and None would make `fullmatch()` raise TypeError.
+            if name == 'id' and value:
+                if not IGNORED_ID_RE.fullmatch(value):
+                    self.__ids.add(value)
+
+
+def get_ids_from_file(path):
+    """Return the set of non-ignored HTML IDs found in the file at *path*.
+
+    *path* needs an ``open()`` method like the one of ``pathlib.Path``.
+    The file is decoded as UTF-8 and fed to the parser in chunks, so it is
+    never held in memory as a whole.
+    """
+    ids = set()
+    gatherer = IDGatherer(ids)
+    with path.open(encoding='utf-8') as file:
+        while chunk := file.read(4096):
+            gatherer.feed(chunk)
+    # Tell the parser that the input is complete, so that data it still
+    # buffers (waiting for more input) is processed as well.
+    gatherer.close()
+    return ids
+
+
+def gather_ids(htmldir, *, verbose_print):
+    """Collect the page-specific HTML IDs of a Sphinx HTML build.
+
+    Every ``*.html`` file below *htmldir* is parsed, except for files that
+    have a ``_static`` or ``whatsnew`` component in their relative path.
+    IDs that occur on every parsed page are dropped from the result.
+
+    *verbose_print* is called with progress messages.
+
+    Return a dict that maps each page's path (relative to *htmldir*, as a
+    string) to the sorted list of the remaining IDs.  The dict is empty if
+    no page was found.
+
+    Raise ValueError if *htmldir* does not contain an ``objects.inv`` file.
+    """
+    if not htmldir.joinpath('objects.inv').exists():
+        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')
+
+    # `sys._is_gil_enabled` is a function.  It has to be *called*: the
+    # function object itself is always true, which would make the thread
+    # pool branch unreachable.
+    if sys._is_gil_enabled():
+        executor_class = concurrent.futures.ProcessPoolExecutor
+    else:
+        executor_class = concurrent.futures.ThreadPoolExecutor
+
+    ids_by_page = {}
+    # The `with` block shuts the pool down, also if a task fails.
+    with executor_class() as pool:
+        tasks = {}
+        for path in htmldir.glob('**/*.html'):
+            relative_path = path.relative_to(htmldir)
+            parts = relative_path.parts
+            if '_static' in parts or 'whatsnew' in parts:
+                continue
+            tasks[relative_path] = pool.submit(get_ids_from_file, path=path)
+
+        for relative_path, future in tasks.items():
+            verbose_print(relative_path)
+            ids = future.result()
+            ids_by_page[str(relative_path)] = ids
+            verbose_print(f'    - {len(ids)} ids found')
+
+    if not ids_by_page:
+        # `set.intersection()` needs at least one set to work on.
+        return ids_by_page
+
+    # IDs present on every page are not specific to any page's content.
+    common = set.intersection(*ids_by_page.values())
+    verbose_print(f'Filtering out {len(common)} common ids')
+    for key, page_ids in ids_by_page.items():
+        ids_by_page[key] = sorted(page_ids - common)
+
+    return ids_by_page
+
+
+def do_check(baseline, checked, excluded, *, verbose_print):
+    """Print the IDs that are in *baseline* but no longer in *checked*.
+
+    *baseline* and *checked* map page names to iterables of IDs.
+    *excluded* is a container of ``(page name, ID)`` pairs that are not
+    reported; IDs that fully match ``IGNORED_ID_RE`` are not reported
+    either.  A baseline page that is absent from *checked* is reported as
+    a whole.  *verbose_print* is accepted but not used.
+
+    Findings are printed to standard output, one ``page: id`` line each,
+    with an empty line after each page.  Return True if nothing was
+    reported, False otherwise.
+    """
+    all_found = True
+    for page in sorted(baseline):
+        if page not in checked:
+            all_found = False
+            print(f'{page}: (page missing)')
+            print()
+            continue
+
+        removed = sorted(
+            html_id
+            for html_id in set(baseline[page]).difference(checked[page])
+            if IGNORED_ID_RE.fullmatch(html_id) is None
+            and (page, html_id) not in excluded
+        )
+        if removed:
+            all_found = False
+            for html_id in removed:
+                print(f'{page}: {html_id}')
+            print()
+    return all_found
+
+
+def main(argv):
+    """Run the command-line interface; return the process exit status.
+
+    *argv* is the full argument list, including the program name (which
+    is skipped).  Two sub-commands are available:
+
+    * ``collect HTMLDIR [-o OUTFILE]`` gathers the IDs of the HTML files
+      in HTMLDIR and writes them as gzip-compressed JSON, by default to
+      ``HTMLDIR/html-ids.json.gz``.
+    * ``check BASELINE CHECKED [-x EXCLUDE_FILE]`` reports IDs that are
+      in the BASELINE archive but not in the CHECKED one.  EXCLUDE_FILE
+      holds ``page: id`` lines naming IDs that are not reported; empty
+      lines and lines starting with ``#`` are skipped.
+
+    Return 0 on success and 1 if the check found removed IDs.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-v',
+        '--verbose',
+        action='store_true',
+        help='print out more information',
+    )
+    subparsers = parser.add_subparsers(dest='command', required=True)
+
+    collect = subparsers.add_parser(
+        'collect', help='collect IDs from a set of HTML files'
+    )
+    collect.add_argument(
+        'htmldir', type=Path, help='directory with HTML documentation'
+    )
+    collect.add_argument(
+        '-o',
+        '--outfile',
+        help='File to save the result in; default <htmldir>/html-ids.json.gz',
+    )
+
+    check = subparsers.add_parser('check', help='check two archives of IDs')
+    check.add_argument(
+        'baseline_file', type=Path, help='file with baseline IDs'
+    )
+    check.add_argument('checked_file', type=Path, help='file with checked IDs')
+    check.add_argument(
+        '-x',
+        '--exclude-file',
+        type=Path,
+        help='file with IDs to exclude from the check',
+    )
+
+    args = parser.parse_args(argv[1:])
+
+    # Progress messages go to stderr, and only when asked for.
+    if args.verbose:
+        verbose_print = functools.partial(print, file=sys.stderr)
+    else:
+
+        def verbose_print(*args, **kwargs):
+            """do nothing"""
+
+    if args.command == 'collect':
+        ids = gather_ids(args.htmldir, verbose_print=verbose_print)
+        if args.outfile is None:
+            args.outfile = args.htmldir / 'html-ids.json.gz'
+        with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile:
+            json.dump({'ids_by_page': ids}, zfile)
+
+    if args.command == 'check':
+        with gzip.open(args.baseline_file) as zfile:
+            baseline = json.load(zfile)['ids_by_page']
+        with gzip.open(args.checked_file) as zfile:
+            checked = json.load(zfile)['ids_by_page']
+        excluded = set()
+        if args.exclude_file:
+            with open(args.exclude_file, encoding='utf-8') as file:
+                for line in file:
+                    line = line.strip()
+                    if line and not line.startswith('#'):
+                        # Split on the first colon only: `page: id`.
+                        # Lines without a colon are skipped.
+                        name, sep, excluded_id = line.partition(':')
+                        if sep:
+                            excluded.add((name.strip(), excluded_id.strip()))
+        if do_check(baseline, checked, excluded, verbose_print=verbose_print):
+            verbose_print('All OK')
+        else:
+            # Make sure the list of IDs on stdout appears before the
+            # explanation on stderr.
+            sys.stdout.flush()
+            print(
+                'ERROR: Removed IDs found',
+                'The above HTML IDs were removed from the documentation, '
+                + 'resulting in broken links. Please add them back.',
+                sep='\n',
+                file=sys.stderr,
+            )
+            if args.exclude_file:
+                # Part of the error message, so it goes to stderr as well.
+                print(
+                    f'Alternatively, add them to {args.exclude_file}.',
+                    file=sys.stderr,
+                )
+            # Let callers (CI, make) see that the check failed.
+            return 1
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))

Original file line number	Diff line number	Diff line change
`@@ -32,6 +32,9 @@ ignore = [`
`32`	`32`	`"E501", # Ignore line length errors (we use auto-formatting)`
`33`	`33`	`]`
`34`	`34`
	`35`	`+[lint.per-file-ignores]`
	`36`	`+"tools/check-html-ids.py" = ["I001"] # Unsorted imports`
	`37`	`+`
`35`	`38`	`[format]`
`36`	`39`	`preview = true`
`37`	`40`	`quote-style = "preserve"`