Skip to content

Commit 5b3ef99

Browse files
committed
feat(dedupe): add a deduplication script
Add a script capable of deduplicating assets used between multiple build targets. When attempting to play with Sphinx's asset paths, I noticed there was a lot of logic around preserving relative links. It didn't seem like they want to allow usage of an asset path outside of the build directory. The WebSupport class seemed to allow some of the flexibility we were looking for, but unfortunately it still assumes the conf.py is in the source directory and has issues working with other Sphinx extensions like ifconfig. Given that what we are doing seems to be a niche use case, we can address this with a little post-processing using lxml filters. Signed-off-by: Randolph Sapp <[email protected]>
1 parent ccba41b commit 5b3ef99

File tree

1 file changed

+141
-0
lines changed

1 file changed

+141
-0
lines changed

bin/dedupe.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#!/usr/bin/env python3
2+
3+
"""Tool to deduplicate HTML assets for GitHub pages deployments
4+
5+
SPDX-License-Identifier: MIT
6+
Copyright (C) 2025 Texas Instruments Incorporated - https://www.ti.com
7+
"""
8+
9+
import argparse
10+
import logging
11+
from multiprocessing import Pool
12+
13+
from lxml import html
14+
from root_index import get_root_index, BUILD_PATH
15+
16+
# Directory names, relative to each document root, whose files are moved into the
# shared common directory and whose links are rewritten to point there.
COMMON_PATHS = {"_images", "_downloads", "_static"}
17+
18+
19+
def _rewrite_path(html_path, common_dir, check_list):
    """Rewrite the asset links of one HTML file so they point into the common directory.

    The file is parsed with lxml, every link is passed through a local callback using
    ``rewrite_links``, and the document is serialized back to ``html_path`` in place. The
    callback is a closure so the resolved base paths are computed once per file rather
    than once per link.

    HTML files that themselves live inside one of the ``check_list`` directories are left
    untouched: those directories are relocated as a whole, so the relative links between
    the files inside them stay valid, whereas rewriting them against the old location
    would leave them pointing at the wrong place after the move.

    :param html_path: Pathlib path to the HTML file
    :param common_dir: Pathlib path to the common directory the assets are moved into
    :param check_list: Iterable of resolved pathlib paths to the asset directories to check
    """
    old_rel_path = html_path.parent.resolve()
    new_rel_path = common_dir.resolve()

    # A page inside an asset directory moves together with the assets it links to.
    if any(old_rel_path.is_relative_to(check_path) for check_path in check_list):
        logging.debug("skipping page inside an asset directory: %s", html_path)
        return

    with html_path.open("r", encoding="utf-8") as file:
        document = html.fromstring(file.read())

    def _update_link(link):
        """Function to interact with lxml's rewrite_links

        :param link: String link to rewrite
        :return: The link relative to the HTML file's directory but pointing into the
            common directory, or the original link when it is not under a checked path
        """
        clean_link = link.strip()
        # Only real web URLs are skipped here. Testing for a bare "http" prefix would
        # also skip relative links to files whose names merely begin with "http".
        if clean_link.startswith(("http://", "https://")):
            return link

        link_path = old_rel_path.joinpath(clean_link).resolve()
        for check_path in check_list:
            if link_path.is_relative_to(check_path):
                logging.info("rewriting link in: %s", html_path)
                logging.debug("old link path: %s", link_path)
                # Keep the asset directory name (e.g. "_static") in the new location by
                # making the link relative to the parent of the checked directory.
                new_path = new_rel_path.joinpath(
                    link_path.relative_to(check_path.parent)
                )
                logging.debug("new link path: %s", new_path)
                # walk_up allows ".." segments, since the common directory is not
                # below the HTML file's own directory.
                rel_path = new_path.relative_to(old_rel_path, walk_up=True)
                logging.debug("new rel path: %s", rel_path)
                logging.debug("---")
                return rel_path.as_posix()

        return link

    # resolve_base_href=False: links are passed to the callback as written, without
    # first being resolved against a <base href> element.
    document.rewrite_links(_update_link, resolve_base_href=False)

    with html_path.open("wb") as file:
        file.write(
            html.tostring(
                document,
                encoding="utf-8",
                include_meta_content_type=True,
                doctype="<!DOCTYPE html>",
            )
        )
69+
70+
71+
def _move_files(old_rel_path, new_rel_path, check_list):
    """Relocate every file found under the checked directories into the common directory.

    Each file keeps its path relative to ``old_rel_path`` when it is placed under
    ``new_rel_path``; an existing file at the destination is replaced. Once the files of
    a checked directory have been moved, the sub-directories left behind inside it are
    removed.

    :param old_rel_path: Pathlib path to the document root directory
    :param new_rel_path: Pathlib path to the new common directory
    :param check_list: Iterable of pathlib paths to check
    """
    for asset_root in check_list:
        source_dir = old_rel_path / asset_root

        for entry in source_dir.glob("**/*"):
            if entry.is_file():
                relative = entry.relative_to(old_rel_path)
                logging.info("moving file: %s", relative)
                target = new_rel_path / relative
                logging.debug("destination: %s", target)
                target.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
                entry.replace(target)

        # Reverse-sorted order visits nested directories before their parents, so each
        # directory is already empty by the time it is removed.
        leftovers = list(source_dir.glob("**/*"))
        leftovers.sort(reverse=True)
        for leftover in leftovers:
            leftover.rmdir()
92+
93+
94+
def rewrite_paths(root_dir, common_dir, jobs):
    """Point a document's asset links at common_dir, then move the assets there.

    The HTML files below root_dir are rewritten in parallel by a process pool, after
    which the files in the COMMON_PATHS directories are moved. This assumes:

    1. Paths are already relative to the given root_dir
    2. The root_dir resides under the common_dir

    :param root_dir: Pathlib path to document root directory
    :param common_dir: Pathlib path to new common_dir directory
    :param jobs: Number of worker processes used to rewrite the HTML files
    """
    asset_dirs = set()
    for name in COMMON_PATHS:
        asset_dirs.add((root_dir / name).resolve())

    logging.info("rewriting paths")
    work_items = []
    for page in root_dir.glob("**/*.html"):
        work_items.append((page, common_dir, asset_dirs))

    with Pool(jobs) as workers:
        workers.starmap(_rewrite_path, work_items)

    logging.info("moving the files")
    _move_files(root_dir.resolve(), common_dir.resolve(), asset_dirs)
112+
113+
114+
def main():
    """Parse the command line, find each document root in BUILD_PATH and dedupe it."""
    parser = argparse.ArgumentParser(
        description="Tool to deduplicate HTML assets for GitHub pages deployments",
        prog="dedupe.py",
    )
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-j", "--jobs", type=int, default=8)
    args = parser.parse_args()

    log_level = logging.INFO
    if args.verbose:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level)

    # Collect every document directory first, then process them one after another.
    document_dirs = []
    for candidate in BUILD_PATH.glob("*/"):
        index_path = get_root_index(candidate)
        if not index_path:
            continue
        document_dirs.append(index_path.parent)
        logging.info("found the following index: %s", index_path)

    for document_dir in document_dirs:
        logging.info("working on the following document dir: %s", document_dir)
        rewrite_paths(document_dir, BUILD_PATH, args.jobs)


# Run the tool only when the file is executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)