From ccba41bfc9fc893be00be05d5e4980e51a049879 Mon Sep 17 00:00:00 2001
From: Randolph Sapp
Date: Sun, 27 Jul 2025 23:59:37 -0500
Subject: [PATCH 1/3] feat(docker): add lxml as a requirement

Add lxml as a requirement for a deduplication script we'll use to
reduce deployment size on GitHub.

Signed-off-by: Randolph Sapp
---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index bba255449..b19d51ce5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,6 +8,7 @@ docutils==0.21.2
 idna==3.10
 imagesize==1.4.1
 Jinja2==3.1.5
+lxml==6.0.0
 markdown-it-py==3.0.0
 MarkupSafe==3.0.2
 mdurl==0.1.2

From 5b3ef99241ea3d210d7c6e036eaec3bc75c33c18 Mon Sep 17 00:00:00 2001
From: Randolph Sapp
Date: Sun, 27 Jul 2025 20:39:05 -0500
Subject: [PATCH 2/3] feat(dedupe): add a deduplication script

Add a script capable of deduplicating assets shared between multiple
build targets.

When attempting to play with Sphinx's asset paths, I noticed there is a
lot of logic around preserving relative links; Sphinx does not seem to
want to allow asset paths outside of the build directory. The
WebSupport class seemed to offer some of the flexibility we were
looking for, but unfortunately it still assumes conf.py is in the
source directory and has issues working with other Sphinx extensions
such as ifconfig.

Given that what we are doing seems to be a niche use case, we can
address it with a little post-processing using lxml filters.

Signed-off-by: Randolph Sapp
---
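A note on the lxml usage below: the rewrite step combines lxml's
HtmlElement.rewrite_links() with pathlib's Path.relative_to(walk_up=True),
so it needs Python 3.12+. A rough standalone sketch of that pattern, with
hypothetical paths in place of the ones the script derives from the Sphinx
build tree:

from pathlib import Path

from lxml import html

# Hypothetical layout: build/<device>/index.html pages share build/_static
page = Path("build/device-a/index.html")
shared_static = Path("build/_static").resolve()
local_static = (page.parent / "_static").resolve()

document = html.fromstring(page.read_text(encoding="utf-8"))

def to_shared(link):
    """Point links into the local _static at the shared copy one level up."""
    if link.startswith(("http://", "https://")):
        return link
    target = (page.parent / link).resolve()
    if target.is_relative_to(local_static):
        # e.g. "_static/css/theme.css" -> "../_static/css/theme.css"
        shared = shared_static / target.relative_to(local_static)
        return shared.relative_to(page.parent.resolve(), walk_up=True).as_posix()
    return link

document.rewrite_links(to_shared, resolve_base_href=False)
page.write_bytes(html.tostring(document, encoding="utf-8"))

The actual script below generalizes this over _images, _downloads and
_static, and fans the work out across every HTML file with a
multiprocessing Pool.
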
 bin/dedupe.py | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100755 bin/dedupe.py

diff --git a/bin/dedupe.py b/bin/dedupe.py
new file mode 100755
index 000000000..64e3b091a
--- /dev/null
+++ b/bin/dedupe.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+
+"""Tool to deduplicate HTML assets for GitHub pages deployments
+
+SPDX-License-Identifier: MIT
+Copyright (C) 2025 Texas Instruments Incorporated - https://www.ti.com
+"""
+
+import argparse
+import logging
+from multiprocessing import Pool
+
+from lxml import html
+from root_index import get_root_index, BUILD_PATH
+
+COMMON_PATHS = {"_images", "_downloads", "_static"}
+
+
+def _rewrite_path(html_path, common_dir, check_list):
+    """Wrapper to replace links using lxml rewrite_links. Defines a throwaway function to make
+    things faster.
+
+    :param html_path: Pathlib path to the HTML file
+    :param common_dir: Pathlib path to the new common directory
+    :param check_list: Iterable of pathlib paths to check
+    """
+    with html_path.open("r", encoding="utf-8") as file:
+        document = html.fromstring(file.read())
+
+    old_rel_path = html_path.parent.resolve()
+    new_rel_path = common_dir.resolve()
+
+    def _update_link(link):
+        """Function to interact with lxml's rewrite_links
+
+        :param link: String link to rewrite
+        """
+        clean_link = link.strip()
+        if clean_link[:4] == "http":
+            return link
+
+        link_path = old_rel_path.joinpath(clean_link).resolve()
+        for check_path in check_list:
+            if link_path.is_relative_to(check_path):
+                logging.info("rewriting link in: %s", html_path)
+                logging.debug("old link path: %s", link_path)
+                new_path = new_rel_path.joinpath(
+                    link_path.relative_to(check_path.parent)
+                )
+                logging.debug("new link path: %s", new_path)
+                rel_path = new_path.relative_to(old_rel_path, walk_up=True)
+                logging.debug("new rel path: %s", rel_path)
+                logging.debug("---")
+                return rel_path.as_posix()
+
+        return link
+
+    document.rewrite_links(_update_link, resolve_base_href=False)
+
+    with html_path.open("wb") as file:
+        file.write(
+            html.tostring(
+                document,
+                encoding="utf-8",
+                include_meta_content_type=True,
+                doctype="<!DOCTYPE html>",
+            )
+        )
+
+
+def _move_files(old_rel_path, new_rel_path, check_list):
+    """Move the files that match the check_list from the old_rel_path root into new_rel_path.
+
+    :param old_rel_path: Pathlib path to the document root directory
+    :param new_rel_path: Pathlib path to the new common directory
+    :param check_list: Iterable of pathlib paths to check
+    """
+    for check_path in check_list:
+        operating_dir = old_rel_path.joinpath(check_path)
+        for path in operating_dir.glob("**/*"):
+            if not path.is_file():
+                continue
+            rel = path.relative_to(old_rel_path)
+            logging.info("moving file: %s", rel)
+            new = new_rel_path.joinpath(rel)
+            logging.debug("destination: %s", new)
+            new.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
+            path.replace(new)
+
+        for empty_dir in sorted(operating_dir.glob("**/*"), reverse=True):
+            empty_dir.rmdir()
+
+
+def rewrite_paths(root_dir, common_dir, jobs):
+    """Rewrite the paths and move assets into the common_dir directory. This assumes:
+
+    1. Paths are already relative to the given root_dir
+    2. The root_dir resides under the common_dir
+
+    :param root_dir: Pathlib path to the document root directory
+    :param common_dir: Pathlib path to the new common directory
+    """
+    check_list = {root_dir.joinpath(x).resolve() for x in COMMON_PATHS}
+    logging.info("rewriting paths")
+    starmap_iterable = [
+        (path, common_dir, check_list) for path in root_dir.glob("**/*.html")
+    ]
+    with Pool(jobs) as pool:
+        pool.starmap(_rewrite_path, starmap_iterable)
+    logging.info("moving the files")
+    _move_files(root_dir.resolve(), common_dir.resolve(), check_list)
+
+
+def main():
+    """Main processing loop"""
+    parser = argparse.ArgumentParser(
+        prog="dedupe.py",
+        description="Tool to deduplicate HTML assets for GitHub pages deployments",
+    )
+
+    parser.add_argument("-v", "--verbose", action="store_true")
+    parser.add_argument("-j", "--jobs", type=int, default=8)
+
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    root_list = []
+    for path in BUILD_PATH.glob("*/"):
+        index_path = get_root_index(path)
+        if index_path:
+            root_list.append(index_path.parent)
+            logging.info("found the following index: %s", index_path)
+
+    for path in root_list:
+        logging.info("working on the following document dir: %s", path)
+        rewrite_paths(path, BUILD_PATH, args.jobs)
+
+
+if __name__ == "__main__":
+    main()

From 05a94350e51f99397a49ef3e1662846257119352 Mon Sep 17 00:00:00 2001
From: Randolph Sapp
Date: Mon, 28 Jul 2025 00:00:51 -0500
Subject: [PATCH 3/3] ci(deploy): add a deduplication stage

Add a deduplication stage that uses the new dedupe.py script to cut
down on duplicated deployment assets.

Signed-off-by: Randolph Sapp
---
 .github/workflows/deploy.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 41849368f..934968145 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -36,6 +36,9 @@ jobs:
       - name: Generate root index
         run: ./bin/root_index.py
 
+      - name: Deduplicate assets
+        run: ./bin/dedupe.py
+
       - name: Upload static files as single artifact
         uses: actions/upload-pages-artifact@v3
         with: