From 0964c2e09268158b18292a4c59077f3c290ff1c7 Mon Sep 17 00:00:00 2001 From: Craig Magina Date: Sat, 7 Feb 2026 10:32:53 -0500 Subject: [PATCH 1/6] feat: add PyTorch setup script for modular installation Add scripts/devinstall_torch.sh to support PyTorch installation and configuration within containers. This script: - Downloads PyTorch source from GitHub when not mounted as a volume - Installs build dependencies for PyTorch compilation - Supports installing PyTorch wheels from PyPI (release, nightly, test) - Provides flexible configuration via INSTALL_TORCH environment variable The script supports multiple installation modes: - source: Build from source (with auto-download if not mounted) - release/nightly/test: Install wheels from PyPI - skip: Skip PyTorch installation This is part of the modular script architecture introduced in PR #115. Signed-off-by: Craig Magina --- .github/workflows/amd-image.yml | 2 + .github/workflows/cpu-image.yml | 2 + .github/workflows/nvidia-image.yml | 2 + Makefile | 7 +- dockerfiles/Dockerfile.triton | 1 + dockerfiles/Dockerfile.triton-amd | 1 + dockerfiles/Dockerfile.triton-cpu | 1 + scripts/devinstall_torch.sh | 213 +++++++++++++++++++++++++++++ scripts/devinstall_triton.sh | 37 ++--- scripts/devsetup.sh | 5 + 10 files changed, 245 insertions(+), 26 deletions(-) create mode 100755 scripts/devinstall_torch.sh diff --git a/.github/workflows/amd-image.yml b/.github/workflows/amd-image.yml index a9bc99d..99d222f 100644 --- a/.github/workflows/amd-image.yml +++ b/.github/workflows/amd-image.yml @@ -11,6 +11,7 @@ on: # yamllint disable-line rule:truthy - scripts/devinstall_software.sh - scripts/ldpretend.sh - scripts/devinstall_llvm.sh + - scripts/devinstall_torch.sh - scripts/devinstall_triton.sh - scripts/devcreate_user.sh - scripts/devsetup.sh @@ -22,6 +23,7 @@ on: # yamllint disable-line rule:truthy - scripts/devinstall_software.sh - scripts/ldpretend.sh - scripts/devinstall_llvm.sh + - scripts/devinstall_torch.sh - 
scripts/devinstall_triton.sh - scripts/devcreate_user.sh - scripts/devsetup.sh diff --git a/.github/workflows/cpu-image.yml b/.github/workflows/cpu-image.yml index 92c6576..3e7cec4 100644 --- a/.github/workflows/cpu-image.yml +++ b/.github/workflows/cpu-image.yml @@ -11,6 +11,7 @@ on: # yamllint disable-line rule:truthy - scripts/devinstall_software.sh - scripts/ldpretend.sh - scripts/devinstall_llvm.sh + - scripts/devinstall_torch.sh - scripts/devinstall_triton.sh - scripts/devcreate_user.sh - scripts/devsetup.sh @@ -22,6 +23,7 @@ on: # yamllint disable-line rule:truthy - scripts/devinstall_software.sh - scripts/ldpretend.sh - scripts/devinstall_llvm.sh + - scripts/devinstall_torch.sh - scripts/devinstall_triton.sh - scripts/devcreate_user.sh - scripts/devsetup.sh diff --git a/.github/workflows/nvidia-image.yml b/.github/workflows/nvidia-image.yml index c0ef2c5..8fa2031 100644 --- a/.github/workflows/nvidia-image.yml +++ b/.github/workflows/nvidia-image.yml @@ -11,6 +11,7 @@ on: # yamllint disable-line rule:truthy - scripts/devinstall_software.sh - scripts/ldpretend.sh - scripts/devinstall_llvm.sh + - scripts/devinstall_torch.sh - scripts/devinstall_triton.sh - scripts/devcreate_user.sh - scripts/devsetup.sh @@ -22,6 +23,7 @@ on: # yamllint disable-line rule:truthy - scripts/devinstall_software.sh - scripts/ldpretend.sh - scripts/devinstall_llvm.sh + - scripts/devinstall_torch.sh - scripts/devinstall_triton.sh - scripts/devcreate_user.sh - scripts/devsetup.sh diff --git a/Makefile b/Makefile index 62c2a72..804687d 100644 --- a/Makefile +++ b/Makefile @@ -36,12 +36,14 @@ TRITON_CPU_BACKEND ?=0 TRITON_TAG ?= latest triton_path ?=$(source_dir) llvm_path ?= +torch_path ?= user_path ?= gitconfig_path ?="$(HOME)/.gitconfig" USERNAME ?=triton # NOTE: Requires host build system to have a valid Red Hat Subscription if true INSTALL_NSIGHT ?=false INSTALL_LLVM ?= skip # Options: source, skip +INSTALL_TORCH ?= skip # Options: nightly, release, source, skip, test 
INSTALL_TRITON ?= source # Options: release, source, skip INSTALL_JUPYTER ?= true USE_CCACHE ?= 0 @@ -97,6 +99,9 @@ define run_container if [ -n "$(llvm_path)" ]; then \ volume_arg+=" -v $(llvm_path):/workspace/llvm-project$(SELINUXFLAG)"; \ fi; \ + if [ -n "$(torch_path)" ]; then \ + volume_arg+=" -v $(torch_path):/workspace/torch$(SELINUXFLAG)"; \ + fi; \ if [ -n "$(user_path)" ]; then \ volume_arg+=" -v $(user_path):/workspace/user$(SELINUXFLAG)"; \ fi; \ @@ -135,7 +140,7 @@ define run_container else \ port_arg=""; \ fi; \ - env_vars="-e USERNAME=$(USER) -e USER_UID=`id -u $(USER)` -e USER_GID=`id -g $(USER)` -e TORCH_VERSION=$(torch_version) -e INSTALL_LLVM=$(INSTALL_LLVM) -e INSTALL_TOOLS=$(DEMO_TOOLS) -e INSTALL_JUPYTER=$(INSTALL_JUPYTER) -e NOTEBOOK_PORT=$(NOTEBOOK_PORT) -e INSTALL_TRITON=$(INSTALL_TRITON) -e USE_CCACHE=$(USE_CCACHE) -e MAX_JOBS=$(MAX_JOBS)"; \ + env_vars="-e USERNAME=$(USER) -e USER_UID=`id -u $(USER)` -e USER_GID=`id -g $(USER)` -e TORCH_VERSION=$(torch_version) -e INSTALL_LLVM=$(INSTALL_LLVM) -e INSTALL_TOOLS=$(DEMO_TOOLS) -e INSTALL_JUPYTER=$(INSTALL_JUPYTER) -e NOTEBOOK_PORT=$(NOTEBOOK_PORT) -e INSTALL_TORCH=$(INSTALL_TORCH) -e INSTALL_TRITON=$(INSTALL_TRITON) -e USE_CCACHE=$(USE_CCACHE) -e MAX_JOBS=$(MAX_JOBS)"; \ if [ "$(STRIPPED_CMD)" = "docker" ]; then \ $(CTR_CMD) run $$env_vars $$gpu_args $$profiling_args $$port_arg \ -ti $$volume_arg $$gitconfig_arg $(IMAGE_REPO)/$(strip $(1)):$(TRITON_TAG) bash; \ diff --git a/dockerfiles/Dockerfile.triton b/dockerfiles/Dockerfile.triton index 1fefb0c..c35fb23 100644 --- a/dockerfiles/Dockerfile.triton +++ b/dockerfiles/Dockerfile.triton @@ -52,6 +52,7 @@ ENV BASH_ENV=/workspace/bin/activate \ COPY --from=quay.io/triton-dev-containers/gosu /usr/local/bin/gosu /usr/local/bin/gosu COPY scripts/devinstall_llvm.sh /workspace/bin/devinstall_llvm +COPY scripts/devinstall_torch.sh /workspace/bin/devinstall_torch COPY scripts/devinstall_triton.sh /workspace/bin/devinstall_triton COPY 
scripts/devcreate_user.sh /workspace/bin/devcreate_user COPY scripts/devsetup.sh /workspace/bin/devsetup diff --git a/dockerfiles/Dockerfile.triton-amd b/dockerfiles/Dockerfile.triton-amd index df93a21..2ff277a 100644 --- a/dockerfiles/Dockerfile.triton-amd +++ b/dockerfiles/Dockerfile.triton-amd @@ -70,6 +70,7 @@ WORKDIR /workspace COPY --from=quay.io/triton-dev-containers/gosu /usr/local/bin/gosu /usr/local/bin/gosu COPY scripts/devinstall_llvm.sh /workspace/bin/devinstall_llvm +COPY scripts/devinstall_torch.sh /workspace/bin/devinstall_torch COPY scripts/devinstall_triton.sh /workspace/bin/devinstall_triton COPY scripts/devcreate_user.sh /workspace/bin/devcreate_user COPY scripts/devsetup.sh /workspace/bin/devsetup diff --git a/dockerfiles/Dockerfile.triton-cpu b/dockerfiles/Dockerfile.triton-cpu index 8bc7ba0..830f0a2 100644 --- a/dockerfiles/Dockerfile.triton-cpu +++ b/dockerfiles/Dockerfile.triton-cpu @@ -50,6 +50,7 @@ WORKDIR /workspace COPY --from=quay.io/triton-dev-containers/gosu /usr/local/bin/gosu /usr/local/bin/gosu COPY scripts/devinstall_llvm.sh /workspace/bin/devinstall_llvm +COPY scripts/devinstall_torch.sh /workspace/bin/devinstall_torch COPY scripts/devinstall_triton.sh /workspace/bin/devinstall_triton COPY scripts/devcreate_user.sh /workspace/bin/devcreate_user COPY scripts/devsetup.sh /workspace/bin/devsetup diff --git a/scripts/devinstall_torch.sh b/scripts/devinstall_torch.sh new file mode 100755 index 0000000..b7b209c --- /dev/null +++ b/scripts/devinstall_torch.sh @@ -0,0 +1,213 @@ +#! /bin/bash -e + +trap "echo -e '\nScript interrupted. Exiting gracefully.'; exit 1" SIGINT + +# Copyright (C) 2024-2025 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 +set -euo pipefail + +WORKSPACE=${WORKSPACE:-${HOME}} + +TORCH_DIR=${WORKSPACE}/torch +TORCH_REPO=https://github.com/pytorch/pytorch.git + +SUDO='' +if ((EUID != 0)) && command -v sudo &>/dev/null; then + SUDO="sudo" +elif ((EUID != 0)); then + echo "ERROR: $(basename "$0") requires root privileges or sudo." >&2 + exit 1 +fi + +pip_install() { + if command -v uv &>/dev/null; then + uv pip install "$@" + else + pip install "$@" + fi +} + +# Remove the dashes or periods from the CUDA version, e.g. 128 from 12-8 +get_cuda_version() { + echo "${CUDA_VERSION//[.-]/}" +} + +# Extract the major.minor version from ROCM_VERSION, e.g. 6.4 from 6.4.4 +get_rocm_version() { + [[ "$ROCM_VERSION" =~ ^([0-9]+\.[0-9]+) ]] && echo "${BASH_REMATCH[1]}" || + echo "$ROCM_VERSION" +} + +setup_src() { + echo "Downloading Torch source code and setting up the environment for building from source..." + + if [ ! -d "$TORCH_DIR" ]; then + echo "Cloning the Torch repo $TORCH_REPO to $TORCH_DIR ..." + git clone "$TORCH_REPO" "$TORCH_DIR" + if [ ! -d "$TORCH_DIR" ]; then + echo "$TORCH_DIR not found. ERROR Cloning repository..." + exit 1 + else + pushd "$TORCH_DIR" 1>/dev/null || exit 1 + git submodule sync + git submodule update --init --recursive + + if [ -n "${TORCH_GITREF:-}" ]; then + git checkout "$TORCH_GITREF" + fi + + echo "Install pre-commit hooks into your local Torch git repo (one-time)" + pip_install pre-commit + pre-commit install + popd 1>/dev/null + fi + else + echo "Torch repo already present, not cloning ..." 
+ fi +} + +install_build_deps() { + echo "Installing Torch build dependencies ..." + + pushd "$TORCH_DIR" 1>/dev/null || exit 1 + + if [ -f requirements.txt ]; then + pip_install --group dev + pip_install mkl-static mkl-include + fi + + $SUDO dnf -y install numactl-devel + + if [ -n "${ROCM_VERSION:-}" ]; then + python tools/amd_build/build_amd.py + fi + + popd 1>/dev/null +} + +install_deps() { + echo "Installing Torch dependencies ..." + pip_install numpy +} + +install_whl() { + local pip_build="$1" + + local compute_platform + local pip_torch_index_url_base + local -a pip_install_args + + pip_torch_index_url_base="https://download.pytorch.org/whl" + + case "$pip_build" in + release) ;; + nightly | test) + pip_torch_index_url_base="${pip_torch_index_url_base}/${pip_build}" + ;; + esac + + echo "Installing Torch $pip_build from PyPI ..." + + if [ -n "${PIP_TORCH_VERSION:-}" ]; then + echo "Using the specified version $PIP_TORCH_VERSION of torch" + PIP_TORCH_VERSION="==$PIP_TORCH_VERSION" + fi + + if [ -n "${PIP_TORCHVISION_VERSION:-}" ]; then + echo "Installing the specified version $PIP_TORCHVISION_VERSION of torchvision" + PIP_TORCHVISION_VERSION="==$PIP_TORCHVISION_VERSION" + fi + + if [ -n "${PIP_TORCHAUDIO_VERSION:-}" ]; then + echo "Installing the specified version $PIP_TORCHAUDIO_VERSION of torchaudio" + PIP_TORCHAUDIO_VERSION="==$PIP_TORCHAUDIO_VERSION" + fi + + declare -a TORCH_PACKAGES=( + "torch${PIP_TORCH_VERSION:-}" + "torchvision${PIP_TORCHVISION_VERSION:-}" + "torchaudio${PIP_TORCHAUDIO_VERSION:-}" + ) + + if [ -n "${PIP_TORCH_INDEX_URL:-}" ]; then + echo "Using the specified index, $PIP_TORCH_INDEX_URL" + pip_install_args+=("--index-url" "$PIP_TORCH_INDEX_URL") + elif command -v uv &>/dev/null && [ -n "${UV_TORCH_BACKEND:-}" ]; then + echo "Using the specified uv backend, $UV_TORCH_BACKEND" + pip_install_args+=("--torch-backend" "$UV_TORCH_BACKEND") + elif ! 
command -v uv &>/dev/null && [ -n "${UV_TORCH_BACKEND:-}" ]; then + echo "Error: UV_TORCH_BACKEND is set to $UV_TORCH_BACKEND but uv is not available." + exit 1 + else + # Set compute platform for torch wheel installation + if [ -n "${ROCM_VERSION:-}" ]; then + echo "Using the ROCm version $ROCM_VERSION backend" + compute_platform="rocm$(get_rocm_version)" + elif ((${TRITON_CPU_BACKEND:-0} == 1)); then + echo "Using the CPU backend" + compute_platform="cpu" + elif [ -n "${CUDA_VERSION:-}" ]; then + echo "Using the CUDA version $CUDA_VERSION backend" + compute_platform="cu$(get_cuda_version)" + fi + + if [ -n "${compute_platform:-}" ]; then + PIP_TORCH_INDEX_URL="${pip_torch_index_url_base}/${compute_platform}" + pip_install_args+=("--index-url" "$PIP_TORCH_INDEX_URL") + fi + fi + + pip_install -U --force-reinstall "${pip_install_args[@]}" "${TORCH_PACKAGES[@]}" + + # Fix up LD_LIBRARY_PATH for CUDA + ldpretend +} + +usage() { + cat >&2 </dev/null; then @@ -151,7 +138,7 @@ install_whl() { UV_TORCH_BACKEND=auto fi - PIP_INSTALL_ARGS+=("--torch-backend" "$UV_TORCH_BACKEND") + pip_install_args+=("--torch-backend" "$UV_TORCH_BACKEND") elif ! command -v uv &>/dev/null && [ -n "${UV_TORCH_BACKEND:-}" ]; then echo "Error: UV_TORCH_BACKEND is set to $UV_TORCH_BACKEND but uv is not available." 
exit 1 @@ -162,7 +149,7 @@ install_whl() { PIP_TRITON_VERSION="==$PIP_TRITON_VERSION" fi - pip_install -U --force-reinstall "${PIP_INSTALL_ARGS[@]}" "triton${PIP_TRITON_VERSION:-}" + pip_install -U --force-reinstall "${pip_install_args[@]}" "triton${PIP_TRITON_VERSION:-}" # Fix up LD_LIBRARY_PATH for CUDA ldpretend diff --git a/scripts/devsetup.sh b/scripts/devsetup.sh index 1ccf4a8..809c3ad 100755 --- a/scripts/devsetup.sh +++ b/scripts/devsetup.sh @@ -25,6 +25,7 @@ declare -a SAVE_VARS=( "INSTALL_JUPYTER" "INSTALL_LLVM" "INSTALL_TOOLS" + "INSTALL_TORCH" "INSTALL_TRITON" "MAX_JOBS" "PIP_TRITON_VERSION" @@ -67,3 +68,7 @@ fi if [ "${INSTALL_LLVM:-skip}" != "skip" ]; then run_as_user devinstall_llvm "$INSTALL_LLVM" fi + +if [ "${INSTALL_TORCH:-skip}" != "skip" ]; then + run_as_user devinstall_torch "$INSTALL_TORCH" +fi From c9aa1c0dc9982ea194b2035cc6bde062914d1888 Mon Sep 17 00:00:00 2001 From: Craig Magina Date: Wed, 8 Apr 2026 13:03:37 -0400 Subject: [PATCH 2/6] Fix for lack of sudo bailing out even for pip install only runs Signed-off-by: Craig Magina --- scripts/devinstall_torch.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/devinstall_torch.sh b/scripts/devinstall_torch.sh index b7b209c..e633a74 100755 --- a/scripts/devinstall_torch.sh +++ b/scripts/devinstall_torch.sh @@ -27,9 +27,6 @@ TORCH_REPO=https://github.com/pytorch/pytorch.git SUDO='' if ((EUID != 0)) && command -v sudo &>/dev/null; then SUDO="sudo" -elif ((EUID != 0)); then - echo "ERROR: $(basename "$0") requires root privileges or sudo." >&2 - exit 1 fi pip_install() { @@ -89,7 +86,12 @@ install_build_deps() { pip_install mkl-static mkl-include fi - $SUDO dnf -y install numactl-devel + if ((EUID == 0)) || [ -n "${SUDO:-}" ]; then + $SUDO dnf -y install numactl-devel + else + echo "ERROR: Can't install some build deps without root or sudo permissions." 
>&2 + exit 1 + fi if [ -n "${ROCM_VERSION:-}" ]; then python tools/amd_build/build_amd.py From a879dc2317365398598bea33503038c5604a9d4d Mon Sep 17 00:00:00 2001 From: Craig Magina Date: Wed, 8 Apr 2026 13:07:42 -0400 Subject: [PATCH 3/6] Re-order git submodule update Signed-off-by: Craig Magina --- scripts/devinstall_torch.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/devinstall_torch.sh b/scripts/devinstall_torch.sh index e633a74..1c6485e 100755 --- a/scripts/devinstall_torch.sh +++ b/scripts/devinstall_torch.sh @@ -59,13 +59,14 @@ setup_src() { exit 1 else pushd "$TORCH_DIR" 1>/dev/null || exit 1 - git submodule sync - git submodule update --init --recursive if [ -n "${TORCH_GITREF:-}" ]; then git checkout "$TORCH_GITREF" fi + git submodule sync + git submodule update --init --recursive + echo "Install pre-commit hooks into your local Torch git repo (one-time)" pip_install pre-commit pre-commit install From 6672bc2c8a65af65da0d929246d4f1be375f1ed3 Mon Sep 17 00:00:00 2001 From: Craig Magina Date: Wed, 8 Apr 2026 14:20:42 -0400 Subject: [PATCH 4/6] Added default pip index url for nightly and test with no compute platform Signed-off-by: Craig Magina --- scripts/devinstall_torch.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/devinstall_torch.sh b/scripts/devinstall_torch.sh index 1c6485e..9b9251e 100755 --- a/scripts/devinstall_torch.sh +++ b/scripts/devinstall_torch.sh @@ -170,6 +170,9 @@ install_whl() { if [ -n "${compute_platform:-}" ]; then PIP_TORCH_INDEX_URL="${pip_torch_index_url_base}/${compute_platform}" pip_install_args+=("--index-url" "$PIP_TORCH_INDEX_URL") + else + PIP_TORCH_INDEX_URL="${pip_torch_index_url_base}" + pip_install_args+=("--index-url" "$PIP_TORCH_INDEX_URL") fi fi From 5602f7a766093e279c919524d5d816d1f030097d Mon Sep 17 00:00:00 2001 From: Craig Magina Date: Wed, 8 Apr 2026 14:23:11 -0400 Subject: [PATCH 5/6] Handle INSTALL_TORCH=skip as triton dep install Signed-off-by: Craig Magina --- 
scripts/devinstall_triton.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/devinstall_triton.sh b/scripts/devinstall_triton.sh index 66ec43c..dd3cb5e 100755 --- a/scripts/devinstall_triton.sh +++ b/scripts/devinstall_triton.sh @@ -106,7 +106,7 @@ install_deps() { numpy pandas pybind11 pytest pyyaml scipy tabulate wheel if [ "${INSTALL_TORCH:-}" != "source" ]; then - if [ -n "${INSTALL_TORCH:-}" ]; then + if [ -n "${INSTALL_TORCH:-}" ] && [ "${INSTALL_TORCH}" != "skip" ]; then echo "Installing Torch $INSTALL_TORCH as a dependency ..." devinstall_torch "${INSTALL_TORCH}" else From b88845fd39c074ef90e626b2bc3c42999554b119 Mon Sep 17 00:00:00 2001 From: Craig Magina Date: Wed, 8 Apr 2026 14:26:08 -0400 Subject: [PATCH 6/6] Added new torch environment variables to SAVE_VARS Signed-off-by: Craig Magina --- scripts/devsetup.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/devsetup.sh b/scripts/devsetup.sh index 809c3ad..c6a9fd8 100755 --- a/scripts/devsetup.sh +++ b/scripts/devsetup.sh @@ -28,9 +28,14 @@ declare -a SAVE_VARS=( "INSTALL_TORCH" "INSTALL_TRITON" "MAX_JOBS" + "PIP_TORCH_VERSION" + "PIP_TORCHVISION_VERSION" + "PIP_TORCHAUDIO_VERSION" + "PIP_TORCH_INDEX_URL" "PIP_TRITON_VERSION" "ROCM_VERSION" "ROCR_VISIBLE_DEVICES" + "TORCH_GITREF" "TORCH_VERSION" "TRITON_CPU_BACKEND" "TRITON_GITREF"