diff --git a/.github/workflows/amd-image.yml b/.github/workflows/amd-image.yml index a9bc99d..99d222f 100644 --- a/.github/workflows/amd-image.yml +++ b/.github/workflows/amd-image.yml @@ -11,6 +11,7 @@ on: # yamllint disable-line rule:truthy - scripts/devinstall_software.sh - scripts/ldpretend.sh - scripts/devinstall_llvm.sh + - scripts/devinstall_torch.sh - scripts/devinstall_triton.sh - scripts/devcreate_user.sh - scripts/devsetup.sh @@ -22,6 +23,7 @@ on: # yamllint disable-line rule:truthy - scripts/devinstall_software.sh - scripts/ldpretend.sh - scripts/devinstall_llvm.sh + - scripts/devinstall_torch.sh - scripts/devinstall_triton.sh - scripts/devcreate_user.sh - scripts/devsetup.sh diff --git a/.github/workflows/cpu-image.yml b/.github/workflows/cpu-image.yml index 92c6576..3e7cec4 100644 --- a/.github/workflows/cpu-image.yml +++ b/.github/workflows/cpu-image.yml @@ -11,6 +11,7 @@ on: # yamllint disable-line rule:truthy - scripts/devinstall_software.sh - scripts/ldpretend.sh - scripts/devinstall_llvm.sh + - scripts/devinstall_torch.sh - scripts/devinstall_triton.sh - scripts/devcreate_user.sh - scripts/devsetup.sh @@ -22,6 +23,7 @@ on: # yamllint disable-line rule:truthy - scripts/devinstall_software.sh - scripts/ldpretend.sh - scripts/devinstall_llvm.sh + - scripts/devinstall_torch.sh - scripts/devinstall_triton.sh - scripts/devcreate_user.sh - scripts/devsetup.sh diff --git a/.github/workflows/nvidia-image.yml b/.github/workflows/nvidia-image.yml index c0ef2c5..8fa2031 100644 --- a/.github/workflows/nvidia-image.yml +++ b/.github/workflows/nvidia-image.yml @@ -11,6 +11,7 @@ on: # yamllint disable-line rule:truthy - scripts/devinstall_software.sh - scripts/ldpretend.sh - scripts/devinstall_llvm.sh + - scripts/devinstall_torch.sh - scripts/devinstall_triton.sh - scripts/devcreate_user.sh - scripts/devsetup.sh @@ -22,6 +23,7 @@ on: # yamllint disable-line rule:truthy - scripts/devinstall_software.sh - scripts/ldpretend.sh - scripts/devinstall_llvm.sh + - scripts/devinstall_torch.sh - scripts/devinstall_triton.sh - scripts/devcreate_user.sh - scripts/devsetup.sh diff --git a/Makefile b/Makefile index 62c2a72..804687d 100644 --- a/Makefile +++ b/Makefile @@ -36,12 +36,14 @@ TRITON_CPU_BACKEND ?=0 TRITON_TAG ?= latest triton_path ?=$(source_dir) llvm_path ?= +torch_path ?= user_path ?= gitconfig_path ?="$(HOME)/.gitconfig" USERNAME ?=triton # NOTE: Requires host build system to have a valid Red Hat Subscription if true INSTALL_NSIGHT ?=false INSTALL_LLVM ?= skip # Options: source, skip +INSTALL_TORCH ?= skip # Options: nightly, release, source, skip, test INSTALL_TRITON ?= source # Options: release, source, skip INSTALL_JUPYTER ?= true USE_CCACHE ?= 0 @@ -97,6 +99,9 @@ define run_container if [ -n "$(llvm_path)" ]; then \ volume_arg+=" -v $(llvm_path):/workspace/llvm-project$(SELINUXFLAG)"; \ fi; \ + if [ -n "$(torch_path)" ]; then \ + volume_arg+=" -v $(torch_path):/workspace/torch$(SELINUXFLAG)"; \ + fi; \ if [ -n "$(user_path)" ]; then \ volume_arg+=" -v $(user_path):/workspace/user$(SELINUXFLAG)"; \ fi; \ @@ -135,7 +140,7 @@ define run_container else \ port_arg=""; \ fi; \ - env_vars="-e USERNAME=$(USER) -e USER_UID=`id -u $(USER)` -e USER_GID=`id -g $(USER)` -e TORCH_VERSION=$(torch_version) -e INSTALL_LLVM=$(INSTALL_LLVM) -e INSTALL_TOOLS=$(DEMO_TOOLS) -e INSTALL_JUPYTER=$(INSTALL_JUPYTER) -e NOTEBOOK_PORT=$(NOTEBOOK_PORT) -e INSTALL_TRITON=$(INSTALL_TRITON) -e USE_CCACHE=$(USE_CCACHE) -e MAX_JOBS=$(MAX_JOBS)"; \ + env_vars="-e USERNAME=$(USER) -e USER_UID=`id -u $(USER)` -e USER_GID=`id -g $(USER)` -e TORCH_VERSION=$(torch_version) -e INSTALL_LLVM=$(INSTALL_LLVM) -e INSTALL_TOOLS=$(DEMO_TOOLS) -e INSTALL_JUPYTER=$(INSTALL_JUPYTER) -e NOTEBOOK_PORT=$(NOTEBOOK_PORT) -e INSTALL_TORCH=$(INSTALL_TORCH) -e INSTALL_TRITON=$(INSTALL_TRITON) -e USE_CCACHE=$(USE_CCACHE) -e MAX_JOBS=$(MAX_JOBS)"; \ if [ "$(STRIPPED_CMD)" = "docker" ]; then \ $(CTR_CMD) run $$env_vars $$gpu_args $$profiling_args $$port_arg \ -ti $$volume_arg $$gitconfig_arg $(IMAGE_REPO)/$(strip $(1)):$(TRITON_TAG) bash; \ diff --git a/dockerfiles/Dockerfile.triton b/dockerfiles/Dockerfile.triton index 1fefb0c..c35fb23 100644 --- a/dockerfiles/Dockerfile.triton +++ b/dockerfiles/Dockerfile.triton @@ -52,6 +52,7 @@ ENV BASH_ENV=/workspace/bin/activate \ COPY --from=quay.io/triton-dev-containers/gosu /usr/local/bin/gosu /usr/local/bin/gosu COPY scripts/devinstall_llvm.sh /workspace/bin/devinstall_llvm +COPY scripts/devinstall_torch.sh /workspace/bin/devinstall_torch COPY scripts/devinstall_triton.sh /workspace/bin/devinstall_triton COPY scripts/devcreate_user.sh /workspace/bin/devcreate_user COPY scripts/devsetup.sh /workspace/bin/devsetup diff --git a/dockerfiles/Dockerfile.triton-amd b/dockerfiles/Dockerfile.triton-amd index df93a21..2ff277a 100644 --- a/dockerfiles/Dockerfile.triton-amd +++ b/dockerfiles/Dockerfile.triton-amd @@ -70,6 +70,7 @@ WORKDIR /workspace COPY --from=quay.io/triton-dev-containers/gosu /usr/local/bin/gosu /usr/local/bin/gosu COPY scripts/devinstall_llvm.sh /workspace/bin/devinstall_llvm +COPY scripts/devinstall_torch.sh /workspace/bin/devinstall_torch COPY scripts/devinstall_triton.sh /workspace/bin/devinstall_triton COPY scripts/devcreate_user.sh /workspace/bin/devcreate_user COPY scripts/devsetup.sh /workspace/bin/devsetup diff --git a/dockerfiles/Dockerfile.triton-cpu b/dockerfiles/Dockerfile.triton-cpu index 8bc7ba0..830f0a2 100644 --- a/dockerfiles/Dockerfile.triton-cpu +++ b/dockerfiles/Dockerfile.triton-cpu @@ -50,6 +50,7 @@ WORKDIR /workspace COPY --from=quay.io/triton-dev-containers/gosu /usr/local/bin/gosu /usr/local/bin/gosu COPY scripts/devinstall_llvm.sh /workspace/bin/devinstall_llvm +COPY scripts/devinstall_torch.sh /workspace/bin/devinstall_torch COPY scripts/devinstall_triton.sh /workspace/bin/devinstall_triton COPY scripts/devcreate_user.sh /workspace/bin/devcreate_user COPY scripts/devsetup.sh /workspace/bin/devsetup diff --git a/scripts/devinstall_torch.sh b/scripts/devinstall_torch.sh new file mode 100755 index 0000000..9b9251e --- /dev/null +++ b/scripts/devinstall_torch.sh @@ -0,0 +1,219 @@ +#! /bin/bash -e + +trap "echo -e '\nScript interrupted. Exiting gracefully.'; exit 1" SIGINT + +# Copyright (C) 2024-2025 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 +set -euo pipefail + +WORKSPACE=${WORKSPACE:-${HOME}} + +TORCH_DIR=${WORKSPACE}/torch +TORCH_REPO=https://github.com/pytorch/pytorch.git + +SUDO='' +if ((EUID != 0)) && command -v sudo &>/dev/null; then + SUDO="sudo" +fi + +pip_install() { + if command -v uv &>/dev/null; then + uv pip install "$@" + else + pip install "$@" + fi +} + +# Remove the dashes or periods from the CUDA version, e.g. 128 from 12-8 +get_cuda_version() { + echo "${CUDA_VERSION//[.-]/}" +} + +# Extract the major.minor version from ROCM_VERSION, e.g. 6.4 from 6.4.4 +get_rocm_version() { + [[ "$ROCM_VERSION" =~ ^([0-9]+\.[0-9]+) ]] && echo "${BASH_REMATCH[1]}" || + echo "$ROCM_VERSION" +} + +setup_src() { + echo "Downloading Torch source code and setting up the environment for building from source..." + + if [ ! -d "$TORCH_DIR" ]; then + echo "Cloning the Torch repo $TORCH_REPO to $TORCH_DIR ..." + git clone "$TORCH_REPO" "$TORCH_DIR" + if [ ! -d "$TORCH_DIR" ]; then + echo "$TORCH_DIR not found. ERROR Cloning repository..." + exit 1 + else + pushd "$TORCH_DIR" 1>/dev/null || exit 1 + + if [ -n "${TORCH_GITREF:-}" ]; then + git checkout "$TORCH_GITREF" + fi + + git submodule sync + git submodule update --init --recursive + + echo "Install pre-commit hooks into your local Torch git repo (one-time)" + pip_install pre-commit + pre-commit install + popd 1>/dev/null + fi + else + echo "Torch repo already present, not cloning ..." + fi +} + +install_build_deps() { + echo "Installing Torch build dependencies ..." + + pushd "$TORCH_DIR" 1>/dev/null || exit 1 + + if [ -f requirements.txt ]; then + pip_install --group dev + pip_install mkl-static mkl-include + fi + + if ((EUID == 0)) || [ -n "${SUDO:-}" ]; then + $SUDO dnf -y install numactl-devel + else + echo "ERROR: Can't install some build deps without root or sudo permissions." >&2 + exit 1 + fi + + if [ -n "${ROCM_VERSION:-}" ]; then + python tools/amd_build/build_amd.py + fi + + popd 1>/dev/null +} + +install_deps() { + echo "Installing Torch dependencies ..." + pip_install numpy +} + +install_whl() { + local pip_build="$1" + + local compute_platform + local pip_torch_index_url_base + local -a pip_install_args + + pip_torch_index_url_base="https://download.pytorch.org/whl" + + case "$pip_build" in + release) ;; + nightly | test) + pip_torch_index_url_base="${pip_torch_index_url_base}/${pip_build}" + ;; + esac + + echo "Installing Torch $pip_build from PyPI ..." + + if [ -n "${PIP_TORCH_VERSION:-}" ]; then + echo "Using the specified version $PIP_TORCH_VERSION of torch" + PIP_TORCH_VERSION="==$PIP_TORCH_VERSION" + fi + + if [ -n "${PIP_TORCHVISION_VERSION:-}" ]; then + echo "Installing the specified version $PIP_TORCHVISION_VERSION of torchvision" + PIP_TORCHVISION_VERSION="==$PIP_TORCHVISION_VERSION" + fi + + if [ -n "${PIP_TORCHAUDIO_VERSION:-}" ]; then + echo "Installing the specified version $PIP_TORCHAUDIO_VERSION of torchaudio" + PIP_TORCHAUDIO_VERSION="==$PIP_TORCHAUDIO_VERSION" + fi + + declare -a TORCH_PACKAGES=( + "torch${PIP_TORCH_VERSION:-}" + "torchvision${PIP_TORCHVISION_VERSION:-}" + "torchaudio${PIP_TORCHAUDIO_VERSION:-}" + ) + + if [ -n "${PIP_TORCH_INDEX_URL:-}" ]; then + echo "Using the specified index, $PIP_TORCH_INDEX_URL" + pip_install_args+=("--index-url" "$PIP_TORCH_INDEX_URL") + elif command -v uv &>/dev/null && [ -n "${UV_TORCH_BACKEND:-}" ]; then + echo "Using the specified uv backend, $UV_TORCH_BACKEND" + pip_install_args+=("--torch-backend" "$UV_TORCH_BACKEND") + elif ! command -v uv &>/dev/null && [ -n "${UV_TORCH_BACKEND:-}" ]; then + echo "Error: UV_TORCH_BACKEND is set to $UV_TORCH_BACKEND but uv is not available." + exit 1 + else + # Set compute platform for torch wheel installation + if [ -n "${ROCM_VERSION:-}" ]; then + echo "Using the ROCm version $ROCM_VERSION backend" + compute_platform="rocm$(get_rocm_version)" + elif ((${TRITON_CPU_BACKEND:-0} == 1)); then + echo "Using the CPU backend" + compute_platform="cpu" + elif [ -n "${CUDA_VERSION:-}" ]; then + echo "Using the CUDA version $CUDA_VERSION backend" + compute_platform="cu$(get_cuda_version)" + fi + + if [ -n "${compute_platform:-}" ]; then + PIP_TORCH_INDEX_URL="${pip_torch_index_url_base}/${compute_platform}" + pip_install_args+=("--index-url" "$PIP_TORCH_INDEX_URL") + else + PIP_TORCH_INDEX_URL="${pip_torch_index_url_base}" + pip_install_args+=("--index-url" "$PIP_TORCH_INDEX_URL") + fi + fi + + pip_install -U --force-reinstall "${pip_install_args[@]}" "${TORCH_PACKAGES[@]}" + + # Fix up LD_LIBRARY_PATH for CUDA + ldpretend +} + +usage() { + cat >&2 </dev/null; then @@ -151,7 +138,7 @@ install_whl() { UV_TORCH_BACKEND=auto fi - PIP_INSTALL_ARGS+=("--torch-backend" "$UV_TORCH_BACKEND") + pip_install_args+=("--torch-backend" "$UV_TORCH_BACKEND") elif ! command -v uv &>/dev/null && [ -n "${UV_TORCH_BACKEND:-}" ]; then echo "Error: UV_TORCH_BACKEND is set to $UV_TORCH_BACKEND but uv is not available." exit 1 @@ -162,7 +149,7 @@ install_whl() { PIP_TRITON_VERSION="==$PIP_TRITON_VERSION" fi - pip_install -U --force-reinstall "${PIP_INSTALL_ARGS[@]}" "triton${PIP_TRITON_VERSION:-}" + pip_install -U --force-reinstall "${pip_install_args[@]}" "triton${PIP_TRITON_VERSION:-}" # Fix up LD_LIBRARY_PATH for CUDA ldpretend diff --git a/scripts/devsetup.sh b/scripts/devsetup.sh index 1ccf4a8..c6a9fd8 100755 --- a/scripts/devsetup.sh +++ b/scripts/devsetup.sh @@ -25,11 +25,17 @@ declare -a SAVE_VARS=( "INSTALL_JUPYTER" "INSTALL_LLVM" "INSTALL_TOOLS" + "INSTALL_TORCH" "INSTALL_TRITON" "MAX_JOBS" + "PIP_TORCH_VERSION" + "PIP_TORCHVISION_VERSION" + "PIP_TORCHAUDIO_VERSION" + "PIP_TORCH_INDEX_URL" "PIP_TRITON_VERSION" "ROCM_VERSION" "ROCR_VISIBLE_DEVICES" + "TORCH_GITREF" "TORCH_VERSION" "TRITON_CPU_BACKEND" "TRITON_GITREF" @@ -67,3 +73,7 @@ fi if [ "${INSTALL_LLVM:-skip}" != "skip" ]; then run_as_user devinstall_llvm "$INSTALL_LLVM" fi + +if [ "${INSTALL_TORCH:-skip}" != "skip" ]; then + run_as_user devinstall_torch "$INSTALL_TORCH" +fi