diff --git a/.github/ci-trigger-20250814-1 b/.github/ci-trigger-20250814-1
new file mode 100644
index 000000000000..8ca993aa58b2
--- /dev/null
+++ b/.github/ci-trigger-20250814-1
@@ -0,0 +1 @@
+trigger: sync_with_upstream
diff --git a/.github/workflows/sync_with_upstream.yml b/.github/workflows/sync_with_upstream.yml
new file mode 100644
index 000000000000..df1048a43833
--- /dev/null
+++ b/.github/workflows/sync_with_upstream.yml
@@ -0,0 +1,80 @@
+name: Sync with Upstream
+
+on:
+  schedule:
+    - cron: '0 0 * * *'  # Runs daily at midnight UTC
+  push:
+    branches:
+      - main
+
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Git
+        run: |
+          git config --global user.name 'Zhuul'
+          git config --global user.email '40538530+Zhuul@users.noreply.github.com'
+
+      - name: Add upstream remote
+        run: git remote add upstream https://github.com/vllm-project/vllm.git
+
+      - name: Fetch upstream changes
+        run: git fetch upstream
+
+      - name: Merge upstream changes
+        id: merge
+        run: |
+          git checkout main
+          git merge upstream/main || {
+            echo "Merge conflict detected. Creating a new branch for manual resolution."
+            git checkout -b "merge-conflict-$(date +%Y%m%d%H%M%S)"
+            git push origin HEAD
+            echo "conflict=true" >> "$GITHUB_OUTPUT"
+            exit 1
+          }
+          echo "conflict=false" >> "$GITHUB_OUTPUT"
+
+      - name: Check for workflow file changes
+        id: workflow_change
+        run: |
+          if git diff --name-only upstream/main | grep -q '^\.github/workflows/'; then
+            echo "workflow_changed=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "workflow_changed=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Set up PAT authentication
+        env:
+          GH_PAT: ${{ secrets.GH_PAT }}
+        run: |
+          git remote set-url origin "https://Zhuul:${GH_PAT}@github.com/Zhuul/vllm.git"
+
+      - name: Push changes if no workflow files changed
+        if: steps.workflow_change.outputs.workflow_changed == 'false' && steps.merge.outputs.conflict == 'false'
+        run: git push origin main
+
+      - name: Create Pull Request for workflow file changes
+        if: steps.workflow_change.outputs.workflow_changed == 'true' && steps.merge.outputs.conflict == 'false'
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.GH_PAT }}
+          commit-message: "Sync with upstream: update workflow files"
+          title: "Sync with upstream: update workflow files"
+          body: |
+            This PR was automatically created because workflow files were updated while syncing with upstream.
+            Please review and merge.
+          branch: workflow-sync-${{ github.run_id }}
+          base: main
+
+      # failure() is required here: the merge step exits non-zero on conflict, so a
+      # plain expression (implicit success()) would never let this step run.
+      - name: Send notification if merge conflict
+        if: failure() && steps.merge.outputs.conflict == 'true'
+        run: |
+          echo "Merge conflict detected. Manual intervention required."
+          # Add your notification logic here (e.g., send an email, create an issue, etc.)
diff --git a/.gitignore b/.gitignore
index 465935d488f8..a5bd3740e844 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,4 +209,5 @@ shellcheck*/
 csrc/moe/marlin_moe_wna16/kernel_*
 
 # Ignore ep_kernels_workspace folder
-ep_kernels_workspace/
\ No newline at end of file
+ep_kernels_workspace/node_modules/
+package*.json
diff --git a/docs/contributing/README.md b/docs/contributing/README.md
index 5a2a70d57e85..1c2a31cf895c 100644
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@@ -31,6 +31,8 @@ See .
 
 Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source][build-from-source] documentation for details.
 
+For a containerized developer workflow, see the Podman-first development guide at `docs/contributing/podman-dev.md`.
+
 For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
 
 ### Building the docs with MkDocs
diff --git a/docs/contributing/podman-dev.md b/docs/contributing/podman-dev.md
new file mode 100644
index 000000000000..881e495f8421
--- /dev/null
+++ b/docs/contributing/podman-dev.md
@@ -0,0 +1,41 @@
+---
+title: Podman-first Development Environment
+---
+
+This guide documents the Podman-first development workflow for building vLLM from source with CUDA and PyTorch nightly.
+
+## Primary entrypoint
+
+- Windows (PowerShell): `./extras/podman/run.ps1`
+- Linux/macOS (bash): `extras/podman/run.sh`
+
+Legacy launchers at `extras/run-vllm-dev.ps1` and `extras/run-vllm-dev.sh` are deprecated and forward to the Podman wrappers.
+
+## Prerequisites
+
+- Podman with GPU CDI enabled (on Windows, use Podman Desktop + WSL; ensure NVIDIA drivers and CUDA are installed on the host).
+- Optionally, a named volume for build/work space, e.g. `vllm-work`.
+
+## Quick start
+
+Windows (PowerShell):
+
+```powershell
+./extras/podman/run.ps1 -Build
+./extras/podman/run.ps1 -GPUCheck
+./extras/podman/run.ps1 -Setup -WorkVolume vllm-work -Progress
+```
+
+Linux/macOS (bash):
+
+```bash
+extras/podman/run.sh --build
+extras/podman/run.sh --gpu-check
+extras/podman/run.sh --setup --work-volume vllm-work --progress
+```
+
+## Notes
+
+- The image defaults to CUDA 13.0 on a Rocky Linux 9 base (`ubi9` is also supported; see `extras/configs/build.env`) and installs the matching PyTorch nightly wheels first so the newest GPU architectures (including sm_120 when present) are covered.
+- The setup step performs an editable vLLM install without downgrading torch-family packages.
+- Use a named Podman volume for `/opt/work` to avoid `/tmp` tmpfs pressure and to speed up rebuilds.
diff --git a/extras/.dockerignore b/extras/.dockerignore
new file mode 100644
index 000000000000..60a8d81a82c1
--- /dev/null
+++ b/extras/.dockerignore
@@ -0,0 +1,39 @@
+# Reduce build context to avoid Windows Podman tar write issues
+.git
+.github
+.vscode
+.venv
+venv
+node_modules
+build
+dist
+csrc/
+vllm/
+benchmarks/
+docs/
+examples/
+tests/
+**/__pycache__
+**/*.pyc
+**/*.pyo
+**/*.pyd
+**/*.so
+**/*.o
+**/*.a
+**/*.dll
+**/*.dylib
+extras/build.log
+extras/*.bak
+extras/tools/
+extras/run-vllm-dev-*.ps1
+extras/run-vllm-dev-*.sh
+extras/*wsl*
+extras/*docker*.ps1
+
+!extras/Dockerfile
+!extras/run-vllm-dev.ps1
+!extras/run-vllm-dev.sh
+!extras/dev-setup.sh
+# Keep files required by COPY steps in extras/Dockerfile
+!requirements/
+!pyproject.toml
+!setup.py
diff --git a/extras/Dockerfile b/extras/Dockerfile
new file mode 100644
index 000000000000..052da8390c6d
--- /dev/null
+++ b/extras/Dockerfile
@@ -0,0 +1,259 @@
+# vLLM Development Container with GPU Support
+# Uses vLLM's own requirements for automatic dependency management
+
+# Build-time args to control CUDA/OS base and PyTorch nightly index
+ARG CUDA_VERSION=13.0.0
+ARG UBI_VERSION=9
+ARG TORCH_CUDA_INDEX=cu130
+# Base flavor for CUDA image: e.g. 'rockylinux9' (default) or 'ubi9'
+ARG BASE_FLAVOR=rockylinux9
+
+# Switchable base: defaults to Rocky Linux to avoid subscription-gated repos
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-${BASE_FLAVOR}
+
+# Set CUDA environment variables for build tools
+ENV CUDA_HOME=/usr/local/cuda
+ENV CUDA_ROOT=/usr/local/cuda
+ENV PATH=$CUDA_HOME/bin:$PATH
+ENV LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+ENV CUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME
+ENV CUDNN_LIBRARY_PATH=/usr/lib64
+ENV CUDNN_INCLUDE_PATH=/usr/include
+
+# Install system packages with additional CUDA development libraries
+RUN dnf update -y && dnf install --allowerasing -y \
+    python3 python3-pip python3-devel \
+    git gcc gcc-c++ cmake \
+    make patch which findutils tar rsync \
+    wget curl vim nano pkgconfig \
+    zlib-devel bzip2 bzip2-devel xz xz-devel libffi-devel \
+    openssl-devel sqlite-devel \
+    && (dnf install -y readline-devel || true) \
+    && dnf clean all
+
+# Prefer Python 3.12 from packages if available (fallback to system python3)
+RUN dnf install -y python3.12 python3.12-devel || true
+
+# Ensure /usr/bin/python exists for compatibility
+RUN ln -sf $(command -v python3) /usr/bin/python || true
+
+# Create a non-root user for development
+RUN useradd -m -s /bin/bash vllmuser && \
+    echo "vllmuser ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
+
+# Install essential system tools
+RUN dnf install -y hostname iproute iputils
+
+ARG REQUIRE_FFMPEG=1
+# Multimedia and image libs with optional ffmpeg-devel enforcement.
+# Install EPEL and RPM Fusion repos for EL (9/10) and pull ffmpeg/ffmpeg-devel from there.
+# When REQUIRE_FFMPEG=1, fail the build if ffmpeg is still unavailable.
+RUN set -euxo pipefail \
+    && (dnf install -y dnf-plugins-core || true) \
+    && (dnf config-manager --set-enabled crb || true) \
+    && (dnf makecache -y || true) \
+    && . /etc/os-release \
+    && ELVER="${VERSION_ID%%.*}" \
+    && echo "[Dockerfile] Detected Enterprise Linux major version: ${ELVER}" \
+    && dnf install -y \
+        libjpeg-turbo-devel libpng-devel zlib-devel freetype-devel \
+        libsndfile libsndfile-devel sox sox-devel || true \
+    && if [ "${REQUIRE_FFMPEG}" = "1" ]; then \
+        echo "[Dockerfile] Enabling EPEL and RPM Fusion for ffmpeg (EL${ELVER})"; \
+        dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-${ELVER}.noarch.rpm; \
+        dnf install -y https://download1.rpmfusion.org/free/el/rpmfusion-free-release-${ELVER}.noarch.rpm; \
+        dnf install -y https://download1.rpmfusion.org/nonfree/el/rpmfusion-nonfree-release-${ELVER}.noarch.rpm; \
+        dnf makecache -y; \
+        dnf install -y ffmpeg ffmpeg-devel; \
+        command -v ffmpeg >/dev/null 2>&1; \
+    else \
+        # Best-effort install when not enforced
+        (dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-${ELVER}.noarch.rpm || true); \
+        (dnf install -y https://download1.rpmfusion.org/free/el/rpmfusion-free-release-${ELVER}.noarch.rpm || true); \
+        (dnf install -y https://download1.rpmfusion.org/nonfree/el/rpmfusion-nonfree-release-${ELVER}.noarch.rpm || true); \
+        (dnf makecache -y || true); \
+        (dnf install -y ffmpeg ffmpeg-devel || true); \
+    fi \
+    && (dnf install -y --enablerepo=crb ninja-build || \
+        dnf install -y --enablerepo=crb ninja || \
+        dnf install -y ninja-build || \
+        dnf install -y ninja || true) \
+    && dnf clean all || true
+
+# Install NCCL runtime/devel from the CUDA repository available in the base image.
+# Needed so the PyTorch nightly wheels find a matching NCCL (avoids ncclCommWindowRegister symbol errors).
+RUN set -euxo pipefail \
+    && dnf makecache -y \
+    && (dnf install -y libnccl libnccl-devel || dnf install -y libnccl-2 libnccl-devel-2) \
+    && dnf clean all
+
+# Set working directory and adjust ownership
+WORKDIR /workspace
+RUN chown -R vllmuser:vllmuser /workspace
+
+# Create build directories with proper permissions
+RUN mkdir -p /workspace/.deps && chown -R vllmuser:vllmuser /workspace/.deps && \
+    mkdir -p /tmp/vllm-build && chmod 777 /tmp/vllm-build && \
+    mkdir -p /opt/work && chmod 777 /opt/work && \
+    mkdir -p /home/vllmuser/.cache && chown -R vllmuser:vllmuser /home/vllmuser/.cache && \
+    mkdir -p /home/vllmuser/.ccache && chown -R vllmuser:vllmuser /home/vllmuser/.ccache && \
+    mkdir -p /home/vllmuser/.cmake && chown -R vllmuser:vllmuser /home/vllmuser/.cmake && \
+    chmod -R 755 /workspace && \
+    chmod -R 777 /tmp
+
+# Switch to the non-root user
+USER vllmuser
+
+# Create and activate a virtual environment using the best available Python (3.12 preferred)
+ENV VIRTUAL_ENV=/home/vllmuser/venv
+RUN PY_BIN="$(command -v python3.12 || command -v python3)" && "$PY_BIN" -m venv $VIRTUAL_ENV
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# Set pip configuration
+ENV PIP_DISABLE_PIP_VERSION_CHECK=1
+ENV PIP_NO_CACHE_DIR=1
+ENV PYTHONUNBUFFERED=1
+ENV PIP_DEFAULT_TIMEOUT=120
+ENV PIP_RETRIES=5
+ENV PIP_PREFER_BINARY=1
+
+# CUDA arch list including legacy + latest so builds cover both older and newest GPUs.
+# Can be overridden at build time with: --build-arg TORCH_CUDA_ARCH_LIST="..."
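+# Example (illustrative): trim the list to just Ada (8.9) and Hopper (9.0) for a
+# much faster local build:
+#   podman build -f extras/Dockerfile --build-arg TORCH_CUDA_ARCH_LIST="8.9 9.0" -t vllm-dev:latest .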
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0 12.0 13.0"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"
+
+# Upgrade pip and setuptools to current versions
+# (quote the specifier so the shell does not treat '>' as a redirection)
+RUN pip install --upgrade pip "setuptools>=61" wheel
+
+COPY requirements/ /tmp/requirements/
+
+# Install PyTorch nightly first (includes latest GPU arch support such as Blackwell sm_120 when present)
+ARG TORCH_CUDA_INDEX
+RUN pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA_INDEX}
+
+RUN pip install --pre torchvision --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA_INDEX}
+RUN pip install --pre torchaudio --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA_INDEX}
+
+# Install PyAV for torchvision video I/O (read_video) compatibility
+RUN pip install --upgrade av
+
+# Install TorchCodec to support torchaudio.load on recent nightlies
+RUN set -euxo pipefail \
+    && (pip install --pre torchcodec \
+        || pip install torchcodec \
+        || pip install --no-deps 'git+https://github.com/pytorch/torchcodec@main')
+
+# Install modern build tools and vLLM's build dependencies and CUDA deps early,
+# but sanitize requirements to avoid downgrading torch-family or forcing xformers pins.
+COPY pyproject.toml /tmp/pyproject.toml
+RUN set -euxo pipefail \
+    && cd /tmp \
+    && pip install "setuptools>=61" "setuptools-scm>=8" build wheel ninja cmake \
+    && mkdir -p /tmp/requirements_sanitized \
+    && for f in build.txt cuda.txt common.txt; do \
+        if [ -f "/tmp/requirements/$f" ]; then \
+            sed -E '/^(torch|torchvision|torchaudio|xformers)\b/Id' "/tmp/requirements/$f" > "/tmp/requirements_sanitized/$f"; \
+        fi; \
+    done \
+    && pip install --pre \
+        -r /tmp/requirements_sanitized/build.txt \
+        -r /tmp/requirements_sanitized/cuda.txt \
+        -r /tmp/requirements_sanitized/common.txt \
+    && pip install --pre --upgrade \
+        torch --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA_INDEX}
+
+# Install minimal development extras
+RUN pip install pytest pytest-asyncio ipython
+
+# Note: vLLM will be installed from source in development mode via dev-setup.sh
+# This ensures compatibility with the PyTorch nightly build
+
+# Create activation script for easy virtual environment access
+RUN echo '#!/bin/bash' > /home/vllmuser/activate_venv.sh && \
+    echo 'source /home/vllmuser/venv/bin/activate' >> /home/vllmuser/activate_venv.sh && \
+    echo 'echo "Virtual environment activated: $VIRTUAL_ENV"' >> /home/vllmuser/activate_venv.sh && \
+    echo 'echo "Python version: $(python --version)"' >> /home/vllmuser/activate_venv.sh && \
+    echo 'echo "PyTorch version: $(python -c \"import torch; print(torch.__version__)\")"' >> /home/vllmuser/activate_venv.sh && \
+    echo 'echo "CUDA available: $(python -c \"import torch; print(torch.cuda.is_available())\")"' >> /home/vllmuser/activate_venv.sh && \
+    chmod +x /home/vllmuser/activate_venv.sh
+
+# Ensure virtual environment is activated in .bashrc
+RUN echo 'source /home/vllmuser/venv/bin/activate' >> /home/vllmuser/.bashrc && \
+    echo 'echo "🐍 Python virtual environment activated"' >> /home/vllmuser/.bashrc && \
+    echo 'echo "🚀 Ready for vLLM development!"' >> /home/vllmuser/.bashrc
+
+# Create development helper script that uses current workspace requirements
+RUN echo '#!/bin/bash' > /home/vllmuser/setup_vllm_dev.sh && \
+    echo 'echo "🔧 Setting up vLLM for development..."' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo 'cd /workspace' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo '# Use temporary build directory to avoid permission issues' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo 'export TMPDIR=/tmp/vllm-build' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo 'mkdir -p "$TMPDIR" && chmod 777 "$TMPDIR"' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo 'export CMAKE_BUILD_PARALLEL_LEVEL=4' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo 'export VLLM_INSTALL_PUNICA_KERNELS=0' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo 'export MAX_JOBS=4' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo '# Install current workspace requirements first' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo 'if [ -f requirements/common.txt ]; then pip install -r requirements/common.txt; fi' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo '# Use temporary directory for CMake build files' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo 'FETCHCONTENT_BASE_DIR="$TMPDIR/deps" pip install -e . --no-deps --no-build-isolation --verbose' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo 'echo "✅ vLLM installed in editable mode!"' >> /home/vllmuser/setup_vllm_dev.sh && \
+    echo 'python -c "import vllm; print(\"vLLM version:\", vllm.__version__)"' >> /home/vllmuser/setup_vllm_dev.sh && \
+    chmod +x /home/vllmuser/setup_vllm_dev.sh
+
+# Provide a helper to apply repo patches against the mounted /workspace
+# Create under /usr/local/bin as root, then switch back to non-root user
+USER root
+RUN printf '%s\n' \
+    '#!/usr/bin/env bash' \
+    'set -euo pipefail' \
+    'cd /workspace 2>/dev/null || exit 0' \
+    'SCRIPT=./extras/patches/apply_patches.sh' \
+    'if [ -f "$SCRIPT" ]; then' \
+    '  echo "[apply-patches] Running $SCRIPT"' \
+    '  # Copy to temp and normalize EOL to avoid permission errors on mounted FS' \
+    '  TMP_SCRIPT=$(mktemp /tmp/apply_patches.XXXXXX.sh)' \
+    '  tr -d '\''\r'\'' < "$SCRIPT" > "$TMP_SCRIPT" || cp "$SCRIPT" "$TMP_SCRIPT"' \
+    '  chmod +x "$TMP_SCRIPT"' \
+    '  bash "$TMP_SCRIPT" || {' \
+    '    echo "[apply-patches] Warning: patch apply failed (continuing)" >&2; exit 0; }' \
+    'fi' \
+    > /usr/local/bin/apply-vllm-patches && \
+    chmod +x /usr/local/bin/apply-vllm-patches
+USER vllmuser
+
+# Add environment variables for better CUDA memory management and build optimization.
+# Use the new variable name to avoid deprecation warnings.
+ENV PYTORCH_ALLOC_CONF=expandable_segments:True
+# Do not pin a single GPU here; let runtime inject device selection
+# ENV CUDA_VISIBLE_DEVICES=0
+ENV CMAKE_BUILD_PARALLEL_LEVEL=4
+ENV VLLM_INSTALL_PUNICA_KERNELS=0
+ENV MAX_JOBS=4
+
+# Enable ccache for faster rebuilds
+ENV CCACHE_DIR=/home/vllmuser/.ccache
+ENV CCACHE_MAXSIZE=10G
+ENV PATH=/usr/lib64/ccache:$PATH
+
+# (TORCH_CUDA_ARCH_LIST defined earlier)
+# Do not force-disable Machete; allow upstream defaults. Users may still pass -e CMAKE_ARGS for custom CMake settings.
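+# Example (illustrative): pass a standard CMake flag at container start, e.g. for a
+# debug-friendly kernel build:
+#   podman run -e CMAKE_ARGS="-DCMAKE_BUILD_TYPE=RelWithDebInfo" ... vllm-dev:latest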
+ENV CMAKE_ARGS=""
+
+# WSL2-specific CUDA environment configuration
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ENV LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/lib/wsl/drivers:/usr/lib/wsl/lib:/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH
+
+# Add runtime library detection script
+RUN echo '#!/bin/bash' > /home/vllmuser/check_cuda_libs.sh && \
+    echo 'echo "=== CUDA Library Check ==="' >> /home/vllmuser/check_cuda_libs.sh && \
+    echo 'echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"' >> /home/vllmuser/check_cuda_libs.sh && \
+    echo 'echo "Searching for CUDA libraries..."' >> /home/vllmuser/check_cuda_libs.sh && \
+    echo 'find /usr/lib/wsl -name "libcuda.so*" 2>/dev/null | head -3 || echo "No WSL CUDA libs"' >> /home/vllmuser/check_cuda_libs.sh && \
+    echo 'ldconfig -p | grep cuda | head -3 || echo "No CUDA in ldconfig"' >> /home/vllmuser/check_cuda_libs.sh && \
+    echo 'echo "PyTorch CUDA status:"' >> /home/vllmuser/check_cuda_libs.sh && \
+    echo 'python -c "import torch; print(f\"CUDA available: {torch.cuda.is_available()}\"); print(f\"Device count: {torch.cuda.device_count()}\")" 2>/dev/null || echo "PyTorch not available"' >> /home/vllmuser/check_cuda_libs.sh && \
+    chmod +x /home/vllmuser/check_cuda_libs.sh
diff --git a/extras/README.md b/extras/README.md
new file mode 100644
index 000000000000..b8e8576084fa
--- /dev/null
+++ b/extras/README.md
@@ -0,0 +1,50 @@
+# extras/ overview
+
+This directory hosts all non-core assets: container/build tooling, configs, testing, storage helpers, and optional patches. The goals are clarity, single responsibility, and easy extension without touching the vLLM core.
+
+Suggested layout (implemented here):
+
+- podman/ — Podman-specific build/launch wrappers and helpers
+- configs/ — Centralized, declarative versions and build configuration
+- secrets/ — Gitignored area for local tokens/config (not committed)
+- testing/ — Test/benchmark harness, matrices, and results
+- storage/ — External volumes and cache management helpers
+- patches/ — Optional patch/plug-in mechanism for controlled tweaks
+
+Primary entrypoint: use `extras/podman/` as the canonical way to build and run the dev container.
+
+Deprecation: the legacy launchers `extras/run-vllm-dev.sh` and `extras/run-vllm-dev.ps1` are deprecated and now forward to the Podman wrappers. Please switch to `extras/podman/run.sh` (Linux/macOS) or `extras/podman/run.ps1` (Windows).
+
+## Quick start
+
+- Edit `extras/configs/build.env` to set CUDA/UBI/Python defaults.
+- Use `extras/podman/build.sh` to build images with those defaults.
+- Use `extras/podman/run.ps1` (Windows) or `extras/podman/run.sh` (Linux/macOS) to run the dev container.
+
+Examples:
+
+- Windows (PowerShell):
+    - Build image: `./extras/podman/run.ps1 -Build`
+    - GPU check: `./extras/podman/run.ps1 -GPUCheck`
+    - Setup build: `./extras/podman/run.ps1 -Setup -WorkVolume vllm-work -Progress`
+
+- Linux/macOS (bash):
+    - Build image: `extras/podman/run.sh --build`
+    - GPU check: `extras/podman/run.sh --gpu-check`
+    - Setup build: `extras/podman/run.sh --setup --work-volume vllm-work --progress`
+
+## Secrets
+
+Place tokens in `extras/secrets/` per its README and never commit them. Load them in your session or bind-mount them into containers.
+
+## Testing
+
+See `extras/testing/README.md` for defining a matrix, recording results, and comparing runs.
+
+## Storage
+
+See `extras/storage/README.md` for model/cache volume guidance for performance and reproducibility.
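+
+For example, the named work volume used in the quick start can be created once and reused; `vllm-work` is just the default name from `extras/configs/build.env`:
+
+```bash
+podman volume create vllm-work
+podman volume inspect vllm-work --format '{{.Mountpoint}}'
+```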
+
+## Patches
+
+If you need to tweak upstream vLLM without forking, use `extras/patches/` to stage diffs and apply them during build.
diff --git a/extras/configs/README.md b/extras/configs/README.md
new file mode 100644
index 000000000000..98ef0f02f786
--- /dev/null
+++ b/extras/configs/README.md
@@ -0,0 +1,9 @@
+# configs README
+
+This folder centralizes editable configuration for images/builds:
+
+- build.env: Bash-exported defaults (CUDA/UBI/Python/vLLM tag, arch list, volumes)
+- build.yaml (optional): YAML equivalent for tools that prefer structured configs
+- versions.json (optional): Machine-friendly manifest for automation
+
+Consumers (scripts/Containerfiles) should read values from here and allow runtime overrides via environment variables.
diff --git a/extras/configs/build.env b/extras/configs/build.env
new file mode 100644
index 000000000000..42e5f71b11c6
--- /dev/null
+++ b/extras/configs/build.env
@@ -0,0 +1,35 @@
+# Build configuration
+#
+# Scripts should source this file to obtain default versions.
+# Values can be overridden by environment variables provided at runtime.
+
+# CUDA / UBI / Python baselines
+export CUDA_VERSION=${CUDA_VERSION:-13.0.0}
+export UBI_VERSION=${UBI_VERSION:-9}
+export PYTHON_VERSION=${PYTHON_VERSION:-3.12}
+export BASE_FLAVOR=${BASE_FLAVOR:-rockylinux9}
+
+# vLLM branch/tag to use inside the container when cloning or referring
+export VLLM_TAG=${VLLM_TAG:-main}
+
+# Architectures (space separated) for PyTorch/NVCC.
+# Blackwell sm_120 is covered by the 12.0 entry; trim the list to speed up builds.
+export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST:-"7.0 7.5 8.0 8.6 8.9 9.0 12.0 13.0"}
+
+# Named volume for build scratch/work dir (Podman recommended)
+export VLLM_WORK_VOLUME=${VLLM_WORK_VOLUME:-vllm-work}
+export VLLM_WORK_DIR_CONTAINER=${VLLM_WORK_DIR_CONTAINER:-/opt/work}
+
+# Image naming
+export VLLM_BASE_IMAGE=${VLLM_BASE_IMAGE:-"nvidia/cuda:${CUDA_VERSION}-cudnn-devel-${BASE_FLAVOR}"}
+export VLLM_IMAGE_TAG=${VLLM_IMAGE_TAG:-"vllm-cuda${CUDA_VERSION}-ubi${UBI_VERSION}"}
+
+# Torch family components: wheels only (nightly index). No source build fallbacks.
+export INSTALL_TORCHVISION=${INSTALL_TORCHVISION:-1}
+export INSTALL_TORCHAUDIO=${INSTALL_TORCHAUDIO:-1}
+
+# FFMPEG optional enforcement for torchaudio features.
+# Set to 1 to enable RPM Fusion repos and install ffmpeg/ffmpeg-devel; the build will fail if unavailable.
+# Set to 0 to attempt a best-effort install and fall back to building torchaudio without FFMPEG when headers are missing.
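+# Example (illustrative): disable enforcement for a single build without editing this file:
+#   podman build -f extras/Dockerfile --build-arg REQUIRE_FFMPEG=0 -t vllm-dev:latest .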
+#export REQUIRE_FFMPEG=${REQUIRE_FFMPEG:-0}
+export REQUIRE_FFMPEG=${REQUIRE_FFMPEG:-1}
\ No newline at end of file
diff --git a/extras/configs/build.yaml b/extras/configs/build.yaml
new file mode 100644
index 000000000000..277737dd92df
--- /dev/null
+++ b/extras/configs/build.yaml
@@ -0,0 +1,11 @@
+cuda:
+  version: "13.0.0"
+  tag: "latest"
+ubi:
+  version: "9"
+  tag: "latest"
+python:
+  version: "3.12"
+  tag: "latest"
+vllm:
+  tag: main
diff --git a/extras/patches/0001-cumem-alloc-env-fallback.diff b/extras/patches/0001-cumem-alloc-env-fallback.diff
new file mode 100644
index 000000000000..c2a322024961
--- /dev/null
+++ b/extras/patches/0001-cumem-alloc-env-fallback.diff
@@ -0,0 +1,14 @@
+diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
+--- a/vllm/device_allocator/cumem.py
++++ b/vllm/device_allocator/cumem.py
+@@ -140,7 +140,9 @@ class CuMemAllocator:
+         return CuMemAllocator.instance
+ 
+     def __init__(self):
+-        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
++        # Prefer new env var; fall back to deprecated one for compatibility
++        conf = os.environ.get("PYTORCH_ALLOC_CONF",
++                              os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""))
+         assert "expandable_segments:True" not in conf, \
+             ("Expandable segments are not compatible with memory pool. "
+              "Please track https://github.com/pytorch/pytorch/issues/147851 "
diff --git a/extras/patches/README.md b/extras/patches/README.md
new file mode 100644
index 000000000000..ff4f662c4588
--- /dev/null
+++ b/extras/patches/README.md
@@ -0,0 +1,5 @@
+# Patches and plugins scaffolding
+
+- Place unified diffs (*.diff) here.
+- Use `apply_patches.sh` to apply them before building.
+- Optionally, add Python plugins under `plugin/` and load dynamically at runtime.
diff --git a/extras/patches/apply_patches.sh b/extras/patches/apply_patches.sh
new file mode 100644
index 000000000000..2c4ca43d45c8
--- /dev/null
+++ b/extras/patches/apply_patches.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# If CRLF detected, re-exec a normalized temp copy to avoid editing mounted files
+if grep -q $'\r' "$0" 2>/dev/null; then
+    TMP_SELF=$(mktemp /tmp/apply_patches_self.XXXXXX.sh)
+    tr -d '\r' < "$0" > "$TMP_SELF" || cp "$0" "$TMP_SELF"
+    chmod +x "$TMP_SELF"
+    exec "$TMP_SELF" "$@"
+fi
+
+PATCH_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+ROOT_DIR=$(cd -- "${PATCH_DIR}/../.." &>/dev/null && pwd)
+
+shopt -s nullglob
+PATCHES=(${PATCH_DIR}/*.diff)
+shopt -u nullglob
+
+if [ ${#PATCHES[@]} -eq 0 ]; then
+    echo "[patches] No patches found; nothing to apply."
+    exit 0
+fi
+
+pushd "${ROOT_DIR}" >/dev/null
+for p in "${PATCHES[@]}"; do
+    echo "[patches] Applying ${p}"
+    # Validate the patch header (git format-patch or plain git diff) and normalize EOL to a temp file
+    if ! head -n 1 "$p" | grep -Eq '^(From |diff --git)'; then
+        echo "[patches] Warning: ${p} does not look like a git patch; trying anyway" >&2
+    fi
+    TMP_PATCH=$(mktemp /tmp/patch.XXXXXX.diff)
+    tr -d '\r' < "$p" > "$TMP_PATCH" || cp "$p" "$TMP_PATCH"
+    if ! git apply --check "$TMP_PATCH" 2>/dev/null; then
+        echo "[patches] Check failed for ${p}"
+        # Fallback: targeted edit for cumem allocator env var change
+        case "$(basename "$p")" in
+            0001-cumem-alloc-env-fallback.diff)
+                echo "[patches] Attempting fallback edit for cumem allocator"
+                # Run inside an if-condition so set -e does not abort before we can report failure
+                if ! python - <<'PY'
+import io, os, sys
+PATH = os.path.join('vllm', 'device_allocator', 'cumem.py')
+try:
+    with io.open(PATH, 'r', encoding='utf-8', newline='') as f:
+        src = f.read()
+except FileNotFoundError:
+    sys.exit(1)
+
+target = 'conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")'
+if 'PYTORCH_ALLOC_CONF' in src:
+    print('[patches] cumem already uses PYTORCH_ALLOC_CONF; skipping')
+    sys.exit(0)
+
+if target in src:
+    prefix = src.split(target)[0].split('\n')[-1]
+    indent = ' ' * (len(prefix) - len(prefix.lstrip(' ')))
+    replacement = (
+        f"{indent}# Prefer new env var; fall back to deprecated one for compatibility\n"
+        f"{indent}conf = os.environ.get(\"PYTORCH_ALLOC_CONF\",\n"
+        f"{indent}                      os.environ.get(\"PYTORCH_CUDA_ALLOC_CONF\", \"\"))"
+    )
+    new_src = src.replace(target, replacement)
+    with io.open(PATH, 'w', encoding='utf-8', newline='\n') as f:
+        f.write(new_src)
+    print('[patches] Applied cumem allocator fallback edit')
+    sys.exit(0)
+else:
+    print('[patches] Could not find target line in cumem.py; no changes made')
+    sys.exit(1)
+PY
+                then
+                    echo "[patches] Fallback edit failed" >&2; exit 1
+                fi
+                ;;
+            *)
+                exit 1
+                ;;
+        esac
+    else
+        git apply "$TMP_PATCH"
+    fi
+done
+popd >/dev/null
+
+echo "[patches] Done."
diff --git a/extras/podman/Containerfile b/extras/podman/Containerfile
new file mode 100644
index 000000000000..d42bef4b344e
--- /dev/null
+++ b/extras/podman/Containerfile
@@ -0,0 +1,11 @@
+# syntax=docker/dockerfile:1.7-labs
+
+# Delegator Containerfile.
+# Build using the canonical Dockerfile in extras/ to avoid duplication.
+
+FROM scratch AS noop
+
+# Usage:
+#   podman build -f extras/Dockerfile -t vllm-dev:latest .
+# or from this folder (the wrapper script does this for you):
+#   bash build.sh
diff --git a/extras/podman/README.md b/extras/podman/README.md
new file mode 100644
index 000000000000..fb0c361203f2
--- /dev/null
+++ b/extras/podman/README.md
@@ -0,0 +1,12 @@
+# Podman helpers for vLLM
+
+This folder contains Podman-specific wrappers. They preserve back-compat by calling the existing scripts in `extras/` when present.
+
+- Containerfile: Thin wrapper that defers to `extras/Dockerfile` by default.
+- build.sh: Builds the image using values from `../configs/build.env`.
+- entrypoint/: Optional entrypoint scripts used inside containers.
+- scripts/: Utility helpers for Podman machine/GPU/volumes.
+
+See `extras/README.md` for usage.
+
+Documentation: see `docs/contributing/podman-dev.md` for the Podman-first workflow and deprecation notes for legacy launchers.
diff --git a/extras/podman/build.sh b/extras/podman/build.sh
new file mode 100644
index 000000000000..a4ec5f445825
--- /dev/null
+++ b/extras/podman/build.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Why: Back-compat wrapper that sources central config and builds using the canonical Dockerfile.
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+ROOT_DIR=$(cd -- "${SCRIPT_DIR}/../.." &>/dev/null && pwd)
+CONFIG_DIR="${SCRIPT_DIR}/../configs"
+
+# shellcheck source=../configs/build.env
+if [ -f "${CONFIG_DIR}/build.env" ]; then
+    # shellcheck disable=SC1091
+    source "${CONFIG_DIR}/build.env"
+fi
+
+# Fallbacks mirror the defaults in extras/configs/build.env
+CUDA_VERSION=${CUDA_VERSION:-13.0.0}
+UBI_VERSION=${UBI_VERSION:-9}
+BASE_FLAVOR=${BASE_FLAVOR:-rockylinux9}
+VLLM_IMAGE_TAG=${VLLM_IMAGE_TAG:-"vllm-cuda${CUDA_VERSION}-ubi${UBI_VERSION}"}
+
+CONTEXT="${ROOT_DIR}"
+DOCKERFILE_REL="extras/Dockerfile"
+
+echo "[podman/build] Building image ${VLLM_IMAGE_TAG} with CUDA=${CUDA_VERSION}, UBI=${UBI_VERSION}, BASE=${BASE_FLAVOR}"
+
+podman build \
+    --build-arg CUDA_VERSION="${CUDA_VERSION}" \
+    --build-arg UBI_VERSION="${UBI_VERSION}" \
+    --build-arg BASE_FLAVOR="${BASE_FLAVOR}" \
+    -t "${VLLM_IMAGE_TAG}" \
+    -f "${DOCKERFILE_REL}" \
+    "${CONTEXT}"
+
+echo "[podman/build] Done -> ${VLLM_IMAGE_TAG}"
diff --git a/extras/podman/dev-setup.sh b/extras/podman/dev-setup.sh
new file mode 100644
index 000000000000..153d03b90710
--- /dev/null
+++ b/extras/podman/dev-setup.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Robust setup entrypoint: prefer extras/dev-setup.sh,
+# otherwise use the image-provided /home/vllmuser/setup_vllm_dev.sh.
+set -euo pipefail
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+EXTRAS_DIR=$(cd -- "${SCRIPT_DIR}/.." &>/dev/null && pwd)
+
+# 1) Current canonical path
+if [[ -f "${EXTRAS_DIR}/dev-setup.sh" ]]; then
+    chmod +x "${EXTRAS_DIR}/dev-setup.sh" 2>/dev/null || true
+    exec "${EXTRAS_DIR}/dev-setup.sh" "$@"
+fi
+
+# 2) Fallback to the image-provided helper
+if [[ -f /home/vllmuser/setup_vllm_dev.sh ]]; then
+    exec /home/vllmuser/setup_vllm_dev.sh "$@"
+fi
+
+echo "[setup] No setup script found at extras/dev-setup.sh, and no image helper present." >&2
+exit 1
diff --git a/extras/podman/entrypoint/apply-patches-then-exec.sh b/extras/podman/entrypoint/apply-patches-then-exec.sh
new file mode 100644
index 000000000000..30196ad5e695
--- /dev/null
+++ b/extras/podman/entrypoint/apply-patches-then-exec.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Apply repo patches if available; best-effort, EOL normalization is handled inside the helper.
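+# Typical wiring (see extras/podman/run.ps1, which sets this script as the container
+# entrypoint):
+#   podman run --entrypoint /workspace/extras/podman/entrypoint/apply-patches-then-exec.sh \
+#     vllm-dev:latest bash -lc '<command>'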
+if command -v apply-vllm-patches >/dev/null 2>&1; then
+    apply-vllm-patches || true
+fi
+
+exec "$@"
diff --git a/extras/podman/run.ps1 b/extras/podman/run.ps1
new file mode 100644
index 000000000000..7aee56fe9bc4
--- /dev/null
+++ b/extras/podman/run.ps1
@@ -0,0 +1,257 @@
+#!/usr/bin/env pwsh
+[CmdletBinding()] param(
+    [switch]$Build,
+    [switch]$Interactive,
+    [string]$Command = "",
+    [switch]$Setup,
+    [switch]$GPUCheck,
+    [switch]$Mirror,
+    [switch]$Recreate,
+    [string]$WorkVolume = "",
+    [string]$WorkDirHost = "",
+    [switch]$Progress,
+    [switch]$NoCache,
+    [switch]$Pull,
+    [switch]$Help
+)
+
+if ($Help) {
+    Write-Host "Usage: extras/podman/run.ps1 [options]"
+    Write-Host "  -Build        Build the dev image (reads extras/configs/build.env)"
+    Write-Host "  -Interactive  Start an interactive shell"
+    Write-Host "  -Command      Run a command inside the dev container"
+    Write-Host "  -Setup        Run project setup inside the container"
+    Write-Host "  -GPUCheck     Run a CUDA/Torch sanity check"
+    Write-Host "  -Mirror       Use local mirror registries if configured"
+    Write-Host "  -Recreate     Recreate the container if running"
+    Write-Host "  -WorkVolume   Named volume to mount at /opt/work"
+    Write-Host "  -WorkDirHost  Host dir to mount at /opt/work"
+    Write-Host "  -Progress     Show progress bars in setup"
+    Write-Host "  -NoCache      Build image without using cache"
+    Write-Host "  -Pull         Always attempt to pull newer base image"
+    return
+}
+
+if (-not $Interactive -and [string]::IsNullOrEmpty($Command) -and -not $GPUCheck -and -not $Setup) { $Interactive = $true }
+
+if (-not (Get-Command podman -ErrorAction SilentlyContinue)) { Write-Host "❌ Podman not found in PATH" -ForegroundColor Red; exit 1 }
+
+$ContainerName = "vllm-dev"
+$ImageTag = "vllm-dev:latest"
+$SourceDir = (Get-Location).Path
+
+Write-Host "🐋 vLLM Dev Container (Podman)" -ForegroundColor Green
+
+if ($Build) {
+    Write-Host "🔨 Building image (honoring extras/configs/build.env)..." -ForegroundColor Yellow
+    $configPath = Join-Path $SourceDir "extras/configs/build.env"
+    $dockerfilePath = Join-Path $SourceDir "extras/Dockerfile"
+    $requireFfmpegArg = '1'
+    function Get-DockerArgDefault([string]$name, [string]$fallback) {
+        if (Test-Path $dockerfilePath) {
+            $df = Get-Content -Raw -Path $dockerfilePath
+            $m = [regex]::Match($df, "(?m)^\s*ARG\s+${name}\s*=\s*([^\r\n]+)")
+            if ($m.Success) {
+                return $m.Groups[1].Value.Trim()
+            }
+        }
+        return $fallback
+    }
+    # Seed from the Dockerfile's ARG defaults so a missing build.env still yields usable values
+    $cudaVer = Get-DockerArgDefault 'CUDA_VERSION' '13.0.0'
+    $baseFlavor = Get-DockerArgDefault 'BASE_FLAVOR' 'rockylinux9'
+    $archList = Get-DockerArgDefault 'TORCH_CUDA_ARCH_LIST' '7.0 7.5 8.0 8.6 8.9 9.0 12.0 13.0'
+    if (Test-Path $configPath) {
+        $cfg = Get-Content -Raw -Path $configPath
+        function Get-EnvDefault([string]$name, [string]$fallback) {
+            # Match a line like: export NAME=VALUE
+            $line = [regex]::Match($cfg, "(?m)^\s*export\s+${name}\s*=\s*([^\r\n]+)")
+            if (-not $line.Success) { return $fallback }
+            $val = $line.Groups[1].Value.Trim()
+            # Strip wrapping quotes if present
+            if (($val.StartsWith('"') -and $val.EndsWith('"')) -or ($val.StartsWith("'") -and $val.EndsWith("'"))) { $val = $val.Substring(1, $val.Length-2) }
+            # If value is Bash-style ${NAME:-default}, extract the default
+            if ($val.StartsWith('${') -and $val.Contains(':-')) {
+                $idx = $val.IndexOf(':-'); $end = $val.IndexOf('}', $idx)
+                if ($idx -ge 0 -and $end -gt $idx) {
+                    $def = $val.Substring($idx+2, $end-($idx+2)).Trim()
+                    if (($def.StartsWith('"') -and $def.EndsWith('"')) -or ($def.StartsWith("'") -and $def.EndsWith("'"))) { $def = $def.Substring(1, $def.Length-2) }
+                    return $def
+                }
+            }
+            return $val
+        }
+        $cudaVer = Get-EnvDefault -name 'CUDA_VERSION' -fallback $cudaVer
+        $baseFlavor = Get-EnvDefault -name 'BASE_FLAVOR' -fallback $baseFlavor
+        $archList = Get-EnvDefault -name 'TORCH_CUDA_ARCH_LIST' -fallback $archList
+        # Torchvision/torchaudio are wheel-only installs now, so no extra refs are read here
+        $requireFfmpeg = Get-EnvDefault -name 'REQUIRE_FFMPEG' -fallback '1'
+        if ($requireFfmpeg -match '^[01]$') { $requireFfmpegArg = $requireFfmpeg } else { $requireFfmpegArg = '1' }
+    }
+    # Derive the PyTorch nightly index from the CUDA version (e.g., 13.0 -> cu130, 12.9 -> cu129)
+    $torchCudaIndex = if ($cudaVer -match '^13\.') { 'cu130' } elseif ($cudaVer -match '^12\.9') { 'cu129' } else {
+        $parts = $cudaVer.Split('.')
+        if ($parts.Length -ge 2) { 'cu' + $parts[0] + $parts[1] } else { 'cu129' }
+    }
+    Write-Host ("Config: CUDA={0} BASE_FLAVOR={1} TORCH_CUDA_INDEX={2} ARCH_LIST=({3})" -f $cudaVer,$baseFlavor,$torchCudaIndex,$archList) -ForegroundColor DarkGray
+    $buildCmd = @("build","-f","extras/Dockerfile",
+        "--build-arg","CUDA_VERSION=$cudaVer",
+        "--build-arg","BASE_FLAVOR=$baseFlavor",
+        "--build-arg","TORCH_CUDA_INDEX=$torchCudaIndex",
+        "--build-arg","TORCH_CUDA_ARCH_LIST=$archList",
+        "--build-arg","REQUIRE_FFMPEG=$requireFfmpegArg",
+        "-t",$ImageTag,".")
+    # Use cache by default; add --no-cache only when requested
+    if ($NoCache) { $buildCmd = @($buildCmd[0],"--no-cache") + $buildCmd[1..($buildCmd.Length-1)] }
+    if ($Pull) { $buildCmd = @($buildCmd[0],"--pull=always") + $buildCmd[1..($buildCmd.Length-1)] }
+    & podman @buildCmd
+    if ($LASTEXITCODE -ne 0) { Write-Host "❌ Build failed" -ForegroundColor Red; exit 1 }
+    Write-Host "✅ Build ok" -ForegroundColor Green
+}
+
+# Already running?
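+# podman ps (without -a) lists running containers only, so a non-empty exact match
+# below means we exec into the existing container instead of starting a new one.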
+$running = podman ps --filter "name=$ContainerName" --format "{{.Names}}" 2>$null
+
+if ($Recreate -and $running -eq $ContainerName) {
+    Write-Host "♻️ Removing existing container '$ContainerName'" -ForegroundColor Yellow
+    podman rm -f $ContainerName | Out-Null
+    $running = $null
+}
+
+if ($running -eq $ContainerName) {
+    if ($GPUCheck) {
+        Write-Host "🔍 GPU check (existing container)" -ForegroundColor Yellow
+        $cmd = @'
+source /home/vllmuser/venv/bin/activate && python - <<'PY'
+import torch, os
+print("PyTorch:", getattr(torch,"__version__","n/a"))
+print("CUDA:", torch.cuda.is_available())
+print("Devices:", torch.cuda.device_count() if torch.cuda.is_available() else 0)
+print("LD_LIBRARY_PATH:", os.environ.get("LD_LIBRARY_PATH"))
+if torch.cuda.is_available():
+    try:
+        print("GPU 0:", torch.cuda.get_device_name(0))
+    except Exception as e:
+        print("GPU name error:", e)
+PY
+nvidia-smi || true
+'@
+        $cmd = "export NVIDIA_VISIBLE_DEVICES=all; " + $cmd
+        podman exec $ContainerName bash -lc $cmd
+        exit $LASTEXITCODE
+    }
+    if ($Setup) {
+        Write-Host "🔧 Running dev setup in existing container" -ForegroundColor Yellow
+        $envs = @()
+        if ($Mirror) { $envs += @('LOCAL_MIRROR=1') }
+        if ($Progress) { $envs += @('PROGRESS_WATCH=1') }
+        $envs += @('NVIDIA_VISIBLE_DEVICES=all')
+        $envStr = ($envs | ForEach-Object { "export $_;" }) -join ' '
+        $cmd = "$envStr chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh"
+        if ($Progress) { podman exec -it $ContainerName bash -lc $cmd } else { podman exec $ContainerName bash -lc $cmd }
+        exit $LASTEXITCODE
+    }
+    if ($Command) {
+        Write-Host "🚀 Running command in existing container" -ForegroundColor Green
+        $runCmd = "source /home/vllmuser/venv/bin/activate && $Command"
+        podman exec $ContainerName bash -c $runCmd
+        exit $LASTEXITCODE
+    }
+    $resp = Read-Host "Attach to running container? [Y/n]"
+    if ($resp -eq "" -or $resp -match '^[Yy]$') { podman exec -it $ContainerName bash; exit $LASTEXITCODE } else { exit 0 }
+}
+
+# Ensure image exists
+podman image exists $ImageTag
+if ($LASTEXITCODE -ne 0) { Write-Host "❌ Image missing. Use -Build." -ForegroundColor Red; exit 1 }
+
+# Base args (no default /tmp tmpfs; can be enabled via VLLM_TMPFS_TMP_SIZE)
+$runArgs = @("run","--rm","--security-opt=label=disable","--shm-size","8g","-v","${SourceDir}:/workspace:Z")
+# Request the GPU via CDI (flags may appear in any order before the image name)
+$runArgs += @("--device=nvidia.com/gpu=all")
+if (-not [string]::IsNullOrWhiteSpace($WorkVolume)) { $runArgs += @('-v',"${WorkVolume}:/opt/work:Z") }
+elseif ($WorkDirHost -and (Test-Path $WorkDirHost)) { $runArgs += @('-v',"${WorkDirHost}:/opt/work:Z") }
+$runArgs += @('-w','/workspace','--name',"$ContainerName",'--user','vllmuser','--env','ENGINE=podman')
+# Use a tiny entrypoint to apply patches before executing the requested command
+$runArgs += @('--entrypoint','/workspace/extras/podman/entrypoint/apply-patches-then-exec.sh')
+
+$tmpfsSize = [Environment]::GetEnvironmentVariable('VLLM_TMPFS_TMP_SIZE')
+if (-not [string]::IsNullOrEmpty($tmpfsSize) -and $tmpfsSize -ne '0') { $runArgs += @('--tmpfs',"/tmp:size=$tmpfsSize") }
+
+# WSL GPU: map /dev/dxg and mount WSL libs
+$runArgs += @('--device','/dev/dxg','-v','/usr/lib/wsl:/usr/lib/wsl:ro')
+if ($Mirror) { $runArgs += @('--env','LOCAL_MIRROR=1') }
+foreach ($ev in 'NVIDIA_VISIBLE_DEVICES','NVIDIA_DRIVER_CAPABILITIES','NVIDIA_REQUIRE_CUDA') {
+    $val = [Environment]::GetEnvironmentVariable($ev)
+    if ($val) { $runArgs += @('--env',"$ev=$val") }
+}
+$runArgs += @('--env','ENGINE=podman','--env','NVIDIA_VISIBLE_DEVICES=all','--env','NVIDIA_DRIVER_CAPABILITIES=compute,utility','--env','NVIDIA_REQUIRE_CUDA=')
+
+if ($GPUCheck) {
+    $pyDiag = @'
+import json, torch, os
+out = {
+    "torch_version": getattr(torch, "__version__", "n/a"),
+    "torch_cuda_version": getattr(getattr(torch, "version", None), "cuda", "n/a"),
+    "cuda_available": torch.cuda.is_available(),
+    "ld_library_path": os.environ.get("LD_LIBRARY_PATH"),
+}
+try:
+    out["device_count"] = torch.cuda.device_count()
+except Exception as e:
+    out["device_count_error"] = str(e)
+if out["cuda_available"] and out.get("device_count", 0) > 0:
+    try:
+        cap = torch.cuda.get_device_capability(0)
+        out["device_0"] = {"name": torch.cuda.get_device_name(0), "capability": f"sm_{cap[0]}{cap[1]}"}
+    except Exception as e:
+        out["device_0_error"] = str(e)
+else:
+    out["diagnostics"] = ["Missing /dev/nvidia* or podman machine without GPU passthrough"]
+print(json.dumps(out, indent=2))
+'@
+    $pyB64 = [Convert]::ToBase64String([Text.Encoding]::UTF8.GetBytes($pyDiag))
+    $gpuScript = @'
+echo '=== GPU Check ==='
+which nvidia-smi && nvidia-smi || echo 'nvidia-smi unavailable'
+echo '--- /dev/nvidia* ---'
+ls -l /dev/nvidia* 2>/dev/null || echo 'no /dev/nvidia* nodes'
+echo '--- Environment (NVIDIA_*) ---'
+env | grep -E '^NVIDIA_' || echo 'no NVIDIA_* env vars'
+if [ "$NVIDIA_VISIBLE_DEVICES" = "void" ]; then echo 'WARN: NVIDIA_VISIBLE_DEVICES=void (no GPU mapped)'; fi
+echo '--- LD_LIBRARY_PATH ---'
+echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
+source /home/vllmuser/venv/bin/activate 2>/dev/null || true
+echo __PY_B64__ | base64 -d > /tmp/gpucheck.py
+python /tmp/gpucheck.py || true
+rm -f /tmp/gpucheck.py
+'@
+    $gpuScript = "export NVIDIA_VISIBLE_DEVICES=all; export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/lib/wsl/drivers:`$LD_LIBRARY_PATH; " + ($gpuScript -replace '__PY_B64__', $pyB64) -replace "`r",""
+    $runArgs += @('--user','root', $ImageTag,'bash','-lc',$gpuScript)
+} elseif ($Setup) {
+    # Use the robust setup entrypoint that finds the right script (extras/dev-setup.sh or the image helper)
+    $prefix = 'for f in ./extras/dev-setup.sh ./extras/podman/dev-setup.sh; do if [ -f "$f" ]; then sed -i "s/\r$//" "$f" || true; fi; done; chmod +x ./extras/podman/dev-setup.sh 2>/dev/null || true; apply-vllm-patches || true; '
+    $envPrefix = ''
+    if ($Mirror) { $envPrefix += 'export LOCAL_MIRROR=1; ' }
+    if ($Progress) { $envPrefix += 'export PROGRESS_WATCH=1; ' }
+    $envPrefix += 'export TMPDIR=/opt/work/tmp; export TMP=/opt/work/tmp; export TEMP=/opt/work/tmp; mkdir -p /opt/work/tmp; '
+    $setupCmd = $prefix + $envPrefix + "./extras/podman/dev-setup.sh"
+    if ($Progress) { $runArgs += @('-it', $ImageTag, 'bash','-lc', $setupCmd) } else { $runArgs += @($ImageTag, 'bash','-lc', $setupCmd) }
+    Write-Host "🔧 Running dev setup" -ForegroundColor Green
+} elseif ($Interactive -and -not $Command) {
+    $runArgs += @('-it',$ImageTag,'bash')
+    Write-Host "🚀 Interactive shell" -ForegroundColor Green
+} elseif ($Command) {
+    $runArgs += @($ImageTag,'bash','-lc',"export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/lib/wsl/drivers:`$LD_LIBRARY_PATH; source /home/vllmuser/venv/bin/activate && $Command")
+    Write-Host "🚀 Running command" -ForegroundColor Green
+} else {
+    $runArgs += @($ImageTag)
+}
+
+Write-Host "Command: podman $($runArgs -join ' ')" -ForegroundColor Gray
+& podman @runArgs
+
+if ($LASTEXITCODE -eq 0 -and $Interactive) { Write-Host "Exited cleanly" -ForegroundColor Green }
diff --git a/extras/podman/run.sh b/extras/podman/run.sh
new file mode 100644
index 000000000000..ddafbcc578d0
--- /dev/null
+++ b/extras/podman/run.sh
@@ -0,0 +1,158 @@
+#!/usr/bin/env bash
+# Unified lightweight vLLM dev container launcher (Podman-first, Linux/macOS)
+set -euo pipefail
+
+IMAGE_TAG="vllm-dev:latest"
+CONTAINER_NAME="vllm-dev"
+SOURCE_DIR="$(pwd)"
+
+show_help() {
+    cat <<EOF
+Usage: extras/podman/run.sh [options]
+  --build              Build the dev image
+  --gpu-check          Run a CUDA/Torch sanity check
+  --setup              Run project setup inside the container
+  --mirror             Use local mirror registries if configured
+  --progress           Show progress bars in setup
+  --work-volume NAME   Named volume to mount at /opt/work
+  --command CMD        Run a command inside the dev container
+  --help               Show this help
+EOF
+}
+
+BUILD=0; GPU_CHECK=0; SETUP=0; MIRROR=0; PROGRESS=0
+WORK_VOLUME=""; CMD=""
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --build) BUILD=1; shift ;;
+        --gpu-check) GPU_CHECK=1; shift ;;
+        --setup) SETUP=1; shift ;;
+        --mirror) MIRROR=1; shift ;;
+        --progress) PROGRESS=1; shift ;;
+        --work-volume) WORK_VOLUME="$2"; shift 2 ;;
+        --command) CMD="$2"; shift 2 ;;
+        --help|-h) show_help; exit 0 ;;
+        *) echo "Unknown option: $1" >&2; show_help; exit 1 ;;
+    esac
+done
+
+if ! command -v podman >/dev/null 2>&1; then
+    echo "Error: podman not found in PATH" >&2
+    exit 1
+fi
+
+echo "[vLLM] Engine: podman  Image: $IMAGE_TAG  Container: $CONTAINER_NAME"
+
+if [[ $BUILD -eq 1 ]]; then
+    echo "[vLLM] Building image..."
+    if ! podman build -f extras/Dockerfile -t "$IMAGE_TAG" .; then
+        echo "[vLLM] Build failed" >&2
+        exit 1
+    fi
+    echo "[vLLM] Build complete"
+fi
+
+# If container running, attach / exec
+RUNNING=$(podman ps --filter "name=${CONTAINER_NAME}" --format '{{.Names}}' 2>/dev/null || true)
+
+if [[ "$RUNNING" == "$CONTAINER_NAME" ]]; then
+    if [[ $GPU_CHECK -eq 1 ]]; then
+        echo "[vLLM] GPU check (existing container)"
+        exec podman exec "$CONTAINER_NAME" bash -lc 'source /home/vllmuser/venv/bin/activate 2>/dev/null || true; which nvidia-smi && nvidia-smi || true; python - <<PY
+import torch
+print("PyTorch:", getattr(torch, "__version__", "n/a"))
+print("CUDA:", torch.cuda.is_available())
+PY'
+    fi
+    if [[ $SETUP -eq 1 ]]; then
+        if [[ $MIRROR -eq 1 ]]; then
+            exec podman exec "$CONTAINER_NAME" bash -lc 'export LOCAL_MIRROR=1; chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh'
+        else
+            exec podman exec "$CONTAINER_NAME" bash -lc 'chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh'
+        fi
+    fi
+    if [[ -n "$CMD" ]]; then
+        echo "[vLLM] Exec command in existing container"
+        podman exec "$CONTAINER_NAME" bash -lc "source /home/vllmuser/venv/bin/activate 2>/dev/null || true; $CMD"
+        exit $?
+    fi
+    read -r -p "Attach to running container ${CONTAINER_NAME}? [Y/n] " RESP || true
+    if [[ -z "$RESP" || "$RESP" =~ ^[Yy]$ ]]; then
+        exec podman exec -it "$CONTAINER_NAME" bash
+    else
+        exit 0
+    fi
+fi
+
+# Ensure image exists if not building
+if [[ $BUILD -ne 1 ]]; then
+    if ! podman image exists "$IMAGE_TAG"; then
+        echo "Image $IMAGE_TAG missing. Use --build." >&2; exit 1
+    fi
+fi
+
+# Base run args
+RUN_ARGS=(run --rm --device=nvidia.com/gpu=all --security-opt=label=disable --shm-size 8g --name "$CONTAINER_NAME" -v "${SOURCE_DIR}:/workspace:Z" -w /workspace --user vllmuser --env ENGINE=podman)
+
+# Prefer named volume for /opt/work if provided
+if [[ -n "$WORK_VOLUME" ]]; then
+    RUN_ARGS+=(-v "${WORK_VOLUME}:/opt/work:Z")
+fi
+
+# Allow configurable /tmp tmpfs size via VLLM_TMPFS_TMP_SIZE (default 0=disabled)
+TMPFS_SIZE="${VLLM_TMPFS_TMP_SIZE:-0}"
+if [[ -n "$TMPFS_SIZE" && "$TMPFS_SIZE" != "0" ]]; then
+    RUN_ARGS+=(--tmpfs "/tmp:size=${TMPFS_SIZE}")
+fi
+
+# Ensure sane NVIDIA env defaults inside container to avoid 'void' and missing caps
+RUN_ARGS+=(--env "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all}" \
+    --env "NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-compute,utility}" \
+    --env "NVIDIA_REQUIRE_CUDA=")
+
+if [[ $GPU_CHECK -eq 1 ]]; then
+    GPU_SCRIPT=$'echo "=== GPU Check ==="; which nvidia-smi && nvidia-smi || echo "nvidia-smi unavailable"; echo "--- /dev/nvidia* ---"; ls -l /dev/nvidia* 2>/dev/null || echo "no /dev/nvidia* nodes"; echo "--- Environment (NVIDIA_*) ---"; env | grep -E "^NVIDIA_" || echo "no NVIDIA_* env vars"; if [ "$NVIDIA_VISIBLE_DEVICES" = "void" ]; then echo "WARN: NVIDIA_VISIBLE_DEVICES=void (no GPU mapped)"; fi; echo "--- LD_LIBRARY_PATH ---"; echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"; source /home/vllmuser/venv/bin/activate 2>/dev/null || true; python - <<PY\nimport json, torch, os\nout={\'torch_version\':getattr(torch,\'__version__\',\'n/a\'),\'torch_cuda_version\':getattr(getattr(torch,\'version\',None),\'cuda\',\'n/a\'),\'cuda_available\':torch.cuda.is_available(),\'ld_library_path\':os.environ.get(\'LD_LIBRARY_PATH\')}\ntry:\n\tout[\'device_count\']=torch.cuda.device_count()\nexcept Exception as e:\n\tout[\'device_count_error\']=str(e)\nif out[\'cuda_available\'] and out.get(\'device_count\',0)>0:\n\ttry:\n\t\tcap=torch.cuda.get_device_capability(0)\n\t\tout[\'device_0\']={\'name\':torch.cuda.get_device_name(0),\'capability\':f"sm_{cap[0]}{cap[1]}"}\n\texcept Exception as e:\n\t\tout[\'device_0_error\']=str(e)\nelse:\n\tout[\'diagnostics\']=[\'Missing /dev/nvidia* or podman machine without GPU passthrough\']\nprint(json.dumps(out,indent=2))\nPY'
+    RUN_ARGS+=("$IMAGE_TAG" bash -lc "$GPU_SCRIPT")
+elif [[ $SETUP -eq 1 ]]; then
+    if [[ $MIRROR -eq 1 ]]; then
+        RUN_ARGS+=(--env LOCAL_MIRROR=1)
+    fi
+    if [[ $PROGRESS -eq 1 ]]; then
+        RUN_ARGS+=(--env PROGRESS_WATCH=1)
+        RUN_ARGS+=("-it" "$IMAGE_TAG" bash -lc 'chmod +x ./extras/podman/dev-setup.sh 2>/dev/null || true; ./extras/podman/dev-setup.sh')
+    else
+        RUN_ARGS+=("$IMAGE_TAG" bash -lc 'chmod +x ./extras/podman/dev-setup.sh 2>/dev/null || true; ./extras/podman/dev-setup.sh')
+    fi
+elif [[ -n "$CMD" ]]; then
+    RUN_ARGS+=("$IMAGE_TAG" bash -lc "source /home/vllmuser/venv/bin/activate 2>/dev/null || true; $CMD")
+else
+    RUN_ARGS+=("-it" "$IMAGE_TAG" bash)
+    echo "[vLLM] Interactive shell. Helpful inside container:"
+    echo "  ./extras/dev-setup.sh   # Build/install editable vLLM"
+    echo "  python -c 'import torch;print(torch.cuda.is_available())'"
+    echo "  python -c 'import vllm'"
+fi
+
+echo "[vLLM] Command: podman ${RUN_ARGS[*]}"
+exec podman "${RUN_ARGS[@]}"
diff --git a/extras/podman/scripts/gpu_status.sh b/extras/podman/scripts/gpu_status.sh
new file mode 100644
index 000000000000..a50c78b01c03
--- /dev/null
+++ b/extras/podman/scripts/gpu_status.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Helper to show GPU/CDI status under Podman (Linux/WSL)
+
+podman info --format json | jq '.host' || podman info || true
+
+# Show CDI devices if available
+podman cdi list || true
diff --git a/extras/secrets/.gitignore b/extras/secrets/.gitignore
new file mode 100644
index 000000000000..d4895ec18947
--- /dev/null
+++ b/extras/secrets/.gitignore
@@ -0,0 +1,4 @@
+# Ensure this directory stays out of git; keep this file only.
+*
+!.gitignore
+!README.md
diff --git a/extras/secrets/README.md b/extras/secrets/README.md
new file mode 100644
index 000000000000..b519087af198
--- /dev/null
+++ b/extras/secrets/README.md
@@ -0,0 +1,12 @@
+# secrets directory
+
+This directory is gitignored and intended for local-only secret material such as model hub tokens.
+
+Files are expected to be simple KEY=VALUE lines that can be sourced by shell scripts.
+
+Examples:
+
+- hf-credentials.env
+- cn-modelhub-credentials.env
+
+Do NOT commit secrets.
diff --git a/extras/storage/README.md b/extras/storage/README.md
new file mode 100644
index 000000000000..d106b6d7378c
--- /dev/null
+++ b/extras/storage/README.md
@@ -0,0 +1,7 @@
+# Storage helpers
+
+Declare and manage external volumes for models and caches.
+
+- storage-config.yaml: Declarative host/container paths
+- setup_local.sh: Helper to prepare a local volume or directory
+- scripts/: Utilities for warmup, cache management, mounts
diff --git a/extras/storage/scripts/warm_cache.sh b/extras/storage/scripts/warm_cache.sh
new file mode 100644
index 000000000000..1d97b7f044f6
--- /dev/null
+++ b/extras/storage/scripts/warm_cache.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Placeholder for cache warmup logic.
+# Example usage: ./warm_cache.sh meta-llama/Llama-3-8B /models
+MODEL_ID=${1:-meta-llama/Llama-3-8B}
+TARGET=${2:-/models}
+mkdir -p "$TARGET"
+echo "(scaffold) Would warm cache for $MODEL_ID under $TARGET"
diff --git a/extras/storage/setup_local.sh b/extras/storage/setup_local.sh
new file mode 100644
index 000000000000..101826bc7396
--- /dev/null
+++ b/extras/storage/setup_local.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Prepare a local directory for models and ensure reasonable permissions.
+TARGET=${1:-/mnt/ml-models}
+mkdir -p "$TARGET"
+chmod 775 "$TARGET" || true
+
+echo "Model storage prepared at: $TARGET"
diff --git a/extras/storage/storage-config.yaml b/extras/storage/storage-config.yaml
new file mode 100644
index 000000000000..90310b572b3c
--- /dev/null
+++ b/extras/storage/storage-config.yaml
@@ -0,0 +1,4 @@
+model_volume:
+  path_host: "/mnt/ml-models"
+  path_container: "/models"
+  shared: true
diff --git a/extras/testing/README.md b/extras/testing/README.md
new file mode 100644
index 000000000000..2c64d538ac97
--- /dev/null
+++ b/extras/testing/README.md
@@ -0,0 +1,7 @@
+# Testing and benchmarking harness
+
+- Define a matrix of models/environments in `test_matrix.yaml`.
+- Run via `python extras/testing/run_tests.py --output-dir extras/testing/results/$(date +%F_%H-%M)`.
+- Store results in `results/` with timestamps for regression tracking.
+
+This scaffolding is intentionally minimal; models and benchmarks can be added incrementally.
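+
+A hypothetical first session (result paths are illustrative):
+
+```bash
+python extras/testing/run_tests.py --models Example-Llama3-8B \
+    --output-dir extras/testing/results/baseline
+python extras/testing/compare_results.py \
+    extras/testing/results/baseline/scaffold.json \
+    extras/testing/results/later/scaffold.json
+```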
diff --git a/extras/testing/compare_results.py b/extras/testing/compare_results.py
new file mode 100644
index 000000000000..f6c91bdd6667
--- /dev/null
+++ b/extras/testing/compare_results.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import argparse
+import json
+
+
+def load(path: str) -> dict:
+    with open(path, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("a")
+    p.add_argument("b")
+    args = p.parse_args()
+
+    A = load(args.a)
+    B = load(args.b)
+
+    # Placeholder comparison: print keys that differ
+    diffs = sorted(set(A.keys()) ^ set(B.keys()))
+    print(json.dumps({"diff_keys": diffs}))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/extras/testing/run_tests.py b/extras/testing/run_tests.py
new file mode 100644
index 000000000000..131521c0dbac
--- /dev/null
+++ b/extras/testing/run_tests.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Minimal, non-destructive test harness that prints a JSON line per test.
+This is a scaffold; integrate with your local launchers or CI as needed.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from datetime import datetime
+
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("--cuda-version",
+                   default=os.getenv("CUDA_VERSION", "12.9.1"))
+    p.add_argument("--ubi-version", default=os.getenv("UBI_VERSION", "9.4"))
+    p.add_argument("--models", default="Example-Llama3-8B")
+    p.add_argument("--output-dir",
+                   default=os.path.join("extras", "testing", "results",
+                                        datetime.now().strftime("%F_%H-%M")))
+    args = p.parse_args()
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    result = {
+        "ts": datetime.utcnow().isoformat() + "Z",
+        "cuda": args.cuda_version,
+        "ubi": args.ubi_version,
+        "models": args.models.split(","),
+        "status": "scaffold",
+        "notes": "Integrate with vLLM server/client to collect real metrics.",
+    }
+
+    out_path = os.path.join(args.output_dir, "scaffold.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, indent=2)
+
+    print(json.dumps({"written": out_path}))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/extras/testing/test_matrix.yaml b/extras/testing/test_matrix.yaml
new file mode 100644
index 000000000000..270e7ff5ec13
--- /dev/null
+++ b/extras/testing/test_matrix.yaml
@@ -0,0 +1,16 @@
+models:
+  - name: Example-Llama3-8B
+    id: meta-llama/Llama-3-8B
+    chat_template: chat_templates/llama-3-instruct.jinja
+    params:
+      max_tokens: 64
+      temperature: 0.7
+
+environments:
+  - cuda: 12.9.1
+    ubi: 9.4
+
+benchmarks:
+  - name: inference_speed
+    input: "Summarize: vLLM extras modularization plan."
+    metrics: [latency_ms, tokens_per_sec]
diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 7963fb15c419..69f38fd0a178 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -143,7 +143,9 @@ def get_instance() -> "CuMemAllocator":
         return CuMemAllocator.instance
 
     def __init__(self):
-        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+        # Prefer new env var; fall back to deprecated one for compatibility
+        conf = os.environ.get("PYTORCH_ALLOC_CONF",
+                              os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""))
         assert "expandable_segments:True" not in conf, \
             ("Expandable segments are not compatible with memory pool. "
              "Please track https://github.com/pytorch/pytorch/issues/147851 "
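
A standalone sketch (not part of the patch) of the lookup order the cumem change implements; the new variable wins whenever both are set, and the sample values are just real PyTorch allocator options used for illustration:

```bash
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128"  # deprecated name
export PYTORCH_ALLOC_CONF="backend:native"              # preferred name
python - <<'PY'
import os
# Same lookup order as the patched CuMemAllocator.__init__
conf = os.environ.get("PYTORCH_ALLOC_CONF",
                      os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""))
print(conf)  # -> backend:native
PY
```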