diff --git a/.github/ci-trigger-20250814-1 b/.github/ci-trigger-20250814-1 new file mode 100644 index 000000000000..8ca993aa58b2 --- /dev/null +++ b/.github/ci-trigger-20250814-1 @@ -0,0 +1 @@ +trigger: sync_with_upstream diff --git a/.github/workflows/sync_with_upstream.yml b/.github/workflows/sync_with_upstream.yml new file mode 100644 index 000000000000..df1048a43833 --- /dev/null +++ b/.github/workflows/sync_with_upstream.yml @@ -0,0 +1,80 @@ +name: Sync with Upstream + +on: + schedule: + - cron: '0 0 * * *' # Runs daily at midnight + push: + branches: + - main + +jobs: + sync: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Git + run: | + git config --global user.name 'Zhuul' + git config --global user.email '40538530+Zhuul@users.noreply.github.com' + + - name: Add upstream remote + run: git remote add upstream https://github.com/vllm-project/vllm.git + + - name: Fetch upstream changes + run: git fetch upstream + + - name: Merge upstream changes + id: merge + run: | + git checkout main + git merge upstream/main || { + echo "Merge conflict detected. Creating a new branch for manual resolution." 
+ git checkout -b "merge-conflict-$(date +%Y%m%d%H%M%S)" + git push origin HEAD + echo "conflict=true" >> "$GITHUB_OUTPUT" + exit 1 + } + echo "conflict=false" >> "$GITHUB_OUTPUT" + + - name: Check for workflow file changes + id: workflow_change + run: | + if git diff --name-only ORIG_HEAD HEAD 2>/dev/null | grep '^\.github/workflows/'; then + echo "workflow_changed=true" >> "$GITHUB_OUTPUT" + else + echo "workflow_changed=false" >> "$GITHUB_OUTPUT" + fi + + - name: Set up PAT authentication + env: + GH_PAT: ${{ secrets.GH_PAT }} + run: | + git remote set-url origin "https://Zhuul:${GH_PAT}@github.com/Zhuul/vllm.git" + + - name: Push changes if no workflow files changed + if: steps.workflow_change.outputs.workflow_changed == 'false' && steps.merge.outputs.conflict == 'false' + run: git push origin main + + - name: Create Pull Request for workflow file changes + if: steps.workflow_change.outputs.workflow_changed == 'true' && steps.merge.outputs.conflict == 'false' + uses: peter-evans/create-pull-request@v6 + with: + token: ${{ secrets.GH_PAT }} + commit-message: "Sync with upstream: update workflow files" + title: "Sync with upstream: update workflow files" + body: | + This PR was automatically created because workflow files were updated while syncing with upstream. + Please review and merge. + branch: workflow-sync-${{ github.run_id }} + base: main + + - name: Send notification if merge conflict + if: failure() && steps.merge.outputs.conflict == 'true' + run: | + echo "Merge conflict detected. Manual intervention required." + # Add your notification logic here (e.g., send an email, create an issue, etc.) diff --git a/docs/contributing/README.md b/docs/contributing/README.md index 5a2a70d57e85..1c2a31cf895c 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -31,6 +31,8 @@ See . Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. 
Check out the [building from source][build-from-source] documentation for details. +For a containerized developer workflow, see Podman-first dev: `docs/contributing/podman-dev.md`. + For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations. ### Building the docs with MkDocs diff --git a/docs/contributing/podman-dev.md b/docs/contributing/podman-dev.md new file mode 100644 index 000000000000..881e495f8421 --- /dev/null +++ b/docs/contributing/podman-dev.md @@ -0,0 +1,41 @@ +--- +title: Podman-first Development Environment +--- + +This guide documents the Podman-first development workflow for building vLLM from source with CUDA and PyTorch nightly. + +Primary entrypoint + +- Windows (PowerShell): `./extras/podman/run.ps1` +- Linux/macOS (bash): `extras/podman/run.sh` + +Legacy launchers at `extras/run-vllm-dev.ps1` and `extras/run-vllm-dev.sh` are deprecated and forward to the Podman wrappers. + +Prerequisites + +- Podman with GPU CDI enabled (on Windows, use Podman Desktop + WSL; ensure NVIDIA drivers and CUDA are installed on the host). +- Optional named volume for build/work space, e.g., `vllm-work`. + +Quick start + +Windows (PowerShell): + +```powershell +./extras/podman/run.ps1 -Build +./extras/podman/run.ps1 -GPUCheck +./extras/podman/run.ps1 -Setup -WorkVolume vllm-work -Progress +``` + +Linux/macOS (bash): + +```bash +extras/podman/run.sh --build +extras/podman/run.sh --gpu-check +extras/podman/run.sh --setup --work-volume vllm-work --progress +``` + +Notes + +- The image uses CUDA 12.9 UBI9 and installs PyTorch nightly cu129 first to ensure latest GPU arch support (including sm_120 when present). +- The setup step performs an editable vLLM install without downgrading torch family packages. +- Use a named Podman volume for `/opt/work` to avoid `/tmp` tmpfs pressure and to speed up rebuilds. 
diff --git a/extras/.dockerignore b/extras/.dockerignore new file mode 100644 index 000000000000..60a8d81a82c1 --- /dev/null +++ b/extras/.dockerignore @@ -0,0 +1,39 @@ +# Reduce build context to avoid Windows Podman tar write issues +.git +.github +.vscode +.venv +venv +node_modules +build +dist +csrc/ +vllm/ +benchmarks/ +docs/ +examples/ +tests/ +**/__pycache__ +**/*.pyc +**/*.pyo +**/*.pyd +**/*.so +**/*.o +**/*.a +**/*.dll +**/*.dylib +extras/build.log +extras/*.bak +extras/tools/ +extras/run-vllm-dev-*.ps1 +extras/run-vllm-dev-*.sh +extras/*wsl* +extras/*docker*.ps1 + +!extras/Dockerfile +!extras/run-vllm-dev.ps1 +!extras/run-vllm-dev.sh +!extras/dev-setup.sh +requirements/ +pyproject.toml +setup.py diff --git a/extras/Dockerfile b/extras/Dockerfile new file mode 100644 index 000000000000..6a5f5a6e4e9d --- /dev/null +++ b/extras/Dockerfile @@ -0,0 +1,168 @@ +# vLLM Development Container with GPU Support +# Uses vLLM's own requirements for automatic dependency management + +FROM nvidia/cuda:12.9.1-cudnn-devel-ubi9 + +# Set CUDA environment variables for build tools +ENV CUDA_HOME=/usr/local/cuda +ENV CUDA_ROOT=/usr/local/cuda +ENV PATH=$CUDA_HOME/bin:$PATH +ENV LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH +ENV CUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME +ENV CUDNN_LIBRARY_PATH=/usr/lib64 +ENV CUDNN_INCLUDE_PATH=/usr/include + +# Install system packages with additional CUDA development libraries +RUN dnf update -y && dnf install --allowerasing -y \ + python3 python3-pip python3-devel \ + git gcc gcc-c++ cmake ninja-build \ + make patch which findutils tar rsync \ + wget curl vim nano \ + && dnf clean all + +# Create symlinks for python +RUN ln -sf /usr/bin/python3 /usr/bin/python + +# Create a non-root user for development +RUN useradd -m -s /bin/bash vllmuser && \ + echo "vllmuser ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + +# Install essential system tools +RUN dnf install -y hostname iproute iputils + +# Add NVIDIA Machine Learning repo for RHEL9/UBI9 and install 
NCCL runtime/devel +# Needed for PyTorch nightly cu129 to avoid ncclCommWindowRegister symbol errors +# Install NCCL runtime/devel from the CUDA repository available in the base image +RUN set -euxo pipefail \ + && dnf makecache -y \ + && (dnf install -y libnccl libnccl-devel || dnf install -y libnccl-2 libnccl-devel-2) \ + && dnf clean all + +# Set working directory and adjust ownership +WORKDIR /workspace +RUN chown -R vllmuser:vllmuser /workspace + +# Create build directories with proper permissions +RUN mkdir -p /workspace/.deps && chown -R vllmuser:vllmuser /workspace/.deps && \ + mkdir -p /tmp/vllm-build && chmod 777 /tmp/vllm-build && \ + mkdir -p /opt/work && chmod 777 /opt/work && \ + mkdir -p /home/vllmuser/.cache && chown -R vllmuser:vllmuser /home/vllmuser/.cache && \ + mkdir -p /home/vllmuser/.ccache && chown -R vllmuser:vllmuser /home/vllmuser/.ccache && \ + mkdir -p /home/vllmuser/.cmake && chown -R vllmuser:vllmuser /home/vllmuser/.cmake && \ + chmod -R 755 /workspace && \ + chmod -R 777 /tmp + +# Switch to the non-root user +USER vllmuser + +# Create and activate virtual environment +ENV VIRTUAL_ENV=/home/vllmuser/venv +RUN python3 -m venv $VIRTUAL_ENV +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# Set pip configuration +ENV PIP_DISABLE_PIP_VERSION_CHECK=1 +ENV PIP_NO_CACHE_DIR=1 +ENV PYTHONUNBUFFERED=1 +ENV PIP_DEFAULT_TIMEOUT=120 +ENV PIP_RETRIES=5 +ENV PIP_PREFER_BINARY=1 + +# Upgrade pip and setuptools to latest versions +RUN pip install --upgrade pip "setuptools>=61" wheel + +COPY requirements/ /tmp/requirements/ + +# Install PyTorch nightly first (includes latest GPU arch support such as Blackwell sm_120 when present) +RUN pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129 + +# Install modern build tools and vLLM's build dependencies and CUDA deps early, +# but sanitize requirements to avoid downgrading torch-family or forcing xformers pins. 
+COPY pyproject.toml /tmp/pyproject.toml +RUN set -euxo pipefail \ + && cd /tmp \ + && pip install "setuptools>=61" "setuptools-scm>=8" build wheel ninja cmake \ + && mkdir -p /tmp/requirements_sanitized \ + && for f in build.txt cuda.txt common.txt; do \ + if [ -f "/tmp/requirements/$f" ]; then \ + sed -E '/^(torch|torchvision|torchaudio|xformers)\b/Id' "/tmp/requirements/$f" > "/tmp/requirements_sanitized/$f"; \ + fi; \ + done \ + && pip install --pre \ + -r /tmp/requirements_sanitized/build.txt \ + -r /tmp/requirements_sanitized/cuda.txt \ + -r /tmp/requirements_sanitized/common.txt \ + && pip install --pre --upgrade \ + torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129 + +# Install minimal development extras +RUN pip install pytest pytest-asyncio ipython + +# Note: vLLM will be installed from source in development mode via dev-setup.sh +# This ensures compatibility with the PyTorch nightly build + +# Create activation script for easy virtual environment access +RUN echo '#!/bin/bash' > /home/vllmuser/activate_venv.sh && \ + echo 'source /home/vllmuser/venv/bin/activate' >> /home/vllmuser/activate_venv.sh && \ + echo 'echo "Virtual environment activated: $VIRTUAL_ENV"' >> /home/vllmuser/activate_venv.sh && \ + echo 'echo "Python version: $(python --version)"' >> /home/vllmuser/activate_venv.sh && \ + echo 'echo "PyTorch version: $(python -c \"import torch; print(torch.__version__)\")"' >> /home/vllmuser/activate_venv.sh && \ + echo 'echo "CUDA available: $(python -c \"import torch; print(torch.cuda.is_available())\")"' >> /home/vllmuser/activate_venv.sh && \ + chmod +x /home/vllmuser/activate_venv.sh + +# Ensure virtual environment is activated in .bashrc +RUN echo 'source /home/vllmuser/venv/bin/activate' >> /home/vllmuser/.bashrc && \ + echo 'echo "๐Ÿ Python virtual environment activated"' >> /home/vllmuser/.bashrc && \ + echo 'echo "๐Ÿš€ Ready for vLLM development!"' >> /home/vllmuser/.bashrc + +# Create development 
helper script that uses current workspace requirements +RUN echo '#!/bin/bash' > /home/vllmuser/setup_vllm_dev.sh && \ + echo 'echo "๐Ÿ”ง Setting up vLLM for development..."' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'cd /workspace' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo '# Use temporary build directory to avoid permission issues' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'export TMPDIR=/tmp/vllm-build' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'mkdir -p "$TMPDIR" && chmod 777 "$TMPDIR"' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'export CMAKE_BUILD_PARALLEL_LEVEL=4' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'export VLLM_INSTALL_PUNICA_KERNELS=0' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'export MAX_JOBS=4' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo '# Install current workspace requirements first' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'if [ -f requirements/common.txt ]; then pip install -r requirements/common.txt; fi' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo '# Use temporary directory for CMake build files' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'FETCHCONTENT_BASE_DIR="$TMPDIR/deps" pip install -e . 
--no-deps --no-build-isolation --verbose' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'echo "โœ… vLLM installed in editable mode!"' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'python -c "import vllm; print(\"vLLM version:\", vllm.__version__)"' >> /home/vllmuser/setup_vllm_dev.sh && \ + chmod +x /home/vllmuser/setup_vllm_dev.sh + +# Add environment variables for better CUDA memory management and build optimization +ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +# Do not pin a single GPU here; let runtime inject device selection +# ENV CUDA_VISIBLE_DEVICES=0 +ENV CMAKE_BUILD_PARALLEL_LEVEL=4 +ENV VLLM_INSTALL_PUNICA_KERNELS=0 +ENV MAX_JOBS=4 + +# Enable ccache for faster rebuilds +ENV CCACHE_DIR=/home/vllmuser/.ccache +ENV CCACHE_MAXSIZE=10G +ENV PATH=/usr/lib64/ccache:$PATH + +# CUDA arch list including legacy + latest (sm_120) so builds cover both older and newest GPUs. +ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0 12.0" +# Do not force-disable Machete; allow upstream defaults. User may still pass -e CMAKE_ARGS for custom CMake settings. 
+ENV CMAKE_ARGS="" + +# WSL2-specific CUDA environment configuration +ENV NVIDIA_VISIBLE_DEVICES=all +ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility +ENV LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/lib/wsl/drivers:/usr/lib/wsl/lib:/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH + +# Add runtime library detection script +RUN echo '#!/bin/bash' > /home/vllmuser/check_cuda_libs.sh && \ + echo 'echo "=== CUDA Library Check ==="' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'echo "Searching for CUDA libraries..."' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'find /usr/lib/wsl -name "libcuda.so*" 2>/dev/null | head -3 || echo "No WSL CUDA libs"' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'ldconfig -p | grep cuda | head -3 || echo "No CUDA in ldconfig"' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'echo "PyTorch CUDA status:"' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'python -c "import torch; print(f\"CUDA available: {torch.cuda.is_available()}\"); print(f\"Device count: {torch.cuda.device_count()}\")" 2>/dev/null || echo "PyTorch not available"' >> /home/vllmuser/check_cuda_libs.sh && \ + chmod +x /home/vllmuser/check_cuda_libs.sh diff --git a/extras/README.md b/extras/README.md new file mode 100644 index 000000000000..bafd5a6dc4ca --- /dev/null +++ b/extras/README.md @@ -0,0 +1,50 @@ +# extras/ overview + +This directory hosts all non-core assets: container/build tooling, configs, testing, storage helpers, and optional patches. The goals are clarity, single-responsibility, and easy extension without touching the vLLM core. 
+ +Suggested layout (implemented here): + +- podman/ — Podman-specific build/launch wrappers and helpers +- configs/ — Centralized, declarative versions and build configuration +- secrets/ — Gitignored area for local tokens/config (not committed) +- testing/ — Test/benchmark harness, matrices, and results +- storage/ — External volumes and cache management helpers +- patches/ — Optional patch/plug-in mechanism for controlled tweaks + +Primary entrypoint: use `extras/podman/` as the canonical way to build and run the dev container. + +Deprecation: the legacy launchers `extras/run-vllm-dev.sh` and `extras/run-vllm-dev.ps1` are deprecated and now forward to the Podman wrappers. Please switch to `extras/podman/run.sh` (Linux/macOS) or `extras/podman/run.ps1` (Windows). + +## Quick start + +- Edit `extras/configs/build.env` to set CUDA/UBI/Python defaults. +- Use `extras/podman/build.sh` to build images with those defaults. +- Use `extras/podman/run.ps1` (Windows) or `extras/podman/run.sh` (Linux/macOS) to run the dev container. + +Examples + +- Windows (PowerShell): + - Build image: `./extras/podman/run.ps1 -Build` + - GPU check: `./extras/podman/run.ps1 -GPUCheck` + - Setup build: `./extras/podman/run.ps1 -Setup -WorkVolume vllm-work -Progress` + +- Linux/macOS (bash): + - Build image: `extras/podman/run.sh --build` + - GPU check: `extras/podman/run.sh --gpu-check` + - Setup build: `extras/podman/run.sh --setup --work-volume vllm-work --progress` + +## Secrets + +Place tokens in `extras/secrets/` per its README and never commit them. Load them into your shell session or bind-mount them into containers. + +## Testing + +See `extras/testing/README.md` for defining a matrix, recording results, and comparing runs. + +## Storage + +See `extras/storage/README.md` for model/cache volume guidance for performance and reproducibility. + +## Patches + +If you need to tweak upstream vLLM without forking, use `extras/patches/` to stage diffs and apply them during build. 
diff --git a/extras/configs/README.md b/extras/configs/README.md new file mode 100644 index 000000000000..98ef0f02f786 --- /dev/null +++ b/extras/configs/README.md @@ -0,0 +1,9 @@ +# configs README + +This folder centralizes editable configuration for images/builds: + +- build.env: Bash-exported defaults (CUDA/UBI/Python/vLLM tag, arch list, volumes) +- build.yaml (optional): YAML equivalent for tools that prefer structured configs +- versions.json (optional): Machine-friendly manifest for automation + +Consumers (scripts/Containerfiles) should read values from here and allow runtime overrides via environment variables. diff --git a/extras/configs/build.env b/extras/configs/build.env new file mode 100644 index 000000000000..37babe3a18d0 --- /dev/null +++ b/extras/configs/build.env @@ -0,0 +1,24 @@ +# Build configuration +# +# Scripts should source this file to obtain default versions. +# Values can be overridden by environment variables provided at runtime. + +# CUDA / UBI / Python baselines +export CUDA_VERSION=${CUDA_VERSION:-12.9.1} +export UBI_VERSION=${UBI_VERSION:-9} +export PYTHON_VERSION=${PYTHON_VERSION:-3.11} + +# vLLM branch/tag to use inside the container when cloning or referring +export VLLM_TAG=${VLLM_TAG:-main} + +# Architectures (space separated) for PyTorch/NVCC +# Include Blackwell sm_120 via TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0 12.0" +export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST:-"7.0 7.5 8.0 8.6 8.9 9.0 12.0"} + +# Named volume for build scratch/work dir (Podman recommended) +export VLLM_WORK_VOLUME=${VLLM_WORK_VOLUME:-vllm-work} +export VLLM_WORK_DIR_CONTAINER=${VLLM_WORK_DIR_CONTAINER:-/opt/work} + +# Image naming +export VLLM_BASE_IMAGE=${VLLM_BASE_IMAGE:-"nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubi9"} +export VLLM_IMAGE_TAG=${VLLM_IMAGE_TAG:-"vllm-cuda${CUDA_VERSION}-ubi${UBI_VERSION}"} diff --git a/extras/configs/build.yaml b/extras/configs/build.yaml new file mode 100644 index 000000000000..277737dd92df --- /dev/null +++ 
b/extras/configs/build.yaml @@ -0,0 +1,11 @@ +cuda: + version: "12.9.1" + tag: "latest" +ubi: + version: "9.4" + tag: "latest" +python: + version: "3.11" + tag: "latest" +vllm: + tag: main diff --git a/extras/old/build-from-source.sh b/extras/old/build-from-source.sh new file mode 100644 index 000000000000..58db6e19e37e --- /dev/null +++ b/extras/old/build-from-source.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Activate venv if present +if [ -f /home/vllmuser/venv/bin/activate ]; then + source /home/vllmuser/venv/bin/activate || true +fi + +# Temporary build dirs to avoid permission issues +export TMPDIR=${TMPDIR:-/tmp/vllm-build} +umask 0002 +mkdir -p "$TMPDIR" || true +chmod 777 "$TMPDIR" || true +export FETCHCONTENT_BASE_DIR="${FETCHCONTENT_BASE_DIR:-$TMPDIR/deps}" + +# Parallelism and CUDA arch list (include Blackwell sm_120 == 12.0) +export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:-4} +export MAX_JOBS=${MAX_JOBS:-4} +export NVCC_THREADS=${NVCC_THREADS:-2} +export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-7.0 7.5 8.0 8.6 8.9 9.0 12.0}" + +# Keep FA2/FA3 and machete enabled by default +export VLLM_DISABLE_FA3=${VLLM_DISABLE_FA3:-0} # 0=build FA3 +export FA3_MEMORY_SAFE_MODE=${FA3_MEMORY_SAFE_MODE:-0} + +echo "=== Build env ===" +echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST" +echo "FETCHCONTENT_BASE_DIR=$FETCHCONTENT_BASE_DIR" +echo "CMAKE_BUILD_PARALLEL_LEVEL=$CMAKE_BUILD_PARALLEL_LEVEL MAX_JOBS=$MAX_JOBS NVCC_THREADS=$NVCC_THREADS" + +python - << 'PY' +import os, torch +print('torch', torch.__version__) +print('cuda_version', torch.version.cuda) +print('cuda_available', torch.cuda.is_available()) +print('arch_list', os.getenv('TORCH_CUDA_ARCH_LIST')) +PY + +# Ensure core build tools present (setup will also ensure, this is harmless) +python -m pip install -r requirements/build.txt -q + +# Run editable build with verbose logs and capture output +mkdir -p extras +set +e +python -m pip install -e . 
--no-build-isolation -vv |& tee extras/build.log +status=${PIPESTATUS[0]} +set -e +echo "=== pip exited with code: $status ===" +exit $status diff --git a/extras/old/dev-setup.sh b/extras/old/dev-setup.sh new file mode 100644 index 000000000000..9e3edb1da6f4 --- /dev/null +++ b/extras/old/dev-setup.sh @@ -0,0 +1,319 @@ +#!/bin/bash +# dev-setup.sh - Set up vLLM development environment using nightly wheels +set -euo pipefail + +echo "=== vLLM Development Environment Setup ===" +echo "Container: $(hostname)" +echo "User: $(whoami)" +echo "Working directory: $(pwd)" +echo "" + +# Activate virtual environment +echo "๐Ÿ Activating Python virtual environment..." +source /home/vllmuser/venv/bin/activate +echo "Virtual environment: $VIRTUAL_ENV" +echo "Python version: $(python --version)" +echo "" + +# Check current PyTorch +echo "๐Ÿ“ฆ Current PyTorch:" +python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA available: {torch.cuda.is_available()}')" 2>/dev/null || echo "PyTorch not installed" +echo "" + +### Optional: build from a local mirror to avoid slow Windows/virtiofs mounts during heavy C++ builds +if [ "${LOCAL_MIRROR:-0}" = "1" ]; then + echo "๐Ÿ“ LOCAL_MIRROR=1 -> Copying sources for faster builds..." + DEST="/opt/work" + if ! mkdir -p "$DEST" 2>/dev/null; then + echo "โš ๏ธ No permission to create $DEST, falling back to /tmp/work" + DEST="/tmp/work" + mkdir -p "$DEST" + fi + echo " โžœ Mirror destination: $DEST" + # Ensure destination doesn't have a stray .git folder that could cause permission errors + rm -rf "$DEST/.git" 2>/dev/null || true + # Use tar pipeline but avoid preserving ownership/permissions/timestamps to prevent utime errors on Windows mounts + # Exclude .git to avoid permission issues and speed up copy + if ! tar -C /workspace --exclude='.git' -cf - . | tar -C "$DEST" -xf - --no-same-owner --no-same-permissions 2>/dev/null; then + echo " โš ๏ธ tar copy failed (likely timestamp/perm issue). Falling back to rsync/cp ..." 
+ shopt -s dotglob + if command -v rsync >/dev/null 2>&1; then + rsync -a --delete --exclude='.git' /workspace/ "$DEST"/ 2>/dev/null || true + else + for f in /workspace/*; do + bname="$(basename "$f")" + [ "$bname" = ".git" ] && continue + cp -R "$f" "$DEST"/ 2>/dev/null || true + done + fi + shopt -u dotglob + fi + export VLLM_SRC_DIR="$DEST" +else + export VLLM_SRC_DIR=/workspace +fi +echo "Source dir for build: ${VLLM_SRC_DIR}" + +# Ensure a large, persistent temporary directory for heavy builds (pip/CMake use $TMPDIR) +# Default to /opt/work/tmp unless user overrides via VLLM_TMPDIR/TMPDIR +if [ -n "${VLLM_TMPDIR:-}" ]; then + export TMPDIR="$VLLM_TMPDIR" +fi +if [ -z "${TMPDIR:-}" ] || [[ "$TMPDIR" == "/tmp"* ]]; then + export TMPDIR="/opt/work/tmp" +fi +export TMP="$TMPDIR"; export TEMP="$TMPDIR" +mkdir -p "$TMPDIR" 2>/dev/null || true +echo "Using TMPDIR=$TMPDIR for build temps" + +# Install PyTorch with CUDA 12.9 for RTX 5090 support +echo "๐Ÿš€ Installing PyTorch nightly (CUDA 12.9 toolchain) ..." +pip uninstall -y torch torchvision torchaudio 2>/dev/null || true +pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129 + +# Create a constraints file to prevent downgrades of any currently installed package. +# Use format "name>=version" to allow upgrades but disallow downgrades. Avoid third-party deps. 
+CONSTRAINTS_FILE="/tmp/pip-constraints-installed.txt" +python - <<'PY' +try: + from importlib.metadata import distributions +except Exception: # py39 backport + from importlib_metadata import distributions # type: ignore + +exclude = {pkg.lower() for pkg in ( + 'pip', 'setuptools', 'wheel' +)} +lines = [] +for d in distributions(): + name = (d.metadata.get('Name') or '').strip() + if not name or name.lower() in exclude: + continue + ver = (d.version or '').strip() + if not ver: + continue + # Remove local version suffix (after '+') to keep constraint parser happy + pv = ver.split('+', 1)[0] + norm = name.lower().replace('_', '-') + lines.append(f"{norm}>={pv}") +with open('/tmp/pip-constraints-installed.txt','w') as f: + f.write('\n'.join(sorted(set(lines)))) +print('๐Ÿ“Œ Constraints written to /tmp/pip-constraints-installed.txt (count):', len(lines)) +PY +export PIP_CONSTRAINT="$CONSTRAINTS_FILE" +echo "Using PIP_CONSTRAINT=$PIP_CONSTRAINT" + +# Set CUDA architecture list; include latest (sm_120) so builds are forward-compatible if such GPU is present. +echo "๐Ÿ”ง Configuring CUDA architectures (legacy + latest)..." +export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0 12.0" +echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST" + +# Verify PyTorch version and CUDA capabilities +echo "๐Ÿ” Verifying PyTorch installation..." 
+python -c " +import torch +print(f'PyTorch version: {torch.__version__}') +print(f'CUDA version: {torch.version.cuda}') +print(f'CUDA available: {torch.cuda.is_available()}') +if torch.cuda.is_available(): + try: + device_props = torch.cuda.get_device_properties(0) + print(f'GPU: {torch.cuda.get_device_name(0)}') + print(f'Compute Capability: {device_props.major}.{device_props.minor}') + print(f'Memory: {device_props.total_memory // 1024**3} GB') + if device_props.major >= 12: # RTX 50xx (Blackwell) reports compute capability 12.x; 9.x is Hopper + print('๐ŸŽ‰ RTX 50xx series detected - sm_120 support available!') + else: + print(f'Detected GPU architecture: sm_{device_props.major}{device_props.minor}') + except Exception as e: + print(f'GPU details unavailable: {e}') + print('Note: This is common in containers - GPU access might need container restart') +" +echo "" + +echo "๐Ÿ“ฆ Preparing to install vLLM from source (editable)..." +pip uninstall vllm -y 2>/dev/null || true + +# Preinstall pinned deps to avoid long resolver work (esp. numba/llvmlite) +echo "๐Ÿ“‹ Installing pinned requirements (build + cuda + common), sanitized to keep torch nightly..." +mkdir -p /tmp/requirements_sanitized +for f in build.txt cuda.txt common.txt; do + if [ -f "requirements/$f" ]; then + sed -E '/^(torch|torchvision|torchaudio|xformers)\b/Id' "requirements/$f" > "/tmp/requirements_sanitized/$f" + fi +done +pip install --pre \ + -r /tmp/requirements_sanitized/build.txt \ + -r /tmp/requirements_sanitized/cuda.txt \ + -r /tmp/requirements_sanitized/common.txt + +# Reinstall PyTorch nightly to override any accidental downgrade from requirements +echo "โ™ป๏ธ Ensuring PyTorch stays on nightly cu129 after requirements..." +pip install --pre --upgrade \ + torch torchvision torchaudio \ + --index-url https://download.pytorch.org/whl/nightly/cu129 + +# Optionally install xformers if requested; otherwise skip to avoid pin conflicts with torch nightlies. 
+if [ "${WITH_XFORMERS:-0}" = "1" ]; then + echo "โž• Installing xformers (may override torch constraints)..." + pip install --pre xformers -f https://download.pytorch.org/whl/nightly/cu129/torch_nightly.html || true +else + echo "โญ๏ธ Skipping xformers (set WITH_XFORMERS=1 to include)" +fi + +# Build environment tuning +export VLLM_TARGET_DEVICE=cuda +export SETUPTOOLS_SCM_PRETEND_VERSION="0.10.1.dev+cu129" +# Place large build/dependency artifacts on /opt/work to avoid small /tmp tmpfs exhaustion +export VLLM_BUILD_ROOT=${VLLM_BUILD_ROOT:-/opt/work} +export FETCHCONTENT_BASE_DIR="$VLLM_BUILD_ROOT/vllm-build/deps" +mkdir -p "$FETCHCONTENT_BASE_DIR" + +# ccache for faster rebuilds +export CCACHE_DIR=/home/vllmuser/.ccache +export CCACHE_MAXSIZE=10G +export PATH=/usr/lib64/ccache:$PATH +command -v ccache >/dev/null 2>&1 && ccache -s || true + +# Respect user-provided MAX_JOBS; otherwise derive a conservative default to avoid FA3 OOM (signal 9) +if [ -z "${MAX_JOBS:-}" ]; then + # Derive from available cores but cap to 4 and adjust for memory pressure + CORES=$(nproc 2>/dev/null || echo 4) + # Read MemTotal (kB); if < 32GB, use 2; if < 16GB use 1 + MEM_KB=$(grep -i MemTotal /proc/meminfo 2>/dev/null | awk '{print $2}') + if [ -n "$MEM_KB" ]; then + if [ "$MEM_KB" -lt 16000000 ]; then + MAX_JOBS=1 + elif [ "$MEM_KB" -lt 32000000 ]; then + MAX_JOBS=2 + else + MAX_JOBS=$(( CORES < 4 ? CORES : 4 )) + fi + else + MAX_JOBS=$(( CORES < 4 ? 
CORES : 4 )) + fi +fi +export MAX_JOBS + +# Allow an optional memory safe mode specifically for heavy FA3 compilation (can be toggled externally) +if [ "${FA3_MEMORY_SAFE_MODE:-0}" = "1" ]; then + echo "โš ๏ธ FA3_MEMORY_SAFE_MODE=1 -> Forcing MAX_JOBS=1 and NVCC_THREADS=1 to reduce peak RAM during compilation" + export MAX_JOBS=1 + export NVCC_THREADS=1 +else + # If user has not set NVCC_THREADS, keep it low (2) to reduce per-translation-unit memory usage + if [ -z "${NVCC_THREADS:-}" ]; then + export NVCC_THREADS=2 + fi +fi + +# We no longer pass custom CMAKE_ARGS that refer to removed/unsupported options (e.g. ENABLE_MACHETE) to avoid noise. +unset CMAKE_ARGS 2>/dev/null || true +# Enable ccache via CMake compiler launchers (C/C++/CUDA) and enable verbose messages +export CMAKE_ARGS="${CMAKE_ARGS:+$CMAKE_ARGS }-DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_RULE_MESSAGES=ON" +export NINJA_STATUS="[%f/%t %o/sec] " +export CMAKE_COLOR_DIAGNOSTICS=ON + +# By default we DO NOT disable FA3; user may export VLLM_DISABLE_FA3=1 before invoking this script to skip it. +if [ -z "${VLLM_DISABLE_FA3:-}" ]; then + export VLLM_DISABLE_FA3=0 +fi + +echo "๐Ÿ”ง Build environment configured:" +echo " TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST" +echo " MAX_JOBS: $MAX_JOBS" +echo " NVCC_THREADS: ${NVCC_THREADS:-unset}" +echo " FETCHCONTENT_BASE_DIR: $FETCHCONTENT_BASE_DIR" +echo " VLLM_DISABLE_FA3: $VLLM_DISABLE_FA3 (0=build FA3, 1=skip)" +echo " FA3_MEMORY_SAFE_MODE: ${FA3_MEMORY_SAFE_MODE:-0}" + +# Build and install vLLM +echo "๐Ÿ—๏ธ Building vLLM from source (no dependency resolution)..." 
+cd "$VLLM_SRC_DIR" +# Ensure pip/CMake use our larger build root for temp files +export TMPDIR="$VLLM_BUILD_ROOT/tmp" +export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:-$MAX_JOBS} +mkdir -p "$TMPDIR" 2>/dev/null || true +LOG_DST="$VLLM_SRC_DIR/extras/build.log" +mkdir -p "$(dirname "$LOG_DST")" 2>/dev/null || true +set -o pipefail; PIP_STATUS=0 # PIP_STATUS records the pip|tee pipeline result so it can be checked after cleanup +TIMEFORMAT='โฑ Build time: %3lR' +# Progress watcher is fully opt-in now (no auto-enable on TTY) +PROGRESS_WATCH=${PROGRESS_WATCH:-0} + +# Optional lightweight progress watcher: echoes lines like "[25/341] ..." as they appear +WATCH_PID="" +if [ "$PROGRESS_WATCH" = "1" ]; then + echo "๐Ÿช„ Progress watcher enabled (looking for [x/total] in build.log)" + ( + # tail -F waits for file to appear; --pid ensures it exits with this script + tail --pid=$$ -n +1 -F "$LOG_DST" 2>/dev/null | \ + awk 'match($0,/\[[0-9]+\/[0-9]+\]/){ + ts=strftime("%H:%M:%S"); + # print a compact, updating status line + printf("\r[%s] %s", ts, substr($0, RSTART, RLENGTH)); + fflush(stdout); + } END { print "" }' + ) & + WATCH_PID=$! +fi + +# Prefer line-buffered output for better streaming through tee if stdbuf exists +if command -v stdbuf >/dev/null 2>&1; then + time stdbuf -oL -eL pip install --no-build-isolation --no-deps -e . -vv | tee "$LOG_DST" || PIP_STATUS=$? +else + time pip install --no-build-isolation --no-deps -e . -vv | tee "$LOG_DST" || PIP_STATUS=$? +fi + +# Cleanup watcher so we leave the cursor nicely +if [ -n "${WATCH_PID}" ]; then + kill "$WATCH_PID" 2>/dev/null || true + echo "" >&2 +fi +echo "๐Ÿ“„ Build log: $LOG_DST" + +if [ "$PIP_STATUS" -eq 0 ]; then + echo "โœ… vLLM editable install completed successfully" +else + echo "โŒ Failed to install vLLM" + exit 1 +fi + +echo "" +echo "๐Ÿงช Testing vLLM installation..." +python -c "import vllm; print('vLLM version:', vllm.__version__)" + +echo "" +echo "๐ŸŽฎ Testing GPU support..." 
+python -c " +import torch +print('CUDA available:', torch.cuda.is_available()) +if torch.cuda.is_available(): + print('GPU count:', torch.cuda.device_count()) + try: + print('Current GPU:', torch.cuda.get_device_name(0)) + except Exception as e: + print('GPU name unavailable (container GPU access issue)') +else: + print('No GPU detected - check container GPU mounting') +" + +echo "" +echo "๐Ÿ“ vLLM Development Environment Ready!" +echo "======================================" +echo "Source code: /workspace" +echo "Virtual env: $VIRTUAL_ENV" +echo "GPU support: $(python -c 'import torch; print(torch.cuda.is_available())')" +echo "" +echo "๐Ÿ› ๏ธ Quick Commands:" +echo " python -c 'import vllm' # Test vLLM import" +echo " python -c 'import torch; print(torch.cuda.is_available())' # Test CUDA" +echo " nvidia-smi # Check GPU status" +echo "" +echo "๏ฟฝ Ready for vLLM development!" +echo "- Edit code: files are mounted from host" +echo "- Test changes: python -m pytest tests/" +echo "- Test environment: python /workspace/extras/final_environment_test.py" +echo "- Run vLLM: python -m vllm.entrypoints.openai.api_server" +echo "- SSH access: ssh vllmuser@localhost -p 2222 (password: vllmdev)" +echo "" +echo "โœจ Happy coding!" diff --git a/extras/old/run-vllm-dev.ps1 b/extras/old/run-vllm-dev.ps1 new file mode 100644 index 000000000000..55820ff7471d --- /dev/null +++ b/extras/old/run-vllm-dev.ps1 @@ -0,0 +1,6 @@ +#!/usr/bin/env pwsh +# Deprecated: please use extras/podman/run.ps1. This script forwards for back-compat. 
+param([Parameter(ValueFromRemainingArguments=$true)] [string[]]$Args) +$pod = Join-Path $PSScriptRoot 'podman\run.ps1' +if (-not (Test-Path $pod)) { Write-Error "Missing: $pod"; exit 1 } +& $pod @Args diff --git a/extras/old/run-vllm-dev.sh b/extras/old/run-vllm-dev.sh new file mode 100644 index 000000000000..b5a8a906ad06 --- /dev/null +++ b/extras/old/run-vllm-dev.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Deprecated: please use extras/podman/run.sh. This script forwards for back-compat. +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}" )" &>/dev/null && pwd) +exec "${SCRIPT_DIR}/podman/run.sh" "$@" diff --git a/extras/old/test-vllm-container.ps1 b/extras/old/test-vllm-container.ps1 new file mode 100644 index 000000000000..61852551c124 --- /dev/null +++ b/extras/old/test-vllm-container.ps1 @@ -0,0 +1,32 @@ +# vLLM Container Test Script +# Run this from the vLLM workspace directory + +Write-Host "๐Ÿš€ Testing vLLM Container Environment..." -ForegroundColor Green +Write-Host ("=" * 50) + +# Test 1: Basic container functionality +Write-Host "`n๐Ÿ“‹ Test 1: Container and GPU Access" -ForegroundColor Yellow +& podman run --rm --device=nvidia.com/gpu=all vllm-dev-fixed:v2 bash -c 'source /home/vllmuser/venv/bin/activate; cd /tmp; python -c "import torch; print(torch.cuda.is_available())"' + +if ($LASTEXITCODE -eq 0) { + Write-Host "โœ… Container and GPU access working!" -ForegroundColor Green +} else { + Write-Host "โŒ Container or GPU access failed!" -ForegroundColor Red + exit 1 +} + +# Test 2: vLLM installation +Write-Host "`n๐Ÿ“‹ Test 2: vLLM Installation" -ForegroundColor Yellow +& podman run --rm --device=nvidia.com/gpu=all vllm-dev-fixed:v2 bash -c 'source /home/vllmuser/venv/bin/activate; cd /tmp; python -c "import vllm; print(vllm.__version__)"' + +if ($LASTEXITCODE -eq 0) { + Write-Host "โœ… vLLM installation working!" -ForegroundColor Green +} else { + Write-Host "โŒ vLLM installation failed!" 
-ForegroundColor Red + exit 1 +} + +Write-Host "`n๐ŸŽ‰ SUCCESS: vLLM container environment is fully functional!" -ForegroundColor Green +Write-Host "`n๐Ÿ“– Usage:" -ForegroundColor Cyan +Write-Host ' podman run --rm -it --device=nvidia.com/gpu=all -v "${PWD}:/workspace" vllm-dev-fixed:v2' -ForegroundColor White +Write-Host "`n๐Ÿ“š Documentation: See CONTAINER_SETUP_COMPLETE.md for detailed usage guide" -ForegroundColor Cyan diff --git a/extras/patches/README.md b/extras/patches/README.md new file mode 100644 index 000000000000..ff4f662c4588 --- /dev/null +++ b/extras/patches/README.md @@ -0,0 +1,5 @@ +# Patches and plugins scaffolding + +- Place unified diffs (*.diff) here. +- Use `apply_patches.sh` to apply them before building. +- Optionally, add Python plugins under `plugin/` and load dynamically at runtime. diff --git a/extras/patches/apply_patches.sh b/extras/patches/apply_patches.sh new file mode 100644 index 000000000000..70437f0bd645 --- /dev/null +++ b/extras/patches/apply_patches.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +PATCH_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +ROOT_DIR=$(cd -- "${PATCH_DIR}/../.." &>/dev/null && pwd) + +shopt -s nullglob +PATCHES=(${PATCH_DIR}/*.diff) +shopt -u nullglob + +if [ ${#PATCHES[@]} -eq 0 ]; then + echo "[patches] No patches found; nothing to apply." + exit 0 +fi + +pushd "${ROOT_DIR}" >/dev/null +for p in "${PATCHES[@]}"; do + echo "[patches] Applying ${p}" + git apply --check "${p}" + git apply "${p}" + done +popd >/dev/null + +echo "[patches] Done." diff --git a/extras/podman/Containerfile b/extras/podman/Containerfile new file mode 100644 index 000000000000..d42bef4b344e --- /dev/null +++ b/extras/podman/Containerfile @@ -0,0 +1,11 @@ +# syntax=docker/dockerfile:1.7-labs + +# Delegator Containerfile. +# Build using the canonical Dockerfile in extras/ to avoid duplication. + +FROM scratch as noop + +# Usage: +# podman build -f extras/Dockerfile -t vllm-dev:latest . 
+# or from this folder (wrapper script does this for you): +# bash build.sh diff --git a/extras/podman/README.md b/extras/podman/README.md new file mode 100644 index 000000000000..fb0c361203f2 --- /dev/null +++ b/extras/podman/README.md @@ -0,0 +1,12 @@ +# Podman helpers for vLLM + +This folder contains Podman-specific wrappers. They preserve back-compat by calling the existing scripts in `extras/` when present. + +- Containerfile: Thin wrapper that defers to `extras/Dockerfile` by default. +- build.sh: Builds the image using values from `../configs/build.env`. +- entrypoint/: Optional entrypoint scripts used inside containers. +- scripts/: Utility helpers for Podman machine/GPU/volumes. + +See README for usage. + +Documentation: see `docs/contributing/podman-dev.md` for the Podman-first workflow and deprecation notes for legacy launchers. diff --git a/extras/podman/build.sh b/extras/podman/build.sh new file mode 100644 index 000000000000..a4ec5f445825 --- /dev/null +++ b/extras/podman/build.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Why: Back-compat wrapper that sources central config and builds using the canonical Dockerfile. + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +ROOT_DIR=$(cd -- "${SCRIPT_DIR}/../.." 
&>/dev/null && pwd) +CONFIG_DIR="${SCRIPT_DIR}/../configs" + +# shellcheck source=../configs/build.env +if [ -f "${CONFIG_DIR}/build.env" ]; then + # shellcheck disable=SC1091 + source "${CONFIG_DIR}/build.env" +fi + +CUDA_VERSION=${CUDA_VERSION:-12.9.1} +UBI_VERSION=${UBI_VERSION:-9} +VLLM_IMAGE_TAG=${VLLM_IMAGE_TAG:-"vllm-cuda${CUDA_VERSION}-ubi${UBI_VERSION}"} + +CONTEXT="${ROOT_DIR}" +DOCKERFILE_REL="extras/Dockerfile" + +echo "[podman/build] Building image ${VLLM_IMAGE_TAG} with CUDA=${CUDA_VERSION}, UBI=${UBI_VERSION}" + +podman build \ + --build-arg CUDA_VERSION="${CUDA_VERSION}" \ + --build-arg UBI_VERSION="${UBI_VERSION}" \ + -t "${VLLM_IMAGE_TAG}" \ + -f "${DOCKERFILE_REL}" \ + "${CONTEXT}" + +echo "[podman/build] Done -> ${VLLM_IMAGE_TAG}" diff --git a/extras/podman/dev-setup.sh b/extras/podman/dev-setup.sh new file mode 100644 index 000000000000..09eea6079a02 --- /dev/null +++ b/extras/podman/dev-setup.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Robust setup entrypoint: prefer extras/dev-setup.sh, fallback to extras/old/dev-setup.sh, +# otherwise use the image-provided /home/vllmuser/setup_vllm_dev.sh. +set -euo pipefail + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}" )" &>/dev/null && pwd) +EXTRAS_DIR=$(cd -- "${SCRIPT_DIR}/.." 
&>/dev/null && pwd) + +try_exec() { + local target="$1" + if [[ -f "$target" ]]; then + chmod +x "$target" 2>/dev/null || true + exec "$target" "$@" + fi +} + +# 1) Current canonical path +if [[ -f "${EXTRAS_DIR}/dev-setup.sh" ]]; then + chmod +x "${EXTRAS_DIR}/dev-setup.sh" 2>/dev/null || true + exec "${EXTRAS_DIR}/dev-setup.sh" "$@" +fi + +# 2) Legacy archived location +if [[ -f "${EXTRAS_DIR}/old/dev-setup.sh" ]]; then + chmod +x "${EXTRAS_DIR}/old/dev-setup.sh" 2>/dev/null || true + exec "${EXTRAS_DIR}/old/dev-setup.sh" "$@" +fi + +# 3) Fallback to image helper +if command -v /home/vllmuser/setup_vllm_dev.sh >/dev/null 2>&1 || [[ -f /home/vllmuser/setup_vllm_dev.sh ]]; then + exec /home/vllmuser/setup_vllm_dev.sh "$@" +fi + +echo "[setup] No setup script found at extras/dev-setup.sh or extras/old/dev-setup.sh, and no image helper present." >&2 +exit 1 diff --git a/extras/podman/run.ps1 b/extras/podman/run.ps1 new file mode 100644 index 000000000000..6724db007417 --- /dev/null +++ b/extras/podman/run.ps1 @@ -0,0 +1,180 @@ +#!/usr/bin/env pwsh +[CmdletBinding()] param( + [switch]$Build, + [switch]$Interactive, + [string]$Command = "", + [switch]$Setup, + [switch]$GPUCheck, + [switch]$Mirror, + [switch]$Recreate, + [string]$WorkVolume = "", + [string]$WorkDirHost = "", + [switch]$Progress, + [switch]$Help +) + +if ($Help) { + Write-Host "Usage: extras/podman/run.ps1 [-Build] [-Interactive] [-Command ] [-Setup] [-GPUCheck] [-Mirror] [-Recreate] [-WorkVolume ] [-WorkDirHost ] [-Progress]"; exit 0 +} + +if (-not $Interactive -and [string]::IsNullOrEmpty($Command) -and -not $GPUCheck -and -not $Setup) { $Interactive = $true } + +if (-not (Get-Command podman -ErrorAction SilentlyContinue)) { Write-Host "โŒ Podman not found in PATH" -ForegroundColor Red; exit 1 } + +$ContainerName = "vllm-dev" +$ImageTag = "vllm-dev:latest" +$SourceDir = (Get-Location).Path + +Write-Host "๐Ÿ‹ vLLM Dev Container (Podman)" -ForegroundColor Green + +if ($Build) { + Write-Host "๐Ÿ”จ 
Building image..." -ForegroundColor Yellow + $buildCmd = @("build","-f","extras/Dockerfile","-t",$ImageTag,".") + & podman @buildCmd + if ($LASTEXITCODE -ne 0) { Write-Host "โŒ Build failed" -ForegroundColor Red; exit 1 } + Write-Host "โœ… Build ok" -ForegroundColor Green +} + +# Already running? +$running = podman ps --filter "name=$ContainerName" --format "{{.Names}}" 2>$null + +if ($Recreate -and $running -eq $ContainerName) { + Write-Host "โ™ป๏ธ Removing existing container '$ContainerName'" -ForegroundColor Yellow + podman rm -f $ContainerName | Out-Null + $running = $null +} + +if ($running -eq $ContainerName) { + if ($GPUCheck) { + Write-Host "๐Ÿ” GPU check (existing container)" -ForegroundColor Yellow + $cmd = @' +source /home/vllmuser/venv/bin/activate && python - <<'PY' +import torch, os +print("PyTorch:", getattr(torch,"__version__","n/a")) +print("CUDA:", torch.cuda.is_available()) +print("Devices:", torch.cuda.device_count() if torch.cuda.is_available() else 0) +print("LD_LIBRARY_PATH:", os.environ.get("LD_LIBRARY_PATH")) +if torch.cuda.is_available(): + try: + print("GPU 0:", torch.cuda.get_device_name(0)) + except Exception as e: + print("GPU name error:", e) +PY +nvidia-smi || true +'@ + $cmd = "export NVIDIA_VISIBLE_DEVICES=all; " + $cmd + podman exec $ContainerName bash -lc $cmd + exit $LASTEXITCODE + } + if ($Setup) { + Write-Host "๐Ÿ”ง Running dev setup in existing container" -ForegroundColor Yellow + $envs = @() + if ($Mirror) { $envs += @('LOCAL_MIRROR=1') } + if ($Progress) { $envs += @('PROGRESS_WATCH=1') } + $envs += @('NVIDIA_VISIBLE_DEVICES=all') + $envStr = ($envs | ForEach-Object { "export $_;" }) -join ' ' + $cmd = "$envStr chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh" + if ($Progress) { podman exec -it $ContainerName bash -lc $cmd } else { podman exec $ContainerName bash -lc $cmd } + exit $LASTEXITCODE + } + if ($Command) { + Write-Host "๐Ÿš€ Running command in existing container" -ForegroundColor Green 
+ $runCmd = "source /home/vllmuser/venv/bin/activate && $Command" + podman exec $ContainerName bash -c $runCmd + exit $LASTEXITCODE + } + $resp = Read-Host "Attach to running container? [Y/n]" + if ($resp -eq "" -or $resp -match '^[Yy]$') { podman exec -it $ContainerName bash; exit $LASTEXITCODE } else { exit 0 } +} + +# Ensure image exists +podman image exists $ImageTag +if ($LASTEXITCODE -ne 0) { Write-Host "โŒ Image missing. Use -Build." -ForegroundColor Red; exit 1 } + +# Base args (no default /tmp tmpfs; can be enabled via VLLM_TMPFS_TMP_SIZE) +$runArgs = @("run","--rm","--security-opt=label=disable","--shm-size","8g","-v","${SourceDir}:/workspace:Z") +if (-not [string]::IsNullOrWhiteSpace($WorkVolume)) { $runArgs += @('-v',"${WorkVolume}:/opt/work:Z") } +elseif ($WorkDirHost -and (Test-Path $WorkDirHost)) { $runArgs += @('-v',"${WorkDirHost}:/opt/work:Z") } +$runArgs += @('-w','/workspace','--name',"$ContainerName",'--user','vllmuser','--env','ENGINE=podman') + +$tmpfsSize = [Environment]::GetEnvironmentVariable('VLLM_TMPFS_TMP_SIZE') +if (-not [string]::IsNullOrEmpty($tmpfsSize) -and $tmpfsSize -ne '0') { $runArgs += @('--tmpfs',"/tmp:size=$tmpfsSize") } + +if ($true) { # Request GPU via CDI hooks + $runArgs = @("run","--rm","--security-opt=label=disable","--device=nvidia.com/gpu=all") + $runArgs[2..($runArgs.Length-1)] +} + +# WSL GPU: map /dev/dxg and mount WSL libs +$runArgs += @('--device','/dev/dxg','-v','/usr/lib/wsl:/usr/lib/wsl:ro') +if ($Mirror) { $runArgs += @('--env','LOCAL_MIRROR=1') } +foreach ($ev in 'NVIDIA_VISIBLE_DEVICES','NVIDIA_DRIVER_CAPABILITIES','NVIDIA_REQUIRE_CUDA') { + $val = [Environment]::GetEnvironmentVariable($ev) + if ($val) { $runArgs += @('--env',"$ev=$val") } +} +$runArgs += @('--env','ENGINE=podman','--env','NVIDIA_VISIBLE_DEVICES=all','--env','NVIDIA_DRIVER_CAPABILITIES=compute,utility','--env','NVIDIA_REQUIRE_CUDA=') + +if ($GPUCheck) { + $pyDiag = @' +import json, torch, os +out = { + "torch_version": getattr(torch, 
"__version__", "n/a"), + "torch_cuda_version": getattr(getattr(torch, "version", None), "cuda", "n/a"), + "cuda_available": torch.cuda.is_available(), + "ld_library_path": os.environ.get("LD_LIBRARY_PATH"), +} +try: + out["device_count"] = torch.cuda.device_count() +except Exception as e: + out["device_count_error"] = str(e) +if out["cuda_available"] and out.get("device_count", 0) > 0: + try: + cap = torch.cuda.get_device_capability(0) + out["device_0"] = {"name": torch.cuda.get_device_name(0), "capability": f"sm_{cap[0]}{cap[1]}"} + except Exception as e: + out["device_0_error"] = str(e) +else: + out["diagnostics"] = ["Missing /dev/nvidia* or podman machine without GPU passthrough"] +print(json.dumps(out, indent=2)) +'@ + $pyB64 = [Convert]::ToBase64String([Text.Encoding]::UTF8.GetBytes($pyDiag)) + $gpuScript = @' +echo '=== GPU Check ===' +which nvidia-smi && nvidia-smi || echo 'nvidia-smi unavailable' +echo '--- /dev/nvidia* ---' +ls -l /dev/nvidia* 2>/dev/null || echo 'no /dev/nvidia* nodes' +echo '--- Environment (NVIDIA_*) ---' +env | grep -E '^NVIDIA_' || echo 'no NVIDIA_* env vars' +if [ "$NVIDIA_VISIBLE_DEVICES" = "void" ]; then echo 'WARN: NVIDIA_VISIBLE_DEVICES=void (no GPU mapped)'; fi +echo '--- LD_LIBRARY_PATH ---' +echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" +source /home/vllmuser/venv/bin/activate 2>/dev/null || true +echo __PY_B64__ | base64 -d > /tmp/gpucheck.py +python /tmp/gpucheck.py || true +rm -f /tmp/gpucheck.py +'@ + $gpuScript = "export NVIDIA_VISIBLE_DEVICES=all; export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/lib/wsl/drivers:`$LD_LIBRARY_PATH; " + ($gpuScript -replace '__PY_B64__', $pyB64) -replace "`r","" + $runArgs += @('--user','root', $ImageTag,'bash','-lc',$gpuScript) +} elseif ($Setup) { + # Use robust setup entrypoint that finds the right script (extras/dev-setup.sh, extras/old/dev-setup.sh, or image helper) + $prefix = "chmod +x ./extras/podman/dev-setup.sh 2>/dev/null || true; " + $envPrefix = '' + if ($Mirror) { $envPrefix += 'export 
LOCAL_MIRROR=1; ' } + if ($Progress) { $envPrefix += 'export PROGRESS_WATCH=1; ' } + $envPrefix += 'export TMPDIR=/opt/work/tmp; export TMP=/opt/work/tmp; export TEMP=/opt/work/tmp; mkdir -p /opt/work/tmp; ' + $setupCmd = $prefix + $envPrefix + "./extras/podman/dev-setup.sh" + if ($Progress) { $runArgs += @('-it', $ImageTag, 'bash','-lc', $setupCmd) } else { $runArgs += @($ImageTag, 'bash','-lc', $setupCmd) } + Write-Host "๐Ÿ”ง Running dev setup" -ForegroundColor Green +} elseif ($Interactive -and -not $Command) { + $runArgs += @('-it',$ImageTag,'bash') + Write-Host "๐Ÿš€ Interactive shell" -ForegroundColor Green +} elseif ($Command) { + $runArgs += @($ImageTag,'bash','-lc',"source /home/vllmuser/venv/bin/activate && $Command") + Write-Host "๐Ÿš€ Running command" -ForegroundColor Green +} else { + $runArgs += @($ImageTag) +} + +Write-Host "Command: podman $($runArgs -join ' ')" -ForegroundColor Gray +& podman @runArgs + +if ($LASTEXITCODE -eq 0 -and $Interactive) { Write-Host "Exited cleanly" -ForegroundColor Green } diff --git a/extras/podman/run.sh b/extras/podman/run.sh new file mode 100644 index 000000000000..ddafbcc578d0 --- /dev/null +++ b/extras/podman/run.sh @@ -0,0 +1,158 @@ +#!/usr/bin/env bash +# Unified lightweight vLLM dev container launcher (Podman-first, Linux/macOS) +set -euo pipefail + +IMAGE_TAG="vllm-dev:latest" +CONTAINER_NAME="vllm-dev" +SOURCE_DIR="$(pwd)" + +show_help() { + cat <&2; show_help; exit 1 ;; + esac +done + +if ! command -v podman >/dev/null 2>&1; then + echo "Error: podman not found in PATH" >&2 + exit 1 +fi + +echo "[vLLM] Engine: podman Image: $IMAGE_TAG Container: $CONTAINER_NAME" + +if [[ $BUILD -eq 1 ]]; then + echo "[vLLM] Building image..." + if ! 
podman build -f extras/Dockerfile -t "$IMAGE_TAG" .; then + echo "[vLLM] Build failed" >&2 + exit 1 + fi + echo "[vLLM] Build complete" +fi + +# If container running, attach / exec +RUNNING=$(podman ps --filter "name=${CONTAINER_NAME}" --format '{{.Names}}' 2>/dev/null || true) + +if [[ "$RUNNING" == "$CONTAINER_NAME" ]]; then + if [[ $GPU_CHECK -eq 1 ]]; then + echo "[vLLM] GPU check (existing container)" + exec podman exec "$CONTAINER_NAME" bash -lc 'source /home/vllmuser/venv/bin/activate 2>/dev/null || true; which nvidia-smi && nvidia-smi || true; python - </dev/null || true; ./extras/dev-setup.sh' + else + exec podman exec "$CONTAINER_NAME" bash -lc 'chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh' + fi + fi + if [[ -n "$CMD" ]]; then + echo "[vLLM] Exec command in existing container" + podman exec "$CONTAINER_NAME" bash -lc "source /home/vllmuser/venv/bin/activate 2>/dev/null || true; $CMD" + exit $? + fi + read -r -p "Attach to running container ${CONTAINER_NAME}? [Y/n] " RESP || true + if [[ -z "$RESP" || "$RESP" =~ ^[Yy]$ ]]; then + exec podman exec -it "$CONTAINER_NAME" bash + else + exit 0 + fi +fi + +# Ensure image exists if not building +if [[ $BUILD -ne 1 ]]; then + if ! podman image exists "$IMAGE_TAG"; then + echo "Image $IMAGE_TAG missing. Use --build." 
>&2; exit 1 + fi +fi + +# Base run args +RUN_ARGS=(run --rm --device=nvidia.com/gpu=all --security-opt=label=disable --shm-size 8g --name "$CONTAINER_NAME" -v "${SOURCE_DIR}:/workspace:Z" -w /workspace --user vllmuser --env ENGINE=podman) + +# Prefer named volume for /opt/work if provided +if [[ -n "$WORK_VOLUME" ]]; then + RUN_ARGS+=(-v "${WORK_VOLUME}:/opt/work:Z") +fi + +# Allow configurable /tmp tmpfs size via VLLM_TMPFS_TMP_SIZE (default 0=disabled) +TMPFS_SIZE="${VLLM_TMPFS_TMP_SIZE:-0}" +if [[ -n "$TMPFS_SIZE" && "$TMPFS_SIZE" != "0" ]]; then + RUN_ARGS+=(--tmpfs "/tmp:size=${TMPFS_SIZE}") +fi + +# Ensure sane NVIDIA env defaults inside container to avoid 'void' and missing caps +RUN_ARGS+=(--env "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all}" \ + --env "NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-compute,utility}" \ + --env "NVIDIA_REQUIRE_CUDA=") + +if [[ $GPU_CHECK -eq 1 ]]; then + GPU_SCRIPT=$'echo "=== GPU Check ==="; which nvidia-smi && nvidia-smi || echo "nvidia-smi unavailable"; echo "--- /dev/nvidia* ---"; ls -l /dev/nvidia* 2>/dev/null || echo "no /dev/nvidia* nodes"; echo "--- Environment (NVIDIA_*) ---"; env | grep -E "^NVIDIA_" || echo "no NVIDIA_* env vars"; if [ "$NVIDIA_VISIBLE_DEVICES" = "void" ]; then echo "WARN: NVIDIA_VISIBLE_DEVICES=void (no GPU mapped)"; fi; echo "--- LD_LIBRARY_PATH ---"; echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"; source /home/vllmuser/venv/bin/activate 2>/dev/null || true; python - <0:\n\ttry:\n\t\tcap=torch.cuda.get_device_capability(0)\n\t\tout[\'device_0\']={\'name\':torch.cuda.get_device_name(0),\'capability\':f"sm_{cap[0]}{cap[1]}"}\n\texcept Exception as e:\n\t\tout[\'device_0_error\']=str(e)\nelse:\n\tout[\'diagnostics\']=[\'Missing /dev/nvidia* or podman machine without GPU passthrough\']\nprint(json.dumps(out,indent=2))\nPY' + RUN_ARGS+=("$IMAGE_TAG" bash -lc "$GPU_SCRIPT") +elif [[ $SETUP -eq 1 ]]; then + if [[ $MIRROR -eq 1 ]]; then + RUN_ARGS+=(--env LOCAL_MIRROR=1) + fi + if [[ 
$PROGRESS -eq 1 ]]; then + RUN_ARGS+=(--env PROGRESS_WATCH=1) + RUN_ARGS+=("-it" "$IMAGE_TAG" bash -lc 'chmod +x ./extras/podman/dev-setup.sh 2>/dev/null || true; ./extras/podman/dev-setup.sh') + else + RUN_ARGS+=("$IMAGE_TAG" bash -lc 'chmod +x ./extras/podman/dev-setup.sh 2>/dev/null || true; ./extras/podman/dev-setup.sh') + fi +elif [[ -n "$CMD" ]]; then + RUN_ARGS+=("$IMAGE_TAG" bash -lc "source /home/vllmuser/venv/bin/activate 2>/dev/null || true; $CMD") +else + RUN_ARGS+=("-it" "$IMAGE_TAG" bash) + echo "[vLLM] Interactive shell. Helpful inside container:" + echo " ./extras/dev-setup.sh # Build/install editable vLLM" + echo " python -c 'import torch;print(torch.cuda.is_available())'" + echo " python -c 'import vllm'" +fi + +echo "[vLLM] Command: podman ${RUN_ARGS[*]}" +exec podman "${RUN_ARGS[@]}" diff --git a/extras/podman/scripts/gpu_status.sh b/extras/podman/scripts/gpu_status.sh new file mode 100644 index 000000000000..a50c78b01c03 --- /dev/null +++ b/extras/podman/scripts/gpu_status.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Helper to show GPU/CDI status under Podman (Linux/WSL) + +podman info --format json | jq '.host' || podman info || true + +# Show CDI devices if available +podman cdi list || true diff --git a/extras/secrets/.gitignore b/extras/secrets/.gitignore new file mode 100644 index 000000000000..d4895ec18947 --- /dev/null +++ b/extras/secrets/.gitignore @@ -0,0 +1,4 @@ +# Ensure this directory stays out of git; keep this file only. +* +!.gitignore +!README.md diff --git a/extras/secrets/README.md b/extras/secrets/README.md new file mode 100644 index 000000000000..ec4e155665e8 --- /dev/null +++ b/extras/secrets/README.md @@ -0,0 +1,11 @@ +# secrets directory + +This directory is gitignored and intended for local-only secret material such as model hub tokens. + +Files are expected to be simple KEY=VALUE lines that can be sourced by shell scripts. 
+ +Examples: +- hf-credentials.env +- cn-modelhub-credentials.env + +Do NOT commit secrets. See README for details. diff --git a/extras/storage/README.md b/extras/storage/README.md new file mode 100644 index 000000000000..d106b6d7378c --- /dev/null +++ b/extras/storage/README.md @@ -0,0 +1,7 @@ +# Storage helpers + +Declare and manage external volumes for models and caches. + +- storage-config.yaml: Declarative host/container paths +- setup_local.sh: Helper to prepare a local volume or directory +- scripts/: Utilities for warmup, cache management, mounts diff --git a/extras/storage/scripts/warm_cache.sh b/extras/storage/scripts/warm_cache.sh new file mode 100644 index 000000000000..1d97b7f044f6 --- /dev/null +++ b/extras/storage/scripts/warm_cache.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Placeholder for cache warmup logic. +# Example usage: ./warm_cache.sh meta-llama/Llama-3-8B /models +MODEL_ID=${1:-meta-llama/Llama-3-8B} +TARGET=${2:-/models} +mkdir -p "$TARGET" +echo "(scaffold) Would warm cache for $MODEL_ID under $TARGET" diff --git a/extras/storage/setup_local.sh b/extras/storage/setup_local.sh new file mode 100644 index 000000000000..101826bc7396 --- /dev/null +++ b/extras/storage/setup_local.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Prepare a local directory for models and ensure reasonable permissions. 
+TARGET=${1:-/mnt/ml-models} +mkdir -p "$TARGET" +chmod 775 "$TARGET" || true + +echo "Model storage prepared at: $TARGET" diff --git a/extras/storage/storage-config.yaml b/extras/storage/storage-config.yaml new file mode 100644 index 000000000000..90310b572b3c --- /dev/null +++ b/extras/storage/storage-config.yaml @@ -0,0 +1,4 @@ +model_volume: + path_host: "/mnt/ml-models" + path_container: "/models" + shared: true diff --git a/extras/testing/README.md b/extras/testing/README.md new file mode 100644 index 000000000000..2c64d538ac97 --- /dev/null +++ b/extras/testing/README.md @@ -0,0 +1,7 @@ +# Testing and benchmarking harness + +- Define a matrix of models/environments in `test_matrix.yaml`. +- Run via `python extras/testing/run_tests.py --output-dir extras/testing/results/$(date +%F_%H-%M)`. +- Store results in `results/` with timestamps for regression tracking. + +This scaffolding is intentionally minimal; models and benchmarks can be added incrementally. diff --git a/extras/testing/compare_results.py b/extras/testing/compare_results.py new file mode 100644 index 000000000000..628e08e7d72c --- /dev/null +++ b/extras/testing/compare_results.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +from __future__ import annotations +import argparse +import json +import os +import sys + +def load(path: str) -> dict: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + +def main() -> int: + p = argparse.ArgumentParser() + p.add_argument("a") + p.add_argument("b") + args = p.parse_args() + + A = load(args.a) + B = load(args.b) + + # Placeholder comparison: print keys that differ + diffs = sorted(set(A.keys()) ^ set(B.keys())) + print(json.dumps({"diff_keys": diffs})) + return 0 + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/extras/testing/run_tests.py b/extras/testing/run_tests.py new file mode 100644 index 000000000000..0e58573bb8d0 --- /dev/null +++ b/extras/testing/run_tests.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +""" +Minimal, 
non-destructive test harness that prints a JSON line per test. +This is a scaffold; integrate with your local launchers or CI as needed. +""" +from __future__ import annotations +import argparse +import json +import os +import sys +from datetime import datetime + + +def main() -> int: + p = argparse.ArgumentParser() + p.add_argument("--cuda-version", default=os.getenv("CUDA_VERSION", "12.9.1")) + p.add_argument("--ubi-version", default=os.getenv("UBI_VERSION", "9.4")) + p.add_argument("--models", default="Example-Llama3-8B") + p.add_argument("--output-dir", default=os.path.join("extras", "testing", "results", datetime.now().strftime("%F_%H-%M"))) + args = p.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + result = { + "ts": datetime.utcnow().isoformat() + "Z", + "cuda": args.cuda_version, + "ubi": args.ubi_version, + "models": args.models.split(","), + "status": "scaffold", + "notes": "Integrate with vLLM server/client to collect real metrics.", + } + + out_path = os.path.join(args.output_dir, "scaffold.json") + with open(out_path, "w", encoding="utf-8") as f: + json.dump(result, f, indent=2) + + print(json.dumps({"written": out_path})) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/extras/testing/test_matrix.yaml b/extras/testing/test_matrix.yaml new file mode 100644 index 000000000000..270e7ff5ec13 --- /dev/null +++ b/extras/testing/test_matrix.yaml @@ -0,0 +1,16 @@ +models: + - name: Example-Llama3-8B + id: meta-llama/Llama-3-8B + chat_template: chat_templates/llama-3-instruct.jinja + params: + max_tokens: 64 + temperature: 0.7 + +environments: + - cuda: 12.9.1 + ubi: 9.4 + +benchmarks: + - name: inference_speed + input: "Summarize: vLLM extras modularization plan." + metrics: [latency_ms, tokens_per_sec]