diff --git a/.gemini/config.yaml b/.gemini/config.yaml
deleted file mode 100644
index 2499d3f09510..000000000000
--- a/.gemini/config.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
-have_fun: false # Just review the code
-code_review:
-  comment_severity_threshold: HIGH # Reduce quantity of comments
-  pull_request_opened:
-    summary: false # Don't summarize the PR in a separate comment
diff --git a/.github/ci-trigger-20250814-1 b/.github/ci-trigger-20250814-1
new file mode 100644
index 000000000000..8ca993aa58b2
--- /dev/null
+++ b/.github/ci-trigger-20250814-1
@@ -0,0 +1 @@
+trigger: sync_with_upstream
diff --git a/.github/workflows/sync_with_upstream.yml b/.github/workflows/sync_with_upstream.yml
new file mode 100644
index 000000000000..5dce797dae16
--- /dev/null
+++ b/.github/workflows/sync_with_upstream.yml
@@ -0,0 +1,99 @@
+name: Sync with Upstream
+
+on:
+  schedule:
+    - cron: '0 0 * * *'  # Runs daily at midnight UTC
+  push:
+    branches:
+      - main
+
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Git
+        run: |
+          git config --global user.name 'Zhuul'
+          git config --global user.email '40538530+Zhuul@users.noreply.github.com'
+
+      - name: Add upstream remote
+        run: git remote add upstream https://github.com/vllm-project/vllm.git
+
+      - name: Fetch upstream changes
+        run: git fetch upstream
+
+      - name: Merge upstream changes
+        id: merge
+        run: |
+          git checkout main
+          git merge upstream/main --allow-unrelated-histories --no-edit || {
+            echo "Merge conflict detected. Creating a new branch for manual resolution."
+            BRANCH="merge-conflict-$(date +%Y%m%d%H%M%S)"
+            git checkout -b "$BRANCH"
+            # Push the conflict branch so it can be resolved manually
+            git push origin HEAD
+            echo "conflict=true" >> "$GITHUB_OUTPUT"
+            echo "conflict_branch=$BRANCH" >> "$GITHUB_OUTPUT"
+            exit 1
+          }
+          echo "conflict=false" >> "$GITHUB_OUTPUT"
+
+      - name: Check for workflow file changes
+        id: workflow_change
+        run: |
+          if git diff --name-only upstream/main | grep '^\.github/workflows/'; then
+            echo "workflow_changed=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "workflow_changed=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      # Use GITHUB_TOKEN for authentication, sufficient for repo write access in actions
+      - name: Set up authentication
+        run: git remote set-url origin "https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/Zhuul/vllm.git"
+
+      - name: Push changes if no workflow files changed
+        if: steps.workflow_change.outputs.workflow_changed == 'false' && steps.merge.outputs.conflict == 'false'
+        run: git push origin main
+
+      - name: Create Pull Request for workflow file changes
+        if: steps.workflow_change.outputs.workflow_changed == 'true' && steps.merge.outputs.conflict == 'false'
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          commit-message: "Sync with upstream: update workflow files"
+          title: "Sync with upstream: update workflow files"
+          body: |
+            This PR was automatically created because workflow files were updated while syncing with upstream.
+            Please review and merge.
+          branch: workflow-sync-${{ github.run_id }}
+          base: main
+
+      # Notification step: create an issue if merge conflict detected.
+      # The merge step exits non-zero on conflict, so this step must opt in
+      # with failure(); otherwise it would be skipped when the job fails.
+      - name: Create Issue on Merge Conflict
+        if: failure() && steps.merge.outputs.conflict == 'true'
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: "Merge Conflict Detected During Upstream Sync",
+              body: `
+              A merge conflict occurred while syncing with upstream (vllm-project/vllm).
+              Branch for manual resolution: ${{ steps.merge.outputs.conflict_branch }}
+
+              Please resolve this conflict at https://github.com/${context.repo.owner}/${context.repo.repo}/tree/${{ steps.merge.outputs.conflict_branch }}
+
+              This issue was automatically created by the sync workflow.
+              `
+            })
+
+      - name: Log completion
+        run: echo "Sync with upstream completed. Thank you for using automated upstream sync πŸš€"
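As context for the step wiring above: the merge step communicates with later steps by appending `key=value` pairs to the file named by `$GITHUB_OUTPUT`, which Actions then exposes as `steps.merge.outputs.*`. A minimal sketch of that mechanism, runnable outside Actions by pointing `GITHUB_OUTPUT` at a scratch file (the `mktemp` fallback is only for local testing):

```bash
# Minimal sketch of the step-output mechanism used by the merge step above.
# Outside GitHub Actions, GITHUB_OUTPUT is unset; point it at a temp file.
export GITHUB_OUTPUT="${GITHUB_OUTPUT:-$(mktemp)}"

# What the workflow writes on the conflict path:
BRANCH="merge-conflict-$(date +%Y%m%d%H%M%S)"
echo "conflict=true" >> "$GITHUB_OUTPUT"
echo "conflict_branch=$BRANCH" >> "$GITHUB_OUTPUT"

# Later steps read these back as steps.merge.outputs.conflict / conflict_branch.
cat "$GITHUB_OUTPUT"
```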
diff --git a/.gitignore b/.gitignore
index 465935d488f8..a5bd3740e844 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,4 +209,5 @@ shellcheck*/
 csrc/moe/marlin_moe_wna16/kernel_*
 
 # Ignore ep_kernels_workspace folder
-ep_kernels_workspace/
\ No newline at end of file
+ep_kernels_workspace/node_modules/
+package*.json
diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu
index f051eb070222..be7be6214987 100644
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -30,7 +30,7 @@ __global__ void rms_norm_kernel(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum());
 
   if (threadIdx.x == 0) {
     s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -85,7 +85,7 @@ fused_add_rms_norm_kernel(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum());
 
   if (threadIdx.x == 0) {
     s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -126,7 +126,7 @@ fused_add_rms_norm_kernel(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum());
 
   if (threadIdx.x == 0) {
     s_variance = rsqrtf(variance / hidden_size + epsilon);
diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu
index 0fd5849d9626..6427396471e2 100644
--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@@ -39,7 +39,7 @@ __global__ void rms_norm_static_fp8_quant_kernel(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum());
 
   if (threadIdx.x == 0) {
     s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -100,7 +100,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum());
 
   if (threadIdx.x == 0) {
     s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -149,7 +149,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum());
 
   if (threadIdx.x == 0) {
     s_variance = rsqrtf(variance / hidden_size + epsilon);
diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
index d8369108d0bd..f4a646471c28 100644
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -173,7 +173,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
   });
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage tmp;
-  float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
+  float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max());
   __shared__ float absmax;
   if (tid == 0) {
     absmax = block_max;
diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu
index 5fe5dd04bd89..ae7d0d81eb4c 100644
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@@ -116,7 +116,7 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage tmp;
   const float block_max =
-      BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, blockDim.x);
+      BlockReduce(tmp).Reduce(absmax_val, cub::Max());
 
   __shared__ float token_scale;
   if (tid == 0) {
diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh
index 3f188872d80d..b7cc1f0a0b5f 100644
--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
+++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
@@ -36,7 +36,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
+  ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum());
 
   __shared__ float s_rms;
   if (threadIdx.x == 0) {
@@ -73,7 +73,7 @@ __device__ void compute_dynamic_per_token_scales(
   __shared__ typename BlockReduce::TempStorage reduceStore;
   block_absmax_val_maybe = BlockReduce(reduceStore)
-          .Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
+          .Max(block_absmax_val_maybe);
 
   __shared__ float s_token_scale;
   if (threadIdx.x == 0) {
@@ -169,7 +169,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
+  ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum());
 
   __shared__ float s_rms;
   if (threadIdx.x == 0) {
@@ -240,7 +240,7 @@ __device__ void compute_dynamic_per_token_scales(
   __shared__ typename BlockReduce::TempStorage reduceStore;
   block_absmax_val_maybe = BlockReduce(reduceStore)
-          .Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
+          .Max(block_absmax_val_maybe);
 
   __shared__ float s_token_scale;
   if (threadIdx.x == 0) {
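All the csrc changes above make the same substitution around `cub::BlockReduce`. A standalone sketch of the call forms involved (not vLLM code; a 1024-thread block is assumed, matching the kernels above): the old call passed a `cub::Sum{}`/`cub::Max{}` functor plus a valid-items count, while the convenience methods like `Sum()` sidestep the functor entirely, which is what the CUDA 13 patch in this PR falls back to.

```cpp
// Standalone sketch (not vLLM code) of the BlockReduce call forms touched above.
// Assumes a 1024-thread block, as in the kernels in this PR.
#include <cub/block/block_reduce.cuh>

__global__ void block_sum_demo(const float* in, float* out) {
  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage tmp;

  float v = in[threadIdx.x];
  // Old form in vLLM: Reduce(v, cub::Sum{}, blockDim.x) — functor plus a
  // valid-items count. This PR switches to Reduce(v, cub::Sum()) or, in the
  // CUDA 13 patch, to the convenience method below, which never names the
  // cub::Sum functor at all.
  float total = BlockReduce(tmp).Sum(v);
  if (threadIdx.x == 0) *out = total;
}
```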
diff --git a/docs/contributing/README.md b/docs/contributing/README.md
index 5a2a70d57e85..1c2a31cf895c 100644
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@@ -31,6 +31,8 @@ See .
 
 Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source][build-from-source] documentation for details.
 
+For a containerized developer workflow, see Podman-first dev: `docs/contributing/podman-dev.md`.
+
 For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
 
 ### Building the docs with MkDocs
diff --git a/docs/contributing/podman-dev.md b/docs/contributing/podman-dev.md
new file mode 100644
index 000000000000..881e495f8421
--- /dev/null
+++ b/docs/contributing/podman-dev.md
@@ -0,0 +1,41 @@
+---
+title: Podman-first Development Environment
+---
+
+This guide documents the Podman-first development workflow for building vLLM from source with CUDA and PyTorch nightly.
+
+Primary entrypoint
+
+- Windows (PowerShell): `./extras/podman/run.ps1`
+- Linux/macOS (bash): `extras/podman/run.sh`
+
+Legacy launchers at `extras/run-vllm-dev.ps1` and `extras/run-vllm-dev.sh` are deprecated and forward to the Podman wrappers.
+
+Prerequisites
+
+- Podman with GPU CDI enabled (on Windows, use Podman Desktop + WSL; ensure NVIDIA drivers and CUDA are installed on the host).
+- Optional named volume for build/work space, e.g., `vllm-work`.
+
+Quick start
+
+Windows (PowerShell):
+
+```powershell
+./extras/podman/run.ps1 -Build
+./extras/podman/run.ps1 -GPUCheck
+./extras/podman/run.ps1 -Setup -WorkVolume vllm-work -Progress
+```
+
+Linux/macOS (bash):
+
+```bash
+extras/podman/run.sh --build
+extras/podman/run.sh --gpu-check
+extras/podman/run.sh --setup --work-volume vllm-work --progress
+```
+
+Notes
+
+- The image defaults to CUDA 13.0 on a Rocky Linux 9 base and installs the matching PyTorch nightly (cu130) first to ensure latest GPU arch support (including sm_120 when present).
+- The setup step performs an editable vLLM install without downgrading torch family packages.
+- Use a named Podman volume for `/opt/work` to avoid `/tmp` tmpfs pressure and to speed up rebuilds.
diff --git a/extras/.dockerignore b/extras/.dockerignore
new file mode 100644
index 000000000000..60a8d81a82c1
--- /dev/null
+++ b/extras/.dockerignore
@@ -0,0 +1,39 @@
+# Reduce build context to avoid Windows Podman tar write issues
+.git
+.github
+.vscode
+.venv
+venv
+node_modules
+build
+dist
+csrc/
+vllm/
+benchmarks/
+docs/
+examples/
+tests/
+**/__pycache__
+**/*.pyc
+**/*.pyo
+**/*.pyd
+**/*.so
+**/*.o
+**/*.a
+**/*.dll
+**/*.dylib
+extras/build.log
+extras/*.bak
+extras/tools/
+extras/run-vllm-dev-*.ps1
+extras/run-vllm-dev-*.sh
+extras/*wsl*
+extras/*docker*.ps1
+
+!extras/Dockerfile
+!extras/run-vllm-dev.ps1
+!extras/run-vllm-dev.sh
+!extras/dev-setup.sh
+requirements/
+pyproject.toml
+setup.py
diff --git a/extras/Dockerfile b/extras/Dockerfile
new file mode 100644
index 000000000000..69ee583e5bb7
--- /dev/null
+++ b/extras/Dockerfile
@@ -0,0 +1,266 @@
+# vLLM Development Container with GPU Support
+# Uses vLLM's own requirements for automatic dependency management
+
+# Build-time args to control CUDA/OS base and PyTorch nightly index
+ARG CUDA_VERSION=13.0.0
+ARG UBI_VERSION=9
+ARG TORCH_CUDA_INDEX=cu130
+# Base flavor for CUDA image: e.g. 
'rockylinux9' (default) or 'ubi9' +ARG BASE_FLAVOR=rockylinux9 + +# Switchable base: defaults to Rocky Linux to avoid subscription-gated repos +FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-${BASE_FLAVOR} + +# Set CUDA environment variables for build tools +ENV CUDA_HOME=/usr/local/cuda +ENV CUDA_ROOT=/usr/local/cuda +ENV PATH=$CUDA_HOME/bin:$PATH +ENV LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH +ENV CUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME +ENV CUDNN_LIBRARY_PATH=/usr/lib64 +ENV CUDNN_INCLUDE_PATH=/usr/include + +# Install system packages with additional CUDA development libraries +RUN dnf update -y && dnf install --allowerasing -y \ + python3 python3-pip python3-devel \ + git gcc gcc-c++ cmake \ + make patch which findutils tar rsync \ + wget curl vim nano pkgconfig \ + zlib-devel bzip2 bzip2-devel xz xz-devel libffi-devel \ + openssl-devel sqlite-devel \ + && (dnf install -y readline-devel || true) \ + && dnf clean all + +# Prefer Python 3.12 from packages if available (fallback to system python3) +RUN dnf install -y python3.12 python3.12-devel || true + +## Ensure /usr/bin/python exists for compatibility +RUN ln -sf $(command -v python3) /usr/bin/python || true + +# Create a non-root user for development +RUN useradd -m -s /bin/bash vllmuser && \ + echo "vllmuser ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + +# Install essential system tools +RUN dnf install -y hostname iproute iputils + +ARG REQUIRE_FFMPEG=1 +# Multimedia and image libs with optional ffmpeg-devel enforcement +# Install EPEL and RPM Fusion repos for EL (9/10) and pull ffmpeg/ffmpeg-devel from there. +# When REQUIRE_FFMPEG=1, fail the build if ffmpeg is still unavailable. +RUN set -euxo pipefail \ + && (dnf install -y dnf-plugins-core || true) \ + && (dnf config-manager --set-enabled crb || true) \ + && (dnf makecache -y || true) \ + && . 
/etc/os-release \ + && ELVER="${VERSION_ID%%.*}" \ + && echo "[Dockerfile] Detected Enterprise Linux major version: ${ELVER}" \ + && dnf install -y \ + libjpeg-turbo-devel libpng-devel zlib-devel freetype-devel \ + libsndfile libsndfile-devel sox sox-devel || true \ + && if [ "${REQUIRE_FFMPEG}" = "1" ]; then \ + echo "[Dockerfile] Enabling EPEL and RPM Fusion for ffmpeg (EL${ELVER})"; \ + dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-${ELVER}.noarch.rpm; \ + dnf install -y https://download1.rpmfusion.org/free/el/rpmfusion-free-release-${ELVER}.noarch.rpm; \ + dnf install -y https://download1.rpmfusion.org/nonfree/el/rpmfusion-nonfree-release-${ELVER}.noarch.rpm; \ + dnf makecache -y; \ + dnf install -y ffmpeg ffmpeg-devel; \ + command -v ffmpeg >/dev/null 2>&1; \ + else \ + # Best-effort install when not enforced + (dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-${ELVER}.noarch.rpm || true); \ + (dnf install -y https://download1.rpmfusion.org/free/el/rpmfusion-free-release-${ELVER}.noarch.rpm || true); \ + (dnf install -y https://download1.rpmfusion.org/nonfree/el/rpmfusion-nonfree-release-${ELVER}.noarch.rpm || true); \ + (dnf makecache -y || true); \ + (dnf install -y ffmpeg ffmpeg-devel || true); \ + fi \ + && (dnf install -y --enablerepo=crb ninja-build || \ + dnf install -y --enablerepo=crb ninja || \ + dnf install -y ninja-build || \ + dnf install -y ninja || true) \ + && dnf clean all || true + + + +# Add NVIDIA Machine Learning repo for RHEL9/UBI9 and install NCCL runtime/devel +# Needed for PyTorch nightly cu129 to avoid ncclCommWindowRegister symbol errors +# Install NCCL runtime/devel from the CUDA repository available in the base image +RUN set -euxo pipefail \ + && dnf makecache -y \ + && (dnf install -y libnccl libnccl-devel || dnf install -y libnccl-2 libnccl-devel-2) \ + && dnf clean all + +# Set working directory and adjust ownership +WORKDIR /workspace +RUN chown -R vllmuser:vllmuser /workspace + +# Create build directories with proper permissions +RUN mkdir -p /workspace/.deps && chown -R vllmuser:vllmuser /workspace/.deps && \ + mkdir -p /tmp/vllm-build && chmod 777 /tmp/vllm-build && \ + mkdir -p /opt/work && chmod 777 /opt/work && \ + mkdir -p /home/vllmuser/.cache && chown -R vllmuser:vllmuser /home/vllmuser/.cache && \ + mkdir -p /home/vllmuser/.ccache && chown -R vllmuser:vllmuser /home/vllmuser/.ccache && \ + mkdir -p /home/vllmuser/.cmake && chown -R vllmuser:vllmuser /home/vllmuser/.cmake && \ + chmod -R 755 /workspace && \ + chmod -R 777 /tmp + +# Switch to the non-root user +USER vllmuser + +# Create and activate virtual environment using the best available Python (3.12 preferred) +ENV VIRTUAL_ENV=/home/vllmuser/venv +RUN PY_BIN="$(command -v python3.12 || command -v python3)" && "$PY_BIN" -m venv $VIRTUAL_ENV +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# Set pip configuration +ENV PIP_DISABLE_PIP_VERSION_CHECK=1 +ENV PIP_NO_CACHE_DIR=1 +ENV PYTHONUNBUFFERED=1 +ENV PIP_DEFAULT_TIMEOUT=120 +ENV PIP_RETRIES=5 +ENV PIP_PREFER_BINARY=1 + +# CUDA arch list: CUDA 13+ drops SM70/SM75; default to supported archs only. +# Override at build time with: --build-arg TORCH_CUDA_ARCH_LIST="..." +ARG TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0 12.0 13.0" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" + +# Also set CUDAARCHS (semicolon separated) for CMake/NVCC generators. 
+# Override at build time with: --build-arg CUDA_ARCHS="80;86;89;90;120" +ARG CUDA_ARCHS="80;86;89;90;120" +ENV CUDAARCHS="${CUDA_ARCHS}" + +# Upgrade pip and setuptools to latest versions +RUN pip install --upgrade pip setuptools>=61 wheel + +COPY requirements/ /tmp/requirements/ + +# Install PyTorch nightly first (includes latest GPU arch support such as Blackwell sm_120 when present) +ARG TORCH_CUDA_INDEX +RUN pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA_INDEX} + +RUN pip install --pre torchvision --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA_INDEX} +RUN pip install --pre torchaudio --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA_INDEX} + +# Install PyAV for torchvision video I/O (read_video) compatibility +RUN pip install --upgrade av + +# Install TorchCodec to support torchaudio.load on recent nightlies +RUN set -euxo pipefail \ + && (pip install --pre torchcodec \ + || pip install torchcodec \ + || pip install --no-deps 'git+https://github.com/pytorch/torchcodec@main') + +# Install modern build tools and vLLM's build dependencies and CUDA deps early, +# but sanitize requirements to avoid downgrading torch-family or forcing xformers pins. +COPY pyproject.toml /tmp/pyproject.toml +RUN set -euxo pipefail \ + && cd /tmp \ + && pip install "setuptools>=61" "setuptools-scm>=8" build wheel ninja cmake \ + && mkdir -p /tmp/requirements_sanitized \ + && for f in build.txt cuda.txt common.txt; do \ + if [ -f "/tmp/requirements/$f" ]; then \ + sed -E '/^(torch|torchvision|torchaudio|xformers)\b/Id' "/tmp/requirements/$f" > "/tmp/requirements_sanitized/$f"; \ + fi; \ + done \ + && pip install --pre \ + -r /tmp/requirements_sanitized/build.txt \ + -r /tmp/requirements_sanitized/cuda.txt \ + -r /tmp/requirements_sanitized/common.txt \ + && pip install --pre --upgrade \ + torch --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA_INDEX} + +# Install minimal development extras +RUN pip install pytest pytest-asyncio ipython + +# Note: vLLM will be installed from source in development mode via dev-setup.sh +# This ensures compatibility with the PyTorch nightly build + +# Create activation script for easy virtual environment access +RUN echo '#!/bin/bash' > /home/vllmuser/activate_venv.sh && \ + echo 'source /home/vllmuser/venv/bin/activate' >> /home/vllmuser/activate_venv.sh && \ + echo 'echo "Virtual environment activated: $VIRTUAL_ENV"' >> /home/vllmuser/activate_venv.sh && \ + echo 'echo "Python version: $(python --version)"' >> /home/vllmuser/activate_venv.sh && \ + echo 'echo "PyTorch version: $(python -c \"import torch; print(torch.__version__)\")"' >> /home/vllmuser/activate_venv.sh && \ + echo 'echo "CUDA available: $(python -c \"import torch; print(torch.cuda.is_available())\")"' >> /home/vllmuser/activate_venv.sh && \ + chmod +x /home/vllmuser/activate_venv.sh + +# Ensure virtual environment is activated in .bashrc +RUN echo 'source /home/vllmuser/venv/bin/activate' >> /home/vllmuser/.bashrc && \ + echo 'echo "🐍 Python virtual environment activated"' >> /home/vllmuser/.bashrc && \ + echo 'echo "πŸš€ Ready for vLLM development!"' >> /home/vllmuser/.bashrc + +# Create development helper script that uses current workspace requirements +RUN echo '#!/bin/bash' > /home/vllmuser/setup_vllm_dev.sh && \ + echo 'echo "πŸ”§ Setting up vLLM for development..."' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'cd /workspace' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo '# Use temporary build directory to 
avoid permission issues' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'export TMPDIR=/tmp/vllm-build' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'mkdir -p "$TMPDIR" && chmod 777 "$TMPDIR"' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'export CMAKE_BUILD_PARALLEL_LEVEL=4' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'export VLLM_INSTALL_PUNICA_KERNELS=0' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'export MAX_JOBS=4' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo '# Install current workspace requirements first' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'if [ -f requirements/common.txt ]; then pip install -r requirements/common.txt; fi' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo '# Use temporary directory for CMake build files' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'FETCHCONTENT_BASE_DIR="$TMPDIR/deps" pip install -e . --no-deps --no-build-isolation --verbose' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'echo "βœ… vLLM installed in editable mode!"' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'python -c "import vllm; print(\"vLLM version:\", vllm.__version__)"' >> /home/vllmuser/setup_vllm_dev.sh && \ + chmod +x /home/vllmuser/setup_vllm_dev.sh + +# Provide a helper to apply repo patches against the mounted /workspace +# Create under /usr/local/bin as root, then switch back to non-root user +USER root +RUN printf '%s\n' \ + '#!/usr/bin/env bash' \ + 'set -euo pipefail' \ + 'cd /workspace 2>/dev/null || exit 0' \ + 'SCRIPT=./extras/patches/apply_patches.sh' \ + 'if [ -f "$SCRIPT" ]; then' \ + ' echo "[apply-patches] Running $SCRIPT"' \ + ' # Copy to temp and normalize EOL to avoid permission errors on mounted FS' \ + ' TMP_SCRIPT=$(mktemp /tmp/apply_patches.XXXXXX.sh)' \ + ' tr -d '\''\r'\'' < "$SCRIPT" > "$TMP_SCRIPT" || cp "$SCRIPT" "$TMP_SCRIPT"' \ + ' chmod +x "$TMP_SCRIPT"' \ + ' bash "$TMP_SCRIPT" || {' \ + ' echo "[apply-patches] Warning: patch apply failed (continuing)" >&2; exit 0; }' \ + 'fi' \ + > /usr/local/bin/apply-vllm-patches && \ + chmod +x /usr/local/bin/apply-vllm-patches +USER vllmuser + +# Add environment variables for better CUDA memory management and build optimization +# Use the new variable name to avoid deprecation warnings. +# (Not working with vllm) +# ENV PYTORCH_ALLOC_CONF=expandable_segments:True +# +# Do not pin a single GPU here; let runtime inject device selection +# ENV CUDA_VISIBLE_DEVICES=0 +ENV CMAKE_BUILD_PARALLEL_LEVEL=4 +ENV VLLM_INSTALL_PUNICA_KERNELS=0 +ENV MAX_JOBS=4 + +# Enable ccache for faster rebuilds +ENV CCACHE_DIR=/home/vllmuser/.ccache +ENV CCACHE_MAXSIZE=10G +ENV PATH=/usr/lib64/ccache:$PATH + +# (TORCH_CUDA_ARCH_LIST defined earlier) +# Do not force-disable Machete; allow upstream defaults. User may still pass -e CMAKE_ARGS for custom CMake settings. 
+ENV CMAKE_ARGS="" + +# WSL2-specific CUDA environment configuration +ENV NVIDIA_VISIBLE_DEVICES=all +ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility +ENV LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/lib/wsl/drivers:/usr/lib/wsl/lib:/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH + +# Add runtime library detection script +RUN echo '#!/bin/bash' > /home/vllmuser/check_cuda_libs.sh && \ + echo 'echo "=== CUDA Library Check ==="' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'echo "Searching for CUDA libraries..."' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'find /usr/lib/wsl -name "libcuda.so*" 2>/dev/null | head -3 || echo "No WSL CUDA libs"' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'ldconfig -p | grep cuda | head -3 || echo "No CUDA in ldconfig"' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'echo "PyTorch CUDA status:"' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'python -c "import torch; print(f\"CUDA available: {torch.cuda.is_available()}\"); print(f\"Device count: {torch.cuda.device_count()}\")" 2>/dev/null || echo "PyTorch not available"' >> /home/vllmuser/check_cuda_libs.sh && \ + chmod +x /home/vllmuser/check_cuda_libs.sh diff --git a/extras/README.md b/extras/README.md new file mode 100644 index 000000000000..a33042a97c58 --- /dev/null +++ b/extras/README.md @@ -0,0 +1,65 @@ +# extras/ overview + +This directory hosts non-core assets: container/build tooling, configs, testing, storage helpers, and optional patches. Everything here is designed to be self-contained and safe for Windows + WSL and Linux. + +Layout + +- podman/ β€” Podman-first run/build wrappers (Windows PowerShell + bash) +- configs/ β€” Centralized versions and build configuration +- patches/ β€” Optional patches applied automatically at container start +- storage/ β€” Volume/caching helpers +- testing/ β€” Test harness, matrices, and results +- secrets/ β€” Local, gitignored credentials + +Primary entrypoint: `extras/podman/run.ps1` (Windows) or `extras/podman/run.sh` (Linux/macOS). + +## What’s new + +- CUDA 13.0 base (Rocky Linux 9) with PyTorch nightlies and ffmpeg stack. +- Default CUDA arch policy updated for CUDA 13 (drops SM70/SM75): + - TORCH_CUDA_ARCH_LIST: "8.0 8.6 8.9 9.0 12.0 13.0" + - CUDAARCHS: "80;86;89;90;120" + - Override via `extras/configs/build.env` or environment variables. +- Auto-patch on container start (idempotent, CRLF-safe): + - 0001-cumem-alloc-env-fallback.diff β€” prefer PYTORCH_ALLOC_CONF + - 0002-cub-reduce-to-sum-cuda13.diff β€” CUB Reduce->Sum compatibility +- Setup flow is CRLF/WSL-safe: scripts run from a normalized temp copy. + +## Quick start + +1) Configure (optional): edit `extras/configs/build.env`. +2) Build the image: + - Windows: `./extras/podman/run.ps1 -Build` + - Linux/macOS: `extras/podman/run.sh --build` +3) GPU check: + - Windows: `./extras/podman/run.ps1 -GPUCheck` + - Linux/macOS: `extras/podman/run.sh --gpu-check` +4) Install vLLM in editable mode (compiles extensions): + - Windows: `./extras/podman/run.ps1 -Setup -WorkVolume vllm-work -Progress` + - Linux/macOS: `extras/podman/run.sh --setup --work-volume vllm-work --progress` + +Notes for Windows/WSL + +- The launcher maps /dev/dxg and WSL libraries automatically; NV env vars are set safely (no "void"). 
+- PowerShell quoting for inline Python: + - `./extras/podman/run.ps1 -Command 'python -c "import torch;print(torch.__version__)"'` +- Scripts avoid in-place edits on the mounted repo to prevent permission errors. + +## Patches + +Place `.diff` files in `extras/patches/`. On container start, a helper normalizes CRLF, applies patches, or uses targeted Python fallbacks for known fragile hunks. No source-file changes are committed to the host by design. + +## Storage and caches + +Use a named volume for large builds and cache: + +- `-WorkVolume vllm-work` (PowerShell) +- `--work-volume vllm-work` (bash) + +## Testing + +See `extras/testing/README.md` for matrix and run helpers. + +## Secrets + +See `extras/secrets/README.md` for token handling. diff --git a/extras/configs/README.md b/extras/configs/README.md new file mode 100644 index 000000000000..062170cbd2a6 --- /dev/null +++ b/extras/configs/README.md @@ -0,0 +1,16 @@ +# configs README + +This folder centralizes editable configuration for images/builds: + +- build.env: Bash-exported defaults (CUDA/UBI/Python/vLLM tag, arch list, volumes) +- build.yaml (optional): YAML equivalent for tools that prefer structured configs +- versions.json (optional): Machine-friendly manifest for automation + +Consumers (scripts/Containerfiles) should read values from here and allow runtime overrides via environment variables. + +CUDA 13 arch policy + +- TORCH_CUDA_ARCH_LIST defaults to: "8.0 8.6 8.9 9.0 12.0 13.0" +- CUDAARCHS defaults to: "80;86;89;90;120" + +Both `extras/podman/run.ps1` and `extras/podman/run.sh` read build.env and pass these values into builds and setup runs. diff --git a/extras/configs/build.env b/extras/configs/build.env new file mode 100644 index 000000000000..c2b015526e65 --- /dev/null +++ b/extras/configs/build.env @@ -0,0 +1,38 @@ +# Build configuration +# +# Scripts should source this file to obtain default versions. +# Values can be overridden by environment variables provided at runtime. + +# CUDA / UBI / Python baselines +export CUDA_VERSION=${CUDA_VERSION:-13.0.0} +export UBI_VERSION=${UBI_VERSION:-9} +export PYTHON_VERSION=${PYTHON_VERSION:-3.12} +export BASE_FLAVOR=${BASE_FLAVOR:-rockylinux9} + +# vLLM branch/tag to use inside the container when cloning or referring +export VLLM_TAG=${VLLM_TAG:-main} + +## Architectures (space separated) for PyTorch/NVCC +# CUDA 13+ no longer supports SM70/SM75; default to supported archs only. +# Include Blackwell via sm_120 (13.0) while keeping Hopper/Ada. +export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST:-"8.0 8.6 8.9 9.0 12.0 13.0"} +# Semicolon-separated CUDAARCHS for CMake/NVCC generators +export CUDA_ARCHS=${CUDA_ARCHS:-"80;86;89;90;120"} + +# Named volume for build scratch/work dir (Podman recommended) +export VLLM_WORK_VOLUME=${VLLM_WORK_VOLUME:-vllm-work} +export VLLM_WORK_DIR_CONTAINER=${VLLM_WORK_DIR_CONTAINER:-/opt/work} + +# Image naming +export VLLM_BASE_IMAGE=${VLLM_BASE_IMAGE:-"nvidia/cuda:${CUDA_VERSION}-cudnn-devel-${BASE_FLAVOR}"} +export VLLM_IMAGE_TAG=${VLLM_IMAGE_TAG:-"vllm-cuda${CUDA_VERSION}-ubi${UBI_VERSION}"} + +# Torch family components: wheels only (nightly index). No source build fallbacks. +export INSTALL_TORCHVISION=${INSTALL_TORCHVISION:-1} +export INSTALL_TORCHAUDIO=${INSTALL_TORCHAUDIO:-1} + +# FFMPEG optional enforcement for torchaudio features +# Set to 1 to enable RPM Fusion repos and install ffmpeg/ffmpeg-devel; build will fail if unavailable. 
+# Set to 0 to attempt best-effort install and fallback to building torchaudio without FFMPEG when headers are missing.
+#export REQUIRE_FFMPEG=${REQUIRE_FFMPEG:-0}
+export REQUIRE_FFMPEG=${REQUIRE_FFMPEG:-1}
\ No newline at end of file
diff --git a/extras/configs/build.yaml b/extras/configs/build.yaml
new file mode 100644
index 000000000000..d90e66a116c8
--- /dev/null
+++ b/extras/configs/build.yaml
@@ -0,0 +1,15 @@
+cuda:
+  version: "13.0.0"
+  tag: "latest"
+  base_flavor: "rockylinux9"
+ubi:
+  version: "9"
+  tag: "latest"
+python:
+  version: "3.12"
+  tag: "latest"
+vllm:
+  tag: main
+arch:
+  torch_cuda_arch_list: "8.0 8.6 8.9 9.0 12.0 13.0"
+  cuda_archs: "80;86;89;90;120"
diff --git a/extras/patches/0001-cumem-alloc-env-fallback.diff b/extras/patches/0001-cumem-alloc-env-fallback.diff
new file mode 100644
index 000000000000..c2a322024961
--- /dev/null
+++ b/extras/patches/0001-cumem-alloc-env-fallback.diff
@@ -0,0 +1,14 @@
+diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
+--- a/vllm/device_allocator/cumem.py
++++ b/vllm/device_allocator/cumem.py
+@@ -140,7 +140,9 @@ class CuMemAllocator:
+         return CuMemAllocator.instance
+ 
+     def __init__(self):
+-        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
++        # Prefer new env var; fall back to deprecated one for compatibility
++        conf = os.environ.get("PYTORCH_ALLOC_CONF",
++                              os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""))
+         assert "expandable_segments:True" not in conf, \
+             ("Expandable segments are not compatible with memory pool. "
+              "Please track https://github.com/pytorch/pytorch/issues/147851 "
diff --git a/extras/patches/0002-cub-reduce-to-sum-cuda13.diff b/extras/patches/0002-cub-reduce-to-sum-cuda13.diff
new file mode 100644
index 000000000000..3938f6e880db
--- /dev/null
+++ b/extras/patches/0002-cub-reduce-to-sum-cuda13.diff
@@ -0,0 +1,59 @@
+diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu
+--- a/csrc/layernorm_kernels.cu
++++ b/csrc/layernorm_kernels.cu
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ variance = BlockReduce(reduceStore).Sum(variance);
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ variance = BlockReduce(reduceStore).Sum(variance);
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ variance = BlockReduce(reduceStore).Sum(variance);
+
+diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu
+--- a/csrc/layernorm_quant_kernels.cu
++++ b/csrc/layernorm_quant_kernels.cu
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ variance = BlockReduce(reduceStore).Sum(variance);
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ variance = BlockReduce(reduceStore).Sum(variance);
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ variance = BlockReduce(reduceStore).Sum(variance);
+
+diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh
+--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
++++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ ss = BlockReduce(reduceStore).Sum(ss);
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ ss = BlockReduce(reduceStore).Sum(ss);
\ No newline at end of file
diff --git a/extras/patches/README.md b/extras/patches/README.md
new file mode 100644
index 000000000000..ff4f662c4588
--- /dev/null
+++ b/extras/patches/README.md
@@ -0,0 +1,5 @@
+# Patches and plugins scaffolding
+
+- Place unified diffs (*.diff) here.
+- Use `apply_patches.sh` to apply them before building.
+- Optionally, add Python plugins under `plugin/` and load dynamically at runtime.
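For reference, a typical manual invocation of the patch helper from the repository root (inside the container the entrypoint already does this via `apply-vllm-patches`; the paths below match this PR's layout):

```bash
# Manual run from the repository root (the container entrypoint normally
# does this for you via apply-vllm-patches).
cd /workspace   # or your checkout root
bash extras/patches/apply_patches.sh

# Verify the rewrite took effect on one of the touched kernels:
grep -n "BlockReduce(reduceStore)" csrc/layernorm_kernels.cu | head -3
```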
ls "$PATCH_DIR"/*.diff >/dev/null 2>&1; then + PATCH_DIR="$SCRIPT_DIR" +fi + +pushd "$ROOT_DIR" >/dev/null + +shopt -s nullglob +PATCHES=("${PATCH_DIR}"/*.diff) +shopt -u nullglob + +echo "[patches] Using ROOT_DIR=$ROOT_DIR" +echo "[patches] Scanning ${PATCH_DIR} for .diff files" +echo "[patches] Found ${#PATCHES[@]} .diff file(s) in ${PATCH_DIR}" +for pp in "${PATCHES[@]}"; do echo " - $(basename "$pp")"; done + +for p in "${PATCHES[@]}"; do + echo "[patches] Applying ${p}" + # Normalize EOL to a temp patch file + TMP_PATCH=$(mktemp /tmp/patch.XXXXXX.diff) + tr -d '\r' < "$p" > "$TMP_PATCH" 2>/dev/null || cp "$p" "$TMP_PATCH" + if git apply --check "$TMP_PATCH" 2>/dev/null; then + git apply "$TMP_PATCH" || true + continue + fi + echo "[patches] git apply check failed for $(basename "$p"); attempting fallback if known" + case "$(basename "$p")" in + 0001-cumem-alloc-env-fallback.diff) + echo "[patches] Fallback: update cumem allocator env var preference" + python - <<'PY' +import io, os +path = os.path.join('vllm','device_allocator','cumem.py') +try: + with io.open(path, 'r', encoding='utf-8', newline='') as f: + src = f.read() +except FileNotFoundError: + raise SystemExit(0) +if 'PYTORCH_ALLOC_CONF' in src: + print('[patches] cumem already prefers PYTORCH_ALLOC_CONF; skipping') + raise SystemExit(0) +needle = 'conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")' +if needle in src: + new = src.replace(needle, + 'conf = os.environ.get("PYTORCH_ALLOC_CONF",\n' + ' os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""))') + with io.open(path, 'w', encoding='utf-8', newline='\n') as f: + f.write(new) + print('[patches] Applied cumem env var fallback edit') +else: + print('[patches] cumem pattern not found; skipping') +PY + ;; + 0002-cub-reduce-to-sum-cuda13.diff) + echo "[patches] Fallback will be handled by the post-pass rewrite" + ;; + *) + echo "[patches] Unknown patch; skipping fallback" + ;; + esac +done + +echo "[patches] Post-pass: normalize CUB to Reduce(expr, cub::Op()) across all csrc" +python - <<'PY' +import io, os, re + +files = [] +for root, _, names in os.walk('csrc'): + for n in names: + if n.endswith(('.cu', '.cuh')): + files.append(os.path.join(root, n)) + +# Patterns: +# 1) Convert convenience methods to Reduce with functor: BlockReduce(...).Max(expr) -> BlockReduce(...).Reduce(expr, cub::Max()) +pat_method = re.compile(r"(BlockReduce\([^)]*\))\.(?PSum|Max|Min)\(\s*(?P[^)]+?)\s*\)") + +# 2) Ensure functor form uses parentheses not braces (cub::Op{} -> cub::Op()) +pat_functor_braces = re.compile(r"(BlockReduce\([^)]*\)\.Reduce\(\s*[^,]+,\s*cub::(Sum|Max|Min))\{\}(\s*(?:,[^)]*)?\))") + +changed_any = False +for path in files: + try: + with io.open(path, 'r', encoding='utf-8', newline='') as f: + src = f.read() + except FileNotFoundError: + continue + # Method -> Reduce(functor) + def repl_method(m): + receiver = m.group(1) + op = m.group('op') + expr = m.group('expr').strip() + return f"{receiver}.Reduce({expr}, cub::{op}())" + new_src = pat_method.sub(repl_method, src) + # Braces -> Parens + new_src = pat_functor_braces.sub(r"\1()\3", new_src) + if new_src != src: + with io.open(path, 'w', encoding='utf-8', newline='\n') as f: + f.write(new_src) + print(f"[patches] Normalized CUB Reduce in {path}") + changed_any = True +if not changed_any: + print('[patches] Post-pass: no changes (already applied)') +PY + +popd >/dev/null + +echo "[patches] Done." 
diff --git a/extras/podman/Containerfile b/extras/podman/Containerfile new file mode 100644 index 000000000000..d42bef4b344e --- /dev/null +++ b/extras/podman/Containerfile @@ -0,0 +1,11 @@ +# syntax=docker/dockerfile:1.7-labs + +# Delegator Containerfile. +# Build using the canonical Dockerfile in extras/ to avoid duplication. + +FROM scratch as noop + +# Usage: +# podman build -f extras/Dockerfile -t vllm-dev:latest . +# or from this folder (wrapper script does this for you): +# bash build.sh diff --git a/extras/podman/README.md b/extras/podman/README.md new file mode 100644 index 000000000000..620398fc7895 --- /dev/null +++ b/extras/podman/README.md @@ -0,0 +1,30 @@ +# Podman helpers for vLLM + +These wrappers build and run a CUDA 13 dev container with PyTorch nightlies. + +Key features + +- Windows/WSL and Linux support (PowerShell and bash launchers) +- Auto-apply patches on container start (CRLF-safe, idempotent) +- CUDA arch policy aligned with CUDA 13 (no SM70/SM75) +- Named volume mounting for faster builds (`/opt/work`) + +Launchers + +- Windows: `extras/podman/run.ps1` +- Linux/macOS: `extras/podman/run.sh` + +Common options + +- Build: `-Build` (ps1) / `--build` (sh) +- GPU check: `-GPUCheck` / `--gpu-check` +- Setup (editable install): `-Setup` / `--setup` +- Work volume: `-WorkVolume NAME` / `--work-volume NAME` +- Progress: `-Progress` / `--progress` +- Mirror sources: `-Mirror` / `--mirror` + +Notes + +- Scripts normalize CRLF by running a temp copy to avoid chmod/sed on Windows mounts. +- CUDA arch defaults can be changed in `extras/configs/build.env`. +- The entrypoint is `apply-patches-then-exec.sh`, which runs patching before your command. diff --git a/extras/podman/build.sh b/extras/podman/build.sh new file mode 100644 index 000000000000..f5aefa1b70b9 --- /dev/null +++ b/extras/podman/build.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Why: Back-compat wrapper that sources central config and builds using the canonical Dockerfile. + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +ROOT_DIR=$(cd -- "${SCRIPT_DIR}/../.." &>/dev/null && pwd) +CONFIG_DIR="${SCRIPT_DIR}/../configs" + +# shellcheck source=../configs/build.env +if [ -f "${CONFIG_DIR}/build.env" ]; then + # shellcheck disable=SC1091 + source "${CONFIG_DIR}/build.env" +fi + +CUDA_VERSION=${CUDA_VERSION:-13.0.0} +UBI_VERSION=${UBI_VERSION:-9} +VLLM_IMAGE_TAG=${VLLM_IMAGE_TAG:-"vllm-cuda${CUDA_VERSION}-ubi${UBI_VERSION}"} + +CONTEXT="${ROOT_DIR}" +DOCKERFILE_REL="extras/Dockerfile" + +echo "[podman/build] Building image ${VLLM_IMAGE_TAG} with CUDA=${CUDA_VERSION}, UBI=${UBI_VERSION}" + +podman build \ + --build-arg CUDA_VERSION="${CUDA_VERSION}" \ + --build-arg UBI_VERSION="${UBI_VERSION}" \ + -t "${VLLM_IMAGE_TAG}" \ + -f "${DOCKERFILE_REL}" \ + "${CONTEXT}" + +echo "[podman/build] Done -> ${VLLM_IMAGE_TAG}" diff --git a/extras/podman/dev-setup.sh b/extras/podman/dev-setup.sh new file mode 100644 index 000000000000..abd67da41463 --- /dev/null +++ b/extras/podman/dev-setup.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# Robust setup entrypoint: prefer extras/dev-setup.sh, +# otherwise use the image-provided /home/vllmuser/setup_vllm_dev.sh. +set -euo pipefail + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}" )" &>/dev/null && pwd) +EXTRAS_DIR=$(cd -- "${SCRIPT_DIR}/.." 
&>/dev/null && pwd) + +try_exec() { + local target="$1" + if [[ -f "$target" ]]; then + # Normalize CRLF and avoid chmod on mounted FS + local tmp + tmp="$(mktemp /tmp/dev-setup-target.XXXX.sh)" + tr -d '\r' < "$target" > "$tmp" 2>/dev/null || cp "$target" "$tmp" + chmod +x "$tmp" 2>/dev/null || true + exec "$tmp" "$@" + fi +} + +# 1) Current canonical path +if [[ -f "${EXTRAS_DIR}/dev-setup.sh" ]]; then + try_exec "${EXTRAS_DIR}/dev-setup.sh" "$@" +fi + +# 2) Fallback: perform a minimal editable install inline (avoid chmod on /tmp) +echo "πŸ”§ Setting up vLLM (inline fallback)..." +cd /workspace + +# Ensure patches applied before building +if command -v apply-vllm-patches >/dev/null 2>&1; then + apply-vllm-patches || true +fi + +# Prefer /opt/work/tmp (mounted volume) if available, else /tmp +if [[ -d /opt/work ]]; then + export TMPDIR=/opt/work/tmp +else + export TMPDIR=/tmp +fi +mkdir -p "$TMPDIR" || true + +# Build env knobs +export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:-4} +export VLLM_INSTALL_PUNICA_KERNELS=${VLLM_INSTALL_PUNICA_KERNELS:-0} +export MAX_JOBS=${MAX_JOBS:-4} +# CUDA 13 toolchain dropped SM70/75; ensure we don't pass them to nvcc +export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST:-"8.0 8.6 8.9 9.0 12.0 13.0"} +export CUDAARCHS=${CUDAARCHS:-"80;86;89;90;120"} + +# Install Python deps from repo (torch stack already in image) +if [[ -f requirements/common.txt ]]; then + pip install -r requirements/common.txt || true +fi + +# Avoid slow git describe during setuptools_scm by providing a pretend version +export SETUPTOOLS_SCM_PRETEND_VERSION=${SETUPTOOLS_SCM_PRETEND_VERSION:-0+local} + +FETCHCONTENT_BASE_DIR="$TMPDIR/deps" pip install -e . --no-deps --no-build-isolation --verbose +echo "βœ… vLLM installed in editable mode (fallback)!" +python - <<'PY' +import vllm +print("vLLM version:", getattr(vllm, "__version__", "unknown")) +PY diff --git a/extras/podman/entrypoint/apply-patches-then-exec.sh b/extras/podman/entrypoint/apply-patches-then-exec.sh new file mode 100644 index 000000000000..9db4781c0e6a --- /dev/null +++ b/extras/podman/entrypoint/apply-patches-then-exec.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Apply repo patches if available; best-effort, normalization handled inside helper. +if command -v apply-vllm-patches >/dev/null 2>&1; then + echo "[entrypoint] applying patches..." 
+ apply-vllm-patches || true +fi + +# If first args are `bash -lc ` (single token, no spaces), normalize CRLF then exec +if [[ "${1-}" == "bash" && "${2-}" == "-lc" ]]; then + arg3="${3-}" + # Only handle when it's a single token path ending in .sh with no spaces or shell operators + if [[ -n "$arg3" && "$arg3" != *' '* && "$arg3" != *';'* && "$arg3" != *'&'* && "$arg3" != *'|'* && "$arg3" == *.sh ]]; then + # Resolve to filesystem path if it exists + if [[ -f "$arg3" ]]; then + SRC_SCRIPT="$arg3" + TMP_SCRIPT="$(mktemp /tmp/entry-XXXX.sh)" + tr -d '\r' < "$SRC_SCRIPT" > "$TMP_SCRIPT" 2>/dev/null || cp "$SRC_SCRIPT" "$TMP_SCRIPT" + chmod +x "$TMP_SCRIPT" 2>/dev/null || true + exec bash -lc "$TMP_SCRIPT" + fi + fi +fi + +exec "$@" diff --git a/extras/podman/run.ps1 b/extras/podman/run.ps1 new file mode 100644 index 000000000000..5a2f5d44a32e --- /dev/null +++ b/extras/podman/run.ps1 @@ -0,0 +1,264 @@ +#!/usr/bin/env pwsh +[CmdletBinding()] param( + [switch]$Build, + [switch]$Interactive, + [string]$Command = "", + [switch]$Setup, + [switch]$GPUCheck, + [switch]$Mirror, + [switch]$Recreate, + [string]$WorkVolume = "", + [string]$WorkDirHost = "", + [switch]$Progress, + [switch]$NoCache, + [switch]$Pull, + [switch]$Help +) + +if ($Help) { + Write-Host "Usage: extras/podman/run.ps1 [options]" + Write-Host " -Build Build the dev image (reads extras/configs/build.env)" + Write-Host " -Interactive Start an interactive shell" + Write-Host " -Command Run a command inside the dev container" + Write-Host " -Setup Run project setup inside the container" + Write-Host " -GPUCheck Run a CUDA/Torch sanity check" + Write-Host " -Mirror Use local mirror registries if configured" + Write-Host " -Recreate Recreate the container if running" + Write-Host " -WorkVolume Named volume to mount at /opt/work" + Write-Host " -WorkDirHost Host dir to mount at /opt/work" + Write-Host " -Progress Show progress bars in setup" + Write-Host " -NoCache Build image without using cache" + Write-Host " -Pull Always attempt to pull newer base image" + return +} + +if (-not $Interactive -and [string]::IsNullOrEmpty($Command) -and -not $GPUCheck -and -not $Setup) { $Interactive = $true } + +if (-not (Get-Command podman -ErrorAction SilentlyContinue)) { Write-Host "❌ Podman not found in PATH" -ForegroundColor Red; exit 1 } + +$ContainerName = "vllm-dev" +$ImageTag = "vllm-dev:latest" +$SourceDir = (Get-Location).Path + +Write-Host "πŸ‹ vLLM Dev Container (Podman)" -ForegroundColor Green + +if ($Build) { + Write-Host "πŸ”¨ Building image (honoring extras/configs/build.env)..." 
-ForegroundColor Yellow + $configPath = Join-Path $SourceDir "extras/configs/build.env" + $dockerfilePath = Join-Path $SourceDir "extras/Dockerfile" + $cudaVer = $null + $baseFlavor = $null + $archList = $null + $cudaArchs = $null + $requireFfmpegArg = '1' + $tvRef = $null + $taRef = $null + function Get-DockerArgDefault([string]$name, [string]$fallback) { + if (Test-Path $dockerfilePath) { + $df = Get-Content -Raw -Path $dockerfilePath + $m = [regex]::Match($df, "(?m)^\s*ARG\s+${name}\s*=\s*([^\r\n]+)") + if ($m.Success) { + return $m.Groups[1].Value.Trim() + } + } + return $fallback + } + if (Test-Path $configPath) { + $cfg = Get-Content -Raw -Path $configPath + function Get-EnvDefault([string]$name, [string]$fallback) { + # Match a line like: export NAME=VALUE + $line = [regex]::Match($cfg, "(?m)^\s*export\s+${name}\s*=\s*([^\r\n]+)") + if (-not $line.Success) { return $fallback } + $val = $line.Groups[1].Value.Trim() + # Strip wrapping quotes if present + if (($val.StartsWith('"') -and $val.EndsWith('"')) -or ($val.StartsWith("'") -and $val.EndsWith("'"))) { $val = $val.Substring(1, $val.Length-2) } + # If value is Bash-style ${NAME:-default}, extract default + if ($val.StartsWith('${') -and $val.Contains(':-')) { + $idx = $val.IndexOf(':-'); $end = $val.IndexOf('}', $idx) + if ($idx -ge 0 -and $end -gt $idx) { + $def = $val.Substring($idx+2, $end-($idx+2)).Trim() + if (($def.StartsWith('"') -and $def.EndsWith('"')) -or ($def.StartsWith("'") -and $def.EndsWith("'"))) { $def = $def.Substring(1, $def.Length-2) } + return $def + } + } + return $val + } + $cudaVer = Get-EnvDefault -name 'CUDA_VERSION' -fallback (Get-DockerArgDefault 'CUDA_VERSION' '13.0.0') + $baseFlavor = Get-EnvDefault -name 'BASE_FLAVOR' -fallback (Get-DockerArgDefault 'BASE_FLAVOR' 'rockylinux9') + $archList = Get-EnvDefault -name 'TORCH_CUDA_ARCH_LIST' -fallback (Get-DockerArgDefault 'TORCH_CUDA_ARCH_LIST' '8.0 8.6 8.9 9.0 12.0 13.0') + $cudaArchs = Get-EnvDefault -name 'CUDA_ARCHS' -fallback (Get-DockerArgDefault 'CUDA_ARCHS' '80;86;89;90;120') + # No longer used: wheels-only installs for torchvision/torchaudio + $requireFfmpeg = Get-EnvDefault -name 'REQUIRE_FFMPEG' -fallback (Get-DockerArgDefault 'REQUIRE_FFMPEG' '1') + if ($requireFfmpeg -match '^[01]$') { $requireFfmpegArg = $requireFfmpeg } else { $requireFfmpegArg = '1' } + } + # Derive PyTorch nightly index from CUDA version (e.g., 13.0 -> cu130, 12.9 -> cu129) + $torchCudaIndex = if ($cudaVer -match '^13\.') { 'cu130' } elseif ($cudaVer -match '^12\.9') { 'cu129' } else { + $parts = $cudaVer.Split('.') + if ($parts.Length -ge 2) { 'cu' + $parts[0] + $parts[1] + '0' } else { 'cu129' } + } + Write-Host ("Config: CUDA={0} BASE_FLAVOR={1} TORCH_CUDA_INDEX={2} ARCH_LIST=({3}) CUDA_ARCHS={4}" -f $cudaVer,$baseFlavor,$torchCudaIndex,$archList,$cudaArchs) -ForegroundColor DarkGray + $buildCmd = @("build","-f","extras/Dockerfile", + "--build-arg","CUDA_VERSION=$cudaVer", + "--build-arg","BASE_FLAVOR=$baseFlavor", + "--build-arg","TORCH_CUDA_INDEX=$torchCudaIndex", + "--build-arg","TORCH_CUDA_ARCH_LIST=$archList", + "--build-arg","CUDA_ARCHS=$cudaArchs", + "--build-arg","REQUIRE_FFMPEG=$requireFfmpegArg", + "-t",$ImageTag,".") + # Use cache by default; add --no-cache only when requested + if ($NoCache) { $buildCmd = @($buildCmd[0],"--no-cache") + $buildCmd[1..($buildCmd.Length-1)] } + if ($Pull) { $buildCmd = @($buildCmd[0],"--pull=always") + $buildCmd[1..($buildCmd.Length-1)] } + & podman @buildCmd + if ($LASTEXITCODE -ne 0) { Write-Host "❌ Build failed" 
-ForegroundColor Red; exit 1 } + Write-Host "βœ… Build ok" -ForegroundColor Green +} + +# Already running? +$running = podman ps --filter "name=$ContainerName" --format "{{.Names}}" 2>$null + +if ($Recreate -and $running -eq $ContainerName) { + Write-Host "♻️ Removing existing container '$ContainerName'" -ForegroundColor Yellow + podman rm -f $ContainerName | Out-Null + $running = $null +} + +if ($running -eq $ContainerName) { + if ($GPUCheck) { + Write-Host "πŸ” GPU check (existing container)" -ForegroundColor Yellow + $cmd = @' +source /home/vllmuser/venv/bin/activate && python - <<'PY' +import torch, os +print("PyTorch:", getattr(torch,"__version__","n/a")) +print("CUDA:", torch.cuda.is_available()) +print("Devices:", torch.cuda.device_count() if torch.cuda.is_available() else 0) +print("LD_LIBRARY_PATH:", os.environ.get("LD_LIBRARY_PATH")) +if torch.cuda.is_available(): + try: + print("GPU 0:", torch.cuda.get_device_name(0)) + except Exception as e: + print("GPU name error:", e) +PY +nvidia-smi || true +'@ + $cmd = "export NVIDIA_VISIBLE_DEVICES=all; " + $cmd + podman exec $ContainerName bash -lc $cmd + exit $LASTEXITCODE + } + if ($Setup) { + Write-Host "πŸ”§ Running dev setup in existing container" -ForegroundColor Yellow + $envs = @() + if ($Mirror) { $envs += @('LOCAL_MIRROR=1') } + if ($Progress) { $envs += @('PROGRESS_WATCH=1') } + $envs += @('NVIDIA_VISIBLE_DEVICES=all') + $envStr = ($envs | ForEach-Object { "export $_;" }) -join ' ' + $cmd = "$envStr apply-vllm-patches || true; chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh" + if ($Progress) { podman exec -it $ContainerName bash -lc $cmd } else { podman exec $ContainerName bash -lc $cmd } + exit $LASTEXITCODE + } + if ($Command) { + Write-Host "πŸš€ Running command in existing container" -ForegroundColor Green + $runCmd = "source /home/vllmuser/venv/bin/activate && $Command" + podman exec $ContainerName bash -c $runCmd + exit $LASTEXITCODE + } + $resp = Read-Host "Attach to running container? [Y/n]" + if ($resp -eq "" -or $resp -match '^[Yy]$') { podman exec -it $ContainerName bash; exit $LASTEXITCODE } else { exit 0 } +} + +# Ensure image exists +podman image exists $ImageTag +if ($LASTEXITCODE -ne 0) { Write-Host "❌ Image missing. Use -Build." 
-ForegroundColor Red; exit 1 } + +# Base args (no default /tmp tmpfs; can be enabled via VLLM_TMPFS_TMP_SIZE) +$runArgs = @("run","--rm","--security-opt=label=disable","--shm-size","8g","-v","${SourceDir}:/workspace:Z") +if (-not [string]::IsNullOrWhiteSpace($WorkVolume)) { $runArgs += @('-v',"${WorkVolume}:/opt/work:Z") } +elseif ($WorkDirHost -and (Test-Path $WorkDirHost)) { $runArgs += @('-v',"${WorkDirHost}:/opt/work:Z") } +$runArgs += @('-w','/workspace','--name',"$ContainerName",'--user','vllmuser','--env','ENGINE=podman') +# Use a tiny entrypoint to apply patches before executing the requested command +$runArgs += @('--entrypoint','/workspace/extras/podman/entrypoint/apply-patches-then-exec.sh') + +$tmpfsSize = [Environment]::GetEnvironmentVariable('VLLM_TMPFS_TMP_SIZE') +if (-not [string]::IsNullOrEmpty($tmpfsSize) -and $tmpfsSize -ne '0') { $runArgs += @('--tmpfs',"/tmp:size=$tmpfsSize") } + +if ($true) { # Request GPU via CDI hooks + $runArgs = @("run","--rm","--security-opt=label=disable","--device=nvidia.com/gpu=all") + $runArgs[2..($runArgs.Length-1)] +} + +# WSL GPU: map /dev/dxg and mount WSL libs +$runArgs += @('--device','/dev/dxg','-v','/usr/lib/wsl:/usr/lib/wsl:ro') +if ($Mirror) { $runArgs += @('--env','LOCAL_MIRROR=1') } +foreach ($ev in 'NVIDIA_VISIBLE_DEVICES','NVIDIA_DRIVER_CAPABILITIES','NVIDIA_REQUIRE_CUDA') { + $val = [Environment]::GetEnvironmentVariable($ev) + if ($val) { $runArgs += @('--env',"$ev=$val") } +} +$runArgs += @('--env','ENGINE=podman','--env','NVIDIA_VISIBLE_DEVICES=all','--env','NVIDIA_DRIVER_CAPABILITIES=compute,utility','--env','NVIDIA_REQUIRE_CUDA=') + +if ($GPUCheck) { + $pyDiag = @' +import json, torch, os +out = { + "torch_version": getattr(torch, "__version__", "n/a"), + "torch_cuda_version": getattr(getattr(torch, "version", None), "cuda", "n/a"), + "cuda_available": torch.cuda.is_available(), + "ld_library_path": os.environ.get("LD_LIBRARY_PATH"), +} +try: + out["device_count"] = torch.cuda.device_count() +except Exception as e: + out["device_count_error"] = str(e) +if out["cuda_available"] and out.get("device_count", 0) > 0: + try: + cap = torch.cuda.get_device_capability(0) + out["device_0"] = {"name": torch.cuda.get_device_name(0), "capability": f"sm_{cap[0]}{cap[1]}"} + except Exception as e: + out["device_0_error"] = str(e) +else: + out["diagnostics"] = ["Missing /dev/nvidia* or podman machine without GPU passthrough"] +print(json.dumps(out, indent=2)) +'@ + $pyB64 = [Convert]::ToBase64String([Text.Encoding]::UTF8.GetBytes($pyDiag)) + $gpuScript = @' +echo '=== GPU Check ===' +which nvidia-smi && nvidia-smi || echo 'nvidia-smi unavailable' +echo '--- /dev/nvidia* ---' +ls -l /dev/nvidia* 2>/dev/null || echo 'no /dev/nvidia* nodes' +echo '--- Environment (NVIDIA_*) ---' +env | grep -E '^NVIDIA_' || echo 'no NVIDIA_* env vars' +if [ "$NVIDIA_VISIBLE_DEVICES" = "void" ]; then echo 'WARN: NVIDIA_VISIBLE_DEVICES=void (no GPU mapped)'; fi +echo '--- LD_LIBRARY_PATH ---' +echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" +source /home/vllmuser/venv/bin/activate 2>/dev/null || true +echo __PY_B64__ | base64 -d > /tmp/gpucheck.py +python /tmp/gpucheck.py || true +rm -f /tmp/gpucheck.py +'@ + $gpuScript = "export NVIDIA_VISIBLE_DEVICES=all; export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/lib/wsl/drivers:`$LD_LIBRARY_PATH; " + ($gpuScript -replace '__PY_B64__', $pyB64) -replace "`r","" + $runArgs += @('--user','root', $ImageTag,'bash','-lc',$gpuScript) +} elseif ($Setup) { + # Use robust setup entrypoint that finds the right script (extras/dev-setup.sh or 
+  # Avoid in-place edits on Windows-mounted files; run a CRLF-normalized temp copy instead
+  $prefix = 'TMP_RUN=$(mktemp /tmp/run-dev-setup.XXXX.sh); tr -d "\r" < ./extras/podman/dev-setup.sh > "$TMP_RUN" || cp ./extras/podman/dev-setup.sh "$TMP_RUN"; chmod +x "$TMP_RUN" 2>/dev/null || true; apply-vllm-patches || true; '
+  $envPrefix = ''
+  if ($Mirror) { $envPrefix += 'export LOCAL_MIRROR=1; ' }
+  if ($Progress) { $envPrefix += 'export PROGRESS_WATCH=1; ' }
+  # Pass configured archs from build.env (the Dockerfile already defaults to safe values)
+  if ($archList) { $envPrefix += "export TORCH_CUDA_ARCH_LIST='$archList'; " }
+  if ($cudaArchs) { $envPrefix += "export CUDAARCHS='$cudaArchs'; " }
+  $envPrefix += 'export TMPDIR=/opt/work/tmp; export TMP=/opt/work/tmp; export TEMP=/opt/work/tmp; mkdir -p /opt/work/tmp; '
+  $setupCmd = $prefix + $envPrefix + '"$TMP_RUN"'
+  if ($Progress) { $runArgs += @('-it', $ImageTag, 'bash','-lc', $setupCmd) } else { $runArgs += @($ImageTag, 'bash','-lc', $setupCmd) }
+  Write-Host "πŸ”§ Running dev setup" -ForegroundColor Green
+} elseif ($Interactive -and -not $Command) {
+  $runArgs += @('-it',$ImageTag,'bash')
+  Write-Host "πŸš€ Interactive shell" -ForegroundColor Green
+} elseif ($Command) {
+  $runArgs += @($ImageTag,'bash','-lc',"export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/lib/wsl/drivers:`$LD_LIBRARY_PATH; source /home/vllmuser/venv/bin/activate && $Command")
+  Write-Host "πŸš€ Running command" -ForegroundColor Green
+} else {
+  $runArgs += @($ImageTag)
+}
+
+Write-Host "Command: podman $($runArgs -join ' ')" -ForegroundColor Gray
+& podman @runArgs
+
+if ($LASTEXITCODE -eq 0 -and $Interactive) { Write-Host "Exited cleanly" -ForegroundColor Green }
diff --git a/extras/podman/run.sh b/extras/podman/run.sh
new file mode 100644
index 000000000000..278113440be2
--- /dev/null
+++ b/extras/podman/run.sh
@@ -0,0 +1,188 @@
+#!/usr/bin/env bash
+# Unified lightweight vLLM dev container launcher (Podman-first, Linux/macOS)
+set -euo pipefail
+
+IMAGE_TAG="vllm-dev:latest"
+CONTAINER_NAME="vllm-dev"
+SOURCE_DIR="$(pwd)"
+BUILD_NO_CACHE=0
+BUILD_PULL=0
+BUILD=0
+SETUP=0
+GPU_CHECK=0
+MIRROR=0
+PROGRESS=0
+CMD=""
+WORK_VOLUME=""
+
+show_help() {
+  cat <<EOF
+Usage: $0 [options]
+  --build              Build the dev image before running
+  --no-cache           Build without layer cache
+  --pull               Always pull the base image when building
+  --setup              Run extras/podman/dev-setup.sh inside the container
+  --gpu-check          Print GPU diagnostics and exit
+  --command CMD        Run a one-shot command inside the container
+  --work-volume NAME   Mount the named volume at /opt/work
+  --mirror             Export LOCAL_MIRROR=1 for package installs
+  --progress           Interactive output while setup runs
+  -h, --help           Show this help
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --build) BUILD=1 ;;
+    --no-cache) BUILD_NO_CACHE=1 ;;
+    --pull) BUILD_PULL=1 ;;
+    --setup) SETUP=1 ;;
+    --gpu-check) GPU_CHECK=1 ;;
+    --command) CMD="${2:-}"; shift ;;
+    --work-volume) WORK_VOLUME="${2:-}"; shift ;;
+    --mirror) MIRROR=1 ;;
+    --progress) PROGRESS=1 ;;
+    -h|--help) show_help; exit 0 ;;
+    *) echo "Unknown option: $1" >&2; show_help; exit 1 ;;
+  esac
+  shift
+done
+
+if ! command -v podman >/dev/null 2>&1; then
+  echo "Error: podman not found in PATH" >&2
+  exit 1
+fi
+
+echo "[vLLM] Engine: podman  Image: $IMAGE_TAG  Container: $CONTAINER_NAME"
+
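+# Example invocations (illustrative; see the help text above for the flag set):
+#   ./extras/podman/run.sh --build --setup
+#   ./extras/podman/run.sh --gpu-check
+#   VLLM_TMPFS_TMP_SIZE=16g ./extras/podman/run.sh --command "python -c 'import vllm'"
+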
+if [[ $BUILD -eq 1 ]]; then
+  echo "[vLLM] Building image..."
+  BUILD_ARGS=(-f extras/Dockerfile -t "$IMAGE_TAG")
+  # Load defaults from configs/build.env if present
+  if [[ -f extras/configs/build.env ]]; then
+    # shellcheck disable=SC1091
+    . extras/configs/build.env
+    [[ -n "${CUDA_VERSION:-}" ]] && BUILD_ARGS+=(--build-arg "CUDA_VERSION=$CUDA_VERSION")
+    [[ -n "${BASE_FLAVOR:-}" ]] && BUILD_ARGS+=(--build-arg "BASE_FLAVOR=$BASE_FLAVOR")
+    # Derive torch nightly index from CUDA version when not set
+    if [[ -z "${TORCH_CUDA_INDEX:-}" ]]; then
+      if [[ "${CUDA_VERSION:-}" =~ ^13\. ]]; then TORCH_CUDA_INDEX=cu130; elif [[ "${CUDA_VERSION:-}" =~ ^12\.9 ]]; then TORCH_CUDA_INDEX=cu129; fi
+    fi
+    [[ -n "${TORCH_CUDA_INDEX:-}" ]] && BUILD_ARGS+=(--build-arg "TORCH_CUDA_INDEX=${TORCH_CUDA_INDEX}")
+    [[ -n "${TORCH_CUDA_ARCH_LIST:-}" ]] && BUILD_ARGS+=(--build-arg "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST")
+    [[ -n "${CUDA_ARCHS:-}" ]] && BUILD_ARGS+=(--build-arg "CUDA_ARCHS=$CUDA_ARCHS")
+    [[ -n "${REQUIRE_FFMPEG:-}" ]] && BUILD_ARGS+=(--build-arg "REQUIRE_FFMPEG=$REQUIRE_FFMPEG")
+  fi
+  [[ $BUILD_NO_CACHE -eq 1 ]] && BUILD_ARGS=(--no-cache "${BUILD_ARGS[@]}")
+  [[ $BUILD_PULL -eq 1 ]] && BUILD_ARGS=(--pull=always "${BUILD_ARGS[@]}")
+  if ! podman build "${BUILD_ARGS[@]}" .; then
+    echo "[vLLM] Build failed" >&2
+    exit 1
+  fi
+  echo "[vLLM] Build complete"
+fi
+
+# If container running, attach / exec
+RUNNING=$(podman ps --filter "name=${CONTAINER_NAME}" --format '{{.Names}}' 2>/dev/null || true)
+
+if [[ "$RUNNING" == "$CONTAINER_NAME" ]]; then
+  if [[ $GPU_CHECK -eq 1 ]]; then
+    echo "[vLLM] GPU check (existing container)"
+    exec podman exec "$CONTAINER_NAME" bash -lc 'source /home/vllmuser/venv/bin/activate 2>/dev/null || true; which nvidia-smi && nvidia-smi || true; python - <<PY
+import torch
+print("torch:", torch.__version__)
+print("cuda_available:", torch.cuda.is_available())
+PY'
+  fi
+  if [[ $SETUP -eq 1 ]]; then
+    echo "[vLLM] Dev setup (existing container)"
+    if [[ $PROGRESS -eq 1 ]]; then
+      exec podman exec -it "$CONTAINER_NAME" bash -lc 'chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh'
+    else
+      exec podman exec "$CONTAINER_NAME" bash -lc 'chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh'
+    fi
+  fi
+  if [[ -n "$CMD" ]]; then
+    echo "[vLLM] Exec command in existing container"
+    podman exec "$CONTAINER_NAME" bash -lc "source /home/vllmuser/venv/bin/activate 2>/dev/null || true; $CMD"
+    exit $?
+  fi
+  read -r -p "Attach to running container ${CONTAINER_NAME}? [Y/n] " RESP || true
+  if [[ -z "$RESP" || "$RESP" =~ ^[Yy]$ ]]; then
+    exec podman exec -it "$CONTAINER_NAME" bash
+  else
+    exit 0
+  fi
+fi
+
+# Ensure image exists if not building
+if [[ $BUILD -ne 1 ]]; then
+  if ! podman image exists "$IMAGE_TAG"; then
+    echo "Image $IMAGE_TAG missing. Use --build." >&2; exit 1
+  fi
+fi
+
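+# Note: --device=nvidia.com/gpu=all resolves through CDI. If no CDI spec exists
+# yet, one can usually be generated with the NVIDIA container toolkit
+# (assumption: nvidia-ctk is installed on the host):
+#   sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
+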
+# Base run args (use entrypoint to auto-apply patches before commands)
+RUN_ARGS=(run --rm --device=nvidia.com/gpu=all --security-opt=label=disable --shm-size 8g --name "$CONTAINER_NAME" -v "${SOURCE_DIR}:/workspace:Z" -w /workspace --user vllmuser --env ENGINE=podman --entrypoint /workspace/extras/podman/entrypoint/apply-patches-then-exec.sh)
+
+# Prefer named volume for /opt/work if provided
+if [[ -n "$WORK_VOLUME" ]]; then
+  RUN_ARGS+=(-v "${WORK_VOLUME}:/opt/work:Z")
+fi
+
+# Allow configurable /tmp tmpfs size via VLLM_TMPFS_TMP_SIZE (default 0=disabled)
+TMPFS_SIZE="${VLLM_TMPFS_TMP_SIZE:-0}"
+if [[ -n "$TMPFS_SIZE" && "$TMPFS_SIZE" != "0" ]]; then
+  RUN_ARGS+=(--tmpfs "/tmp:size=${TMPFS_SIZE}")
+fi
+
+# Ensure sane NVIDIA env defaults inside container to avoid 'void' and missing caps
+RUN_ARGS+=(--env "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all}" \
+  --env "NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-compute,utility}" \
+  --env "NVIDIA_REQUIRE_CUDA=")
+
+if [[ $GPU_CHECK -eq 1 ]]; then
+  GPU_SCRIPT=$'echo "=== GPU Check ==="; which nvidia-smi && nvidia-smi || echo "nvidia-smi unavailable"; echo "--- /dev/nvidia* ---"; ls -l /dev/nvidia* 2>/dev/null || echo "no /dev/nvidia* nodes"; echo "--- Environment (NVIDIA_*) ---"; env | grep -E "^NVIDIA_" || echo "no NVIDIA_* env vars"; if [ "$NVIDIA_VISIBLE_DEVICES" = "void" ]; then echo "WARN: NVIDIA_VISIBLE_DEVICES=void (no GPU mapped)"; fi; echo "--- LD_LIBRARY_PATH ---"; echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"; source /home/vllmuser/venv/bin/activate 2>/dev/null || true; python - <<PY\nimport json, torch, os\nout={\'torch_version\':getattr(torch,\'__version__\',\'n/a\'),\'torch_cuda_version\':getattr(getattr(torch,\'version\',None),\'cuda\',\'n/a\'),\'cuda_available\':torch.cuda.is_available(),\'ld_library_path\':os.environ.get(\'LD_LIBRARY_PATH\')}\ntry:\n\tout[\'device_count\']=torch.cuda.device_count()\nexcept Exception as e:\n\tout[\'device_count_error\']=str(e)\nif out[\'cuda_available\'] and out.get(\'device_count\',0)>0:\n\ttry:\n\t\tcap=torch.cuda.get_device_capability(0)\n\t\tout[\'device_0\']={\'name\':torch.cuda.get_device_name(0),\'capability\':f"sm_{cap[0]}{cap[1]}"}\n\texcept Exception as e:\n\t\tout[\'device_0_error\']=str(e)\nelse:\n\tout[\'diagnostics\']=[\'Missing /dev/nvidia* or podman machine without GPU passthrough\']\nprint(json.dumps(out,indent=2))\nPY'
+  RUN_ARGS+=("$IMAGE_TAG" bash -lc "$GPU_SCRIPT")
+elif [[ $SETUP -eq 1 ]]; then
+  # Pass arch policy from configs/build.env if present
+  if [[ -f extras/configs/build.env ]]; then
+    # shellcheck disable=SC1091
+    . extras/configs/build.env
+    [[ -n "${TORCH_CUDA_ARCH_LIST:-}" ]] && RUN_ARGS+=(--env "TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}")
+    [[ -n "${CUDA_ARCHS:-}" ]] && RUN_ARGS+=(--env "CUDAARCHS=${CUDA_ARCHS}")
+  fi
+  [[ $MIRROR -eq 1 ]] && RUN_ARGS+=(--env LOCAL_MIRROR=1)
+  [[ $PROGRESS -eq 1 ]] && RUN_ARGS+=(--env PROGRESS_WATCH=1)
+  SETUP_CMD='TMP_RUN=$(mktemp /tmp/run-dev-setup.XXXX.sh); tr -d "\r" < ./extras/podman/dev-setup.sh > "$TMP_RUN" || cp ./extras/podman/dev-setup.sh "$TMP_RUN"; chmod +x "$TMP_RUN" 2>/dev/null || true; apply-vllm-patches || true; "$TMP_RUN"'
+  if [[ $PROGRESS -eq 1 ]]; then
+    RUN_ARGS+=("-it" "$IMAGE_TAG" bash -lc "$SETUP_CMD")
+  else
+    RUN_ARGS+=("$IMAGE_TAG" bash -lc "$SETUP_CMD")
+  fi
+elif [[ -n "$CMD" ]]; then
+  RUN_ARGS+=("$IMAGE_TAG" bash -lc "source /home/vllmuser/venv/bin/activate 2>/dev/null || true; $CMD")
+else
+  RUN_ARGS+=("-it" "$IMAGE_TAG" bash)
+  echo "[vLLM] Interactive shell. Helpful inside container:"
+  echo "  ./extras/dev-setup.sh   # Build/install editable vLLM"
+  echo "  python -c 'import torch;print(torch.cuda.is_available())'"
+  echo "  python -c 'import vllm'"
+fi
+
+echo "[vLLM] Command: podman ${RUN_ARGS[*]}"
+exec podman "${RUN_ARGS[@]}"
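
A quick sanity pass for the launcher above (illustrative only; the flags follow
the option table in run.sh's help text):

    # Build the image, run the editable install, then drop into a shell
    ./extras/podman/run.sh --build --setup
    ./extras/podman/run.sh

    # One-shot command and GPU diagnostics
    ./extras/podman/run.sh --command "python -c 'import vllm'"
    ./extras/podman/run.sh --gpu-check
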
diff --git a/extras/podman/scripts/gpu_status.sh b/extras/podman/scripts/gpu_status.sh
new file mode 100644
index 000000000000..a50c78b01c03
--- /dev/null
+++ b/extras/podman/scripts/gpu_status.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Helper to show GPU/CDI status under Podman (Linux/WSL)
+
+podman info --format json | jq '.host' || podman info || true
+
+# Show CDI devices if available (CDI specs usually live under /etc/cdi)
+nvidia-ctk cdi list 2>/dev/null || ls /etc/cdi 2>/dev/null || true
diff --git a/extras/secrets/.gitignore b/extras/secrets/.gitignore
new file mode 100644
index 000000000000..d4895ec18947
--- /dev/null
+++ b/extras/secrets/.gitignore
@@ -0,0 +1,4 @@
+# Ensure this directory stays out of git; keep this file only.
+*
+!.gitignore
+!README.md
diff --git a/extras/secrets/README.md b/extras/secrets/README.md
new file mode 100644
index 000000000000..b519087af198
--- /dev/null
+++ b/extras/secrets/README.md
@@ -0,0 +1,12 @@
+# secrets directory
+
+This directory is gitignored and intended for local-only secret material such as model hub tokens.
+
+Files are expected to be simple KEY=VALUE lines that can be sourced by shell scripts.
+
+Examples:
+
+- hf-credentials.env
+- cn-modelhub-credentials.env
+
+Do NOT commit secrets.
diff --git a/extras/storage/README.md b/extras/storage/README.md
new file mode 100644
index 000000000000..d106b6d7378c
--- /dev/null
+++ b/extras/storage/README.md
@@ -0,0 +1,7 @@
+# Storage helpers
+
+Declare and manage external volumes for models and caches.
+
+- storage-config.yaml: Declarative host/container paths
+- setup_local.sh: Helper to prepare a local volume or directory
+- scripts/: Utilities for warmup, cache management, mounts
diff --git a/extras/storage/scripts/warm_cache.sh b/extras/storage/scripts/warm_cache.sh
new file mode 100644
index 000000000000..1d97b7f044f6
--- /dev/null
+++ b/extras/storage/scripts/warm_cache.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Placeholder for cache warmup logic.
+# Example usage: ./warm_cache.sh meta-llama/Llama-3-8B /models
+MODEL_ID=${1:-meta-llama/Llama-3-8B}
+TARGET=${2:-/models}
+mkdir -p "$TARGET"
+echo "(scaffold) Would warm cache for $MODEL_ID under $TARGET"
diff --git a/extras/storage/setup_local.sh b/extras/storage/setup_local.sh
new file mode 100644
index 000000000000..101826bc7396
--- /dev/null
+++ b/extras/storage/setup_local.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Prepare a local directory for models and ensure reasonable permissions.
+TARGET=${1:-/mnt/ml-models}
+mkdir -p "$TARGET"
+chmod 775 "$TARGET" || true
+
+echo "Model storage prepared at: $TARGET"
diff --git a/extras/storage/storage-config.yaml b/extras/storage/storage-config.yaml
new file mode 100644
index 000000000000..90310b572b3c
--- /dev/null
+++ b/extras/storage/storage-config.yaml
@@ -0,0 +1,4 @@
+model_volume:
+  path_host: "/mnt/ml-models"
+  path_container: "/models"
+  shared: true
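
The secrets and storage pieces are meant to compose at `podman run` time. A
minimal sketch (the file name and the HF_TOKEN key are illustrative, not a
fixed contract):

    # extras/secrets/hf-credentials.env (local only, never committed):
    #   HF_TOKEN=hf_xxx

    # Prepare the host directory declared in storage-config.yaml, then mount it
    ./extras/storage/setup_local.sh /mnt/ml-models

    set -a; source extras/secrets/hf-credentials.env; set +a
    podman run --rm -v /mnt/ml-models:/models:Z --env HF_TOKEN vllm-dev:latest \
      bash -lc 'echo "token set: ${HF_TOKEN:+yes}"'
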
diff --git a/extras/testing/README.md b/extras/testing/README.md
new file mode 100644
index 000000000000..2c64d538ac97
--- /dev/null
+++ b/extras/testing/README.md
@@ -0,0 +1,7 @@
+# Testing and benchmarking harness
+
+- Define a matrix of models/environments in `test_matrix.yaml`.
+- Run via `python extras/testing/run_tests.py --output-dir extras/testing/results/$(date +%F_%H-%M)`.
+- Store results in `results/` with timestamps for regression tracking.
+
+This scaffolding is intentionally minimal; models and benchmarks can be added incrementally.
diff --git a/extras/testing/compare_results.py b/extras/testing/compare_results.py
new file mode 100644
index 000000000000..f6c91bdd6667
--- /dev/null
+++ b/extras/testing/compare_results.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import argparse
+import json
+
+
+def load(path: str) -> dict:
+    with open(path, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("a")
+    p.add_argument("b")
+    args = p.parse_args()
+
+    result_a = load(args.a)
+    result_b = load(args.b)
+
+    # Placeholder comparison: print keys present in only one of the two results
+    diffs = sorted(set(result_a.keys()) ^ set(result_b.keys()))
+    print(json.dumps({"diff_keys": diffs}))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/extras/testing/run_tests.py b/extras/testing/run_tests.py
new file mode 100644
index 000000000000..1dcea180b6d2
--- /dev/null
+++ b/extras/testing/run_tests.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Minimal, non-destructive test harness that prints a JSON line per test.
+This is a scaffold; integrate with your local launchers or CI as needed.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from datetime import datetime, timezone
+
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("--cuda-version",
+                   default=os.getenv("CUDA_VERSION", "13.0.0"))
+    p.add_argument("--ubi-version", default=os.getenv("UBI_VERSION", "9"))
+    p.add_argument("--models", default="Example-Llama3-8B")
+    # %Y-%m-%d instead of %F: identical on Linux, and it also works on Windows
+    p.add_argument("--output-dir",
+                   default=os.path.join("extras", "testing", "results",
+                                        datetime.now().strftime("%Y-%m-%d_%H-%M")))
+    args = p.parse_args()
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    result = {
+        "ts": datetime.now(timezone.utc).isoformat(),
+        "cuda": args.cuda_version,
+        "ubi": args.ubi_version,
+        "models": args.models.split(","),
+        "status": "scaffold",
+        "notes": "Integrate with vLLM server/client to collect real metrics.",
+    }
+
+    out_path = os.path.join(args.output_dir, "scaffold.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, indent=2)
+
+    print(json.dumps({"written": out_path}))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
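
Example end-to-end use of the scaffold (paths are illustrative):

    # Produce two timestamped result sets, then diff their top-level keys
    python extras/testing/run_tests.py --output-dir extras/testing/results/a
    python extras/testing/run_tests.py --models Example-Llama3-8B \
      --output-dir extras/testing/results/b
    python extras/testing/compare_results.py \
      extras/testing/results/a/scaffold.json extras/testing/results/b/scaffold.json
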
diff --git a/extras/testing/test_matrix.yaml b/extras/testing/test_matrix.yaml
new file mode 100644
index 000000000000..fcd9e878adf9
--- /dev/null
+++ b/extras/testing/test_matrix.yaml
@@ -0,0 +1,16 @@
+models:
+  - name: Example-Llama3-8B
+    id: meta-llama/Llama-3-8B
+    chat_template: chat_templates/llama-3-instruct.jinja
+    params:
+      max_tokens: 64
+      temperature: 0.7
+
+environments:
+  - cuda: 13.0.0
+    ubi: 9
+
+benchmarks:
+  - name: inference_speed
+    input: "Summarize: vLLM extras modularization plan."
+    metrics: [latency_ms, tokens_per_sec]
diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 7963fb15c419..69f38fd0a178 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -143,7 +143,9 @@ def get_instance() -> "CuMemAllocator":
         return CuMemAllocator.instance
 
     def __init__(self):
-        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+        # Prefer new env var; fall back to deprecated one for compatibility
+        conf = os.environ.get("PYTORCH_ALLOC_CONF",
+                              os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""))
         assert "expandable_segments:True" not in conf, \
             ("Expandable segments are not compatible with memory pool. "
              "Please track https://github.com/pytorch/pytorch/issues/147851 "