diff --git a/.gemini/config.yaml b/.gemini/config.yaml
deleted file mode 100644
index 2499d3f09510..000000000000
--- a/.gemini/config.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
-have_fun: false # Just review the code
-code_review:
-  comment_severity_threshold: HIGH # Reduce quantity of comments
-  pull_request_opened:
-    summary: false # Don't summarize the PR in a separate comment
diff --git a/.github/ci-trigger-20250814-1 b/.github/ci-trigger-20250814-1
new file mode 100644
index 000000000000..8ca993aa58b2
--- /dev/null
+++ b/.github/ci-trigger-20250814-1
@@ -0,0 +1 @@
+trigger: sync_with_upstream
diff --git a/.github/workflows/sync_with_upstream.yml b/.github/workflows/sync_with_upstream.yml
new file mode 100644
index 000000000000..5dce797dae16
--- /dev/null
+++ b/.github/workflows/sync_with_upstream.yml
@@ -0,0 +1,99 @@
+name: Sync with Upstream
+
+on:
+  schedule:
+    - cron: '0 0 * * *'  # Runs daily at midnight UTC
+  push:
+    branches:
+      - main
+
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Git
+        run: |
+          git config --global user.name 'Zhuul'
+          git config --global user.email '40538530+Zhuul@users.noreply.github.com'
+
+      - name: Add upstream remote
+        run: git remote add upstream https://github.com/vllm-project/vllm.git
+
+      - name: Fetch upstream changes
+        run: git fetch upstream
+
+      - name: Merge upstream changes
+        id: merge
+        run: |
+          git checkout main
+          git merge upstream/main --allow-unrelated-histories --no-edit || {
+            echo "Merge conflict detected. Creating a new branch for manual resolution."
+            BRANCH="merge-conflict-$(date +%Y%m%d%H%M%S)"
+            git checkout -b "$BRANCH"
+            # Push the conflict branch so it can be resolved manually
+            git push origin HEAD
+            echo "conflict=true" >> "$GITHUB_OUTPUT"
+            echo "conflict_branch=$BRANCH" >> "$GITHUB_OUTPUT"
+            exit 1
+          }
+          echo "conflict=false" >> "$GITHUB_OUTPUT"
+
+      - name: Check for workflow file changes
+        id: workflow_change
+        run: |
+          if git diff --name-only upstream/main | grep '^\.github/workflows/'; then
+            echo "workflow_changed=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "workflow_changed=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      # Use GITHUB_TOKEN for authentication, sufficient for repo write access in actions
+      - name: Set up authentication
+        run: git remote set-url origin "https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/Zhuul/vllm.git"
+
+      - name: Push changes if no workflow files changed
+        if: steps.workflow_change.outputs.workflow_changed == 'false' && steps.merge.outputs.conflict == 'false'
+        run: git push origin main
+
+      - name: Create Pull Request for workflow file changes
+        if: steps.workflow_change.outputs.workflow_changed == 'true' && steps.merge.outputs.conflict == 'false'
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          commit-message: "Sync with upstream: update workflow files"
+          title: "Sync with upstream: update workflow files"
+          body: |
+            This PR was automatically created because workflow files were updated while syncing with upstream.
+            Please review and merge.
+          branch: workflow-sync-${{ github.run_id }}
+          base: main
+
+      # Notification step: create an issue if merge conflict detected.
+      # The merge step exits non-zero on conflict, so this step must opt in
+      # with failure(); otherwise it would be skipped when the job fails.
+      - name: Create Issue on Merge Conflict
+        if: failure() && steps.merge.outputs.conflict == 'true'
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: "Merge Conflict Detected During Upstream Sync",
+              body: `
+              A merge conflict occurred while syncing with upstream (vllm-project/vllm).
+              Branch for manual resolution: ${{ steps.merge.outputs.conflict_branch }}
+
+              Please resolve this conflict at https://github.com/${context.repo.owner}/${context.repo.repo}/tree/${{ steps.merge.outputs.conflict_branch }}
+
+              This issue was automatically created by the sync workflow.
+              `
+            })
+
+      - name: Log completion
+        run: echo "Sync with upstream completed. Thank you for using automated upstream sync πŸš€"
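As context for the step wiring above: the merge step communicates with later steps by appending `key=value` pairs to the file named by `$GITHUB_OUTPUT`, which Actions then exposes as `steps.merge.outputs.*`. A minimal sketch of that mechanism, runnable outside Actions by pointing `GITHUB_OUTPUT` at a scratch file (the `mktemp` fallback is only for local testing):

```bash
# Minimal sketch of the step-output mechanism used by the merge step above.
# Outside GitHub Actions, GITHUB_OUTPUT is unset; point it at a temp file.
export GITHUB_OUTPUT="${GITHUB_OUTPUT:-$(mktemp)}"

# What the workflow writes on the conflict path:
BRANCH="merge-conflict-$(date +%Y%m%d%H%M%S)"
echo "conflict=true" >> "$GITHUB_OUTPUT"
echo "conflict_branch=$BRANCH" >> "$GITHUB_OUTPUT"

# Later steps read these back as steps.merge.outputs.conflict / conflict_branch.
cat "$GITHUB_OUTPUT"
```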
diff --git a/.gitignore b/.gitignore
index 465935d488f8..a5bd3740e844 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,4 +209,5 @@ shellcheck*/
 csrc/moe/marlin_moe_wna16/kernel_*
 
 # Ignore ep_kernels_workspace folder
-ep_kernels_workspace/
\ No newline at end of file
+ep_kernels_workspace/node_modules/
+package*.json
diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu
index f051eb070222..be7be6214987 100644
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -30,7 +30,7 @@ __global__ void rms_norm_kernel(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum());
 
   if (threadIdx.x == 0) {
     s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -85,7 +85,7 @@ fused_add_rms_norm_kernel(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum());
 
   if (threadIdx.x == 0) {
     s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -126,7 +126,7 @@ fused_add_rms_norm_kernel(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum());
 
   if (threadIdx.x == 0) {
     s_variance = rsqrtf(variance / hidden_size + epsilon);
diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu
index 0fd5849d9626..6427396471e2 100644
--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@@ -39,7 +39,7 @@ __global__ void rms_norm_static_fp8_quant_kernel(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum());
 
   if (threadIdx.x == 0) {
     s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -100,7 +100,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum());
 
   if (threadIdx.x == 0) {
     s_variance = rsqrtf(variance / hidden_size + epsilon);
@@ -149,7 +149,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
+  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum());
 
   if (threadIdx.x == 0) {
     s_variance = rsqrtf(variance / hidden_size + epsilon);
diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
index d8369108d0bd..f4a646471c28 100644
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -173,7 +173,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
   });
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage tmp;
-  float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
+  float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max());
   __shared__ float absmax;
   if (tid == 0) {
     absmax = block_max;
diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu
index 5fe5dd04bd89..ae7d0d81eb4c 100644
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@@ -116,7 +116,7 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided(
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage tmp;
   const float block_max =
-      BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, blockDim.x);
+      BlockReduce(tmp).Reduce(absmax_val, cub::Max());
 
   __shared__ float token_scale;
   if (tid == 0) {
diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh
index 3f188872d80d..b7cc1f0a0b5f 100644
--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
+++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
@@ -36,7 +36,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
+  ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum());
 
   __shared__ float s_rms;
   if (threadIdx.x == 0) {
@@ -73,7 +73,7 @@ __device__ void compute_dynamic_per_token_scales(
   __shared__ typename BlockReduce::TempStorage reduceStore;
   block_absmax_val_maybe = BlockReduce(reduceStore)
-          .Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
+          .Max(block_absmax_val_maybe);
 
   __shared__ float s_token_scale;
   if (threadIdx.x == 0) {
@@ -169,7 +169,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
-  ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
+  ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum());
 
   __shared__ float s_rms;
   if (threadIdx.x == 0) {
@@ -240,7 +240,7 @@ __device__ void compute_dynamic_per_token_scales(
   __shared__ typename BlockReduce::TempStorage reduceStore;
   block_absmax_val_maybe = BlockReduce(reduceStore)
-          .Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
+          .Max(block_absmax_val_maybe);
 
   __shared__ float s_token_scale;
   if (threadIdx.x == 0) {
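All the csrc changes above make the same substitution around `cub::BlockReduce`. A standalone sketch of the call forms involved (not vLLM code; a 1024-thread block is assumed, matching the kernels above): the old call passed a `cub::Sum{}`/`cub::Max{}` functor plus a valid-items count, while the convenience methods like `Sum()` sidestep the functor entirely, which is what the CUDA 13 patch in this PR falls back to.

```cpp
// Standalone sketch (not vLLM code) of the BlockReduce call forms touched above.
// Assumes a 1024-thread block, as in the kernels in this PR.
#include <cub/block/block_reduce.cuh>

__global__ void block_sum_demo(const float* in, float* out) {
  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage tmp;

  float v = in[threadIdx.x];
  // Old form in vLLM: Reduce(v, cub::Sum{}, blockDim.x) — functor plus a
  // valid-items count. This PR switches to Reduce(v, cub::Sum()) or, in the
  // CUDA 13 patch, to the convenience method below, which never names the
  // cub::Sum functor at all.
  float total = BlockReduce(tmp).Sum(v);
  if (threadIdx.x == 0) *out = total;
}
```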
diff --git a/docs/contributing/README.md b/docs/contributing/README.md
index 5a2a70d57e85..1c2a31cf895c 100644
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@@ -31,6 +31,8 @@ See .
 
 Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source][build-from-source] documentation for details.
 
+For a containerized developer workflow, see Podman-first dev: `docs/contributing/podman-dev.md`.
+
 For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
 
 ### Building the docs with MkDocs
diff --git a/docs/contributing/podman-dev.md b/docs/contributing/podman-dev.md
new file mode 100644
index 000000000000..881e495f8421
--- /dev/null
+++ b/docs/contributing/podman-dev.md
@@ -0,0 +1,41 @@
+---
+title: Podman-first Development Environment
+---
+
+This guide documents the Podman-first development workflow for building vLLM from source with CUDA and PyTorch nightly.
+
+Primary entrypoint
+
+- Windows (PowerShell): `./extras/podman/run.ps1`
+- Linux/macOS (bash): `extras/podman/run.sh`
+
+Legacy launchers at `extras/run-vllm-dev.ps1` and `extras/run-vllm-dev.sh` are deprecated and forward to the Podman wrappers.
+
+Prerequisites
+
+- Podman with GPU CDI enabled (on Windows, use Podman Desktop + WSL; ensure NVIDIA drivers and CUDA are installed on the host).
+- Optional named volume for build/work space, e.g., `vllm-work`.
+
+Quick start
+
+Windows (PowerShell):
+
+```powershell
+./extras/podman/run.ps1 -Build
+./extras/podman/run.ps1 -GPUCheck
+./extras/podman/run.ps1 -Setup -WorkVolume vllm-work -Progress
+```
+
+Linux/macOS (bash):
+
+```bash
+extras/podman/run.sh --build
+extras/podman/run.sh --gpu-check
+extras/podman/run.sh --setup --work-volume vllm-work --progress
+```
+
+Notes
+
+- The image defaults to CUDA 13.0 on a Rocky Linux 9 base and installs the matching PyTorch nightly (cu130) first to ensure latest GPU arch support (including sm_120 when present).
+- The setup step performs an editable vLLM install without downgrading torch family packages.
+- Use a named Podman volume for `/opt/work` to avoid `/tmp` tmpfs pressure and to speed up rebuilds.
diff --git a/extras/.dockerignore b/extras/.dockerignore
new file mode 100644
index 000000000000..60a8d81a82c1
--- /dev/null
+++ b/extras/.dockerignore
@@ -0,0 +1,39 @@
+# Reduce build context to avoid Windows Podman tar write issues
+.git
+.github
+.vscode
+.venv
+venv
+node_modules
+build
+dist
+csrc/
+vllm/
+benchmarks/
+docs/
+examples/
+tests/
+**/__pycache__
+**/*.pyc
+**/*.pyo
+**/*.pyd
+**/*.so
+**/*.o
+**/*.a
+**/*.dll
+**/*.dylib
+extras/build.log
+extras/*.bak
+extras/tools/
+extras/run-vllm-dev-*.ps1
+extras/run-vllm-dev-*.sh
+extras/*wsl*
+extras/*docker*.ps1
+
+!extras/Dockerfile
+!extras/run-vllm-dev.ps1
+!extras/run-vllm-dev.sh
+!extras/dev-setup.sh
+requirements/
+pyproject.toml
+setup.py
diff --git a/extras/Dockerfile b/extras/Dockerfile
new file mode 100644
index 000000000000..69ee583e5bb7
--- /dev/null
+++ b/extras/Dockerfile
@@ -0,0 +1,266 @@
+# vLLM Development Container with GPU Support
+# Uses vLLM's own requirements for automatic dependency management
+
+# Build-time args to control CUDA/OS base and PyTorch nightly index
+ARG CUDA_VERSION=13.0.0
+ARG UBI_VERSION=9
+ARG TORCH_CUDA_INDEX=cu130
+# Base flavor for CUDA image: e.g. 
'rockylinux9' (default) or 'ubi9' +ARG BASE_FLAVOR=rockylinux9 + +# Switchable base: defaults to Rocky Linux to avoid subscription-gated repos +FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-${BASE_FLAVOR} + +# Set CUDA environment variables for build tools +ENV CUDA_HOME=/usr/local/cuda +ENV CUDA_ROOT=/usr/local/cuda +ENV PATH=$CUDA_HOME/bin:$PATH +ENV LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH +ENV CUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME +ENV CUDNN_LIBRARY_PATH=/usr/lib64 +ENV CUDNN_INCLUDE_PATH=/usr/include + +# Install system packages with additional CUDA development libraries +RUN dnf update -y && dnf install --allowerasing -y \ + python3 python3-pip python3-devel \ + git gcc gcc-c++ cmake \ + make patch which findutils tar rsync \ + wget curl vim nano pkgconfig \ + zlib-devel bzip2 bzip2-devel xz xz-devel libffi-devel \ + openssl-devel sqlite-devel \ + && (dnf install -y readline-devel || true) \ + && dnf clean all + +# Prefer Python 3.12 from packages if available (fallback to system python3) +RUN dnf install -y python3.12 python3.12-devel || true + +## Ensure /usr/bin/python exists for compatibility +RUN ln -sf $(command -v python3) /usr/bin/python || true + +# Create a non-root user for development +RUN useradd -m -s /bin/bash vllmuser && \ + echo "vllmuser ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + +# Install essential system tools +RUN dnf install -y hostname iproute iputils + +ARG REQUIRE_FFMPEG=1 +# Multimedia and image libs with optional ffmpeg-devel enforcement +# Install EPEL and RPM Fusion repos for EL (9/10) and pull ffmpeg/ffmpeg-devel from there. +# When REQUIRE_FFMPEG=1, fail the build if ffmpeg is still unavailable. +RUN set -euxo pipefail \ + && (dnf install -y dnf-plugins-core || true) \ + && (dnf config-manager --set-enabled crb || true) \ + && (dnf makecache -y || true) \ + && . 
/etc/os-release \ + && ELVER="${VERSION_ID%%.*}" \ + && echo "[Dockerfile] Detected Enterprise Linux major version: ${ELVER}" \ + && dnf install -y \ + libjpeg-turbo-devel libpng-devel zlib-devel freetype-devel \ + libsndfile libsndfile-devel sox sox-devel || true \ + && if [ "${REQUIRE_FFMPEG}" = "1" ]; then \ + echo "[Dockerfile] Enabling EPEL and RPM Fusion for ffmpeg (EL${ELVER})"; \ + dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-${ELVER}.noarch.rpm; \ + dnf install -y https://download1.rpmfusion.org/free/el/rpmfusion-free-release-${ELVER}.noarch.rpm; \ + dnf install -y https://download1.rpmfusion.org/nonfree/el/rpmfusion-nonfree-release-${ELVER}.noarch.rpm; \ + dnf makecache -y; \ + dnf install -y ffmpeg ffmpeg-devel; \ + command -v ffmpeg >/dev/null 2>&1; \ + else \ + # Best-effort install when not enforced + (dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-${ELVER}.noarch.rpm || true); \ + (dnf install -y https://download1.rpmfusion.org/free/el/rpmfusion-free-release-${ELVER}.noarch.rpm || true); \ + (dnf install -y https://download1.rpmfusion.org/nonfree/el/rpmfusion-nonfree-release-${ELVER}.noarch.rpm || true); \ + (dnf makecache -y || true); \ + (dnf install -y ffmpeg ffmpeg-devel || true); \ + fi \ + && (dnf install -y --enablerepo=crb ninja-build || \ + dnf install -y --enablerepo=crb ninja || \ + dnf install -y ninja-build || \ + dnf install -y ninja || true) \ + && dnf clean all || true + + + +# Add NVIDIA Machine Learning repo for RHEL9/UBI9 and install NCCL runtime/devel +# Needed for PyTorch nightly cu129 to avoid ncclCommWindowRegister symbol errors +# Install NCCL runtime/devel from the CUDA repository available in the base image +RUN set -euxo pipefail \ + && dnf makecache -y \ + && (dnf install -y libnccl libnccl-devel || dnf install -y libnccl-2 libnccl-devel-2) \ + && dnf clean all + +# Set working directory and adjust ownership +WORKDIR /workspace +RUN chown -R vllmuser:vllmuser /workspace + +# Create build directories with proper permissions +RUN mkdir -p /workspace/.deps && chown -R vllmuser:vllmuser /workspace/.deps && \ + mkdir -p /tmp/vllm-build && chmod 777 /tmp/vllm-build && \ + mkdir -p /opt/work && chmod 777 /opt/work && \ + mkdir -p /home/vllmuser/.cache && chown -R vllmuser:vllmuser /home/vllmuser/.cache && \ + mkdir -p /home/vllmuser/.ccache && chown -R vllmuser:vllmuser /home/vllmuser/.ccache && \ + mkdir -p /home/vllmuser/.cmake && chown -R vllmuser:vllmuser /home/vllmuser/.cmake && \ + chmod -R 755 /workspace && \ + chmod -R 777 /tmp + +# Switch to the non-root user +USER vllmuser + +# Create and activate virtual environment using the best available Python (3.12 preferred) +ENV VIRTUAL_ENV=/home/vllmuser/venv +RUN PY_BIN="$(command -v python3.12 || command -v python3)" && "$PY_BIN" -m venv $VIRTUAL_ENV +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# Set pip configuration +ENV PIP_DISABLE_PIP_VERSION_CHECK=1 +ENV PIP_NO_CACHE_DIR=1 +ENV PYTHONUNBUFFERED=1 +ENV PIP_DEFAULT_TIMEOUT=120 +ENV PIP_RETRIES=5 +ENV PIP_PREFER_BINARY=1 + +# CUDA arch list: CUDA 13+ drops SM70/SM75; default to supported archs only. +# Override at build time with: --build-arg TORCH_CUDA_ARCH_LIST="..." +ARG TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0 12.0 13.0" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" + +# Also set CUDAARCHS (semicolon separated) for CMake/NVCC generators. 
+# Override at build time with: --build-arg CUDA_ARCHS="80;86;89;90;120" +ARG CUDA_ARCHS="80;86;89;90;120" +ENV CUDAARCHS="${CUDA_ARCHS}" + +# Upgrade pip and setuptools to latest versions +RUN pip install --upgrade pip setuptools>=61 wheel + +COPY requirements/ /tmp/requirements/ + +# Install PyTorch nightly first (includes latest GPU arch support such as Blackwell sm_120 when present) +ARG TORCH_CUDA_INDEX +RUN pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA_INDEX} + +RUN pip install --pre torchvision --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA_INDEX} +RUN pip install --pre torchaudio --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA_INDEX} + +# Install PyAV for torchvision video I/O (read_video) compatibility +RUN pip install --upgrade av + +# Install TorchCodec to support torchaudio.load on recent nightlies +RUN set -euxo pipefail \ + && (pip install --pre torchcodec \ + || pip install torchcodec \ + || pip install --no-deps 'git+https://github.com/pytorch/torchcodec@main') + +# Install modern build tools and vLLM's build dependencies and CUDA deps early, +# but sanitize requirements to avoid downgrading torch-family or forcing xformers pins. +COPY pyproject.toml /tmp/pyproject.toml +RUN set -euxo pipefail \ + && cd /tmp \ + && pip install "setuptools>=61" "setuptools-scm>=8" build wheel ninja cmake \ + && mkdir -p /tmp/requirements_sanitized \ + && for f in build.txt cuda.txt common.txt; do \ + if [ -f "/tmp/requirements/$f" ]; then \ + sed -E '/^(torch|torchvision|torchaudio|xformers)\b/Id' "/tmp/requirements/$f" > "/tmp/requirements_sanitized/$f"; \ + fi; \ + done \ + && pip install --pre \ + -r /tmp/requirements_sanitized/build.txt \ + -r /tmp/requirements_sanitized/cuda.txt \ + -r /tmp/requirements_sanitized/common.txt \ + && pip install --pre --upgrade \ + torch --index-url https://download.pytorch.org/whl/nightly/${TORCH_CUDA_INDEX} + +# Install minimal development extras +RUN pip install pytest pytest-asyncio ipython + +# Note: vLLM will be installed from source in development mode via dev-setup.sh +# This ensures compatibility with the PyTorch nightly build + +# Create activation script for easy virtual environment access +RUN echo '#!/bin/bash' > /home/vllmuser/activate_venv.sh && \ + echo 'source /home/vllmuser/venv/bin/activate' >> /home/vllmuser/activate_venv.sh && \ + echo 'echo "Virtual environment activated: $VIRTUAL_ENV"' >> /home/vllmuser/activate_venv.sh && \ + echo 'echo "Python version: $(python --version)"' >> /home/vllmuser/activate_venv.sh && \ + echo 'echo "PyTorch version: $(python -c \"import torch; print(torch.__version__)\")"' >> /home/vllmuser/activate_venv.sh && \ + echo 'echo "CUDA available: $(python -c \"import torch; print(torch.cuda.is_available())\")"' >> /home/vllmuser/activate_venv.sh && \ + chmod +x /home/vllmuser/activate_venv.sh + +# Ensure virtual environment is activated in .bashrc +RUN echo 'source /home/vllmuser/venv/bin/activate' >> /home/vllmuser/.bashrc && \ + echo 'echo "🐍 Python virtual environment activated"' >> /home/vllmuser/.bashrc && \ + echo 'echo "πŸš€ Ready for vLLM development!"' >> /home/vllmuser/.bashrc + +# Create development helper script that uses current workspace requirements +RUN echo '#!/bin/bash' > /home/vllmuser/setup_vllm_dev.sh && \ + echo 'echo "πŸ”§ Setting up vLLM for development..."' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'cd /workspace' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo '# Use temporary build directory to 
avoid permission issues' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'export TMPDIR=/tmp/vllm-build' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'mkdir -p "$TMPDIR" && chmod 777 "$TMPDIR"' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'export CMAKE_BUILD_PARALLEL_LEVEL=4' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'export VLLM_INSTALL_PUNICA_KERNELS=0' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'export MAX_JOBS=4' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo '# Install current workspace requirements first' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'if [ -f requirements/common.txt ]; then pip install -r requirements/common.txt; fi' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo '# Use temporary directory for CMake build files' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'FETCHCONTENT_BASE_DIR="$TMPDIR/deps" pip install -e . --no-deps --no-build-isolation --verbose' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'echo "βœ… vLLM installed in editable mode!"' >> /home/vllmuser/setup_vllm_dev.sh && \ + echo 'python -c "import vllm; print(\"vLLM version:\", vllm.__version__)"' >> /home/vllmuser/setup_vllm_dev.sh && \ + chmod +x /home/vllmuser/setup_vllm_dev.sh + +# Provide a helper to apply repo patches against the mounted /workspace +# Create under /usr/local/bin as root, then switch back to non-root user +USER root +RUN printf '%s\n' \ + '#!/usr/bin/env bash' \ + 'set -euo pipefail' \ + 'cd /workspace 2>/dev/null || exit 0' \ + 'SCRIPT=./extras/patches/apply_patches.sh' \ + 'if [ -f "$SCRIPT" ]; then' \ + ' echo "[apply-patches] Running $SCRIPT"' \ + ' # Copy to temp and normalize EOL to avoid permission errors on mounted FS' \ + ' TMP_SCRIPT=$(mktemp /tmp/apply_patches.XXXXXX.sh)' \ + ' tr -d '\''\r'\'' < "$SCRIPT" > "$TMP_SCRIPT" || cp "$SCRIPT" "$TMP_SCRIPT"' \ + ' chmod +x "$TMP_SCRIPT"' \ + ' bash "$TMP_SCRIPT" || {' \ + ' echo "[apply-patches] Warning: patch apply failed (continuing)" >&2; exit 0; }' \ + 'fi' \ + > /usr/local/bin/apply-vllm-patches && \ + chmod +x /usr/local/bin/apply-vllm-patches +USER vllmuser + +# Add environment variables for better CUDA memory management and build optimization +# Use the new variable name to avoid deprecation warnings. +# (Not working with vllm) +# ENV PYTORCH_ALLOC_CONF=expandable_segments:True +# +# Do not pin a single GPU here; let runtime inject device selection +# ENV CUDA_VISIBLE_DEVICES=0 +ENV CMAKE_BUILD_PARALLEL_LEVEL=4 +ENV VLLM_INSTALL_PUNICA_KERNELS=0 +ENV MAX_JOBS=4 + +# Enable ccache for faster rebuilds +ENV CCACHE_DIR=/home/vllmuser/.ccache +ENV CCACHE_MAXSIZE=10G +ENV PATH=/usr/lib64/ccache:$PATH + +# (TORCH_CUDA_ARCH_LIST defined earlier) +# Do not force-disable Machete; allow upstream defaults. User may still pass -e CMAKE_ARGS for custom CMake settings. 
+ENV CMAKE_ARGS="" + +# WSL2-specific CUDA environment configuration +ENV NVIDIA_VISIBLE_DEVICES=all +ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility +ENV LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/lib/wsl/drivers:/usr/lib/wsl/lib:/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH + +# Add runtime library detection script +RUN echo '#!/bin/bash' > /home/vllmuser/check_cuda_libs.sh && \ + echo 'echo "=== CUDA Library Check ==="' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'echo "Searching for CUDA libraries..."' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'find /usr/lib/wsl -name "libcuda.so*" 2>/dev/null | head -3 || echo "No WSL CUDA libs"' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'ldconfig -p | grep cuda | head -3 || echo "No CUDA in ldconfig"' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'echo "PyTorch CUDA status:"' >> /home/vllmuser/check_cuda_libs.sh && \ + echo 'python -c "import torch; print(f\"CUDA available: {torch.cuda.is_available()}\"); print(f\"Device count: {torch.cuda.device_count()}\")" 2>/dev/null || echo "PyTorch not available"' >> /home/vllmuser/check_cuda_libs.sh && \ + chmod +x /home/vllmuser/check_cuda_libs.sh diff --git a/extras/README.md b/extras/README.md new file mode 100644 index 000000000000..a33042a97c58 --- /dev/null +++ b/extras/README.md @@ -0,0 +1,65 @@ +# extras/ overview + +This directory hosts non-core assets: container/build tooling, configs, testing, storage helpers, and optional patches. Everything here is designed to be self-contained and safe for Windows + WSL and Linux. + +Layout + +- podman/ β€” Podman-first run/build wrappers (Windows PowerShell + bash) +- configs/ β€” Centralized versions and build configuration +- patches/ β€” Optional patches applied automatically at container start +- storage/ β€” Volume/caching helpers +- testing/ β€” Test harness, matrices, and results +- secrets/ β€” Local, gitignored credentials + +Primary entrypoint: `extras/podman/run.ps1` (Windows) or `extras/podman/run.sh` (Linux/macOS). + +## What’s new + +- CUDA 13.0 base (Rocky Linux 9) with PyTorch nightlies and ffmpeg stack. +- Default CUDA arch policy updated for CUDA 13 (drops SM70/SM75): + - TORCH_CUDA_ARCH_LIST: "8.0 8.6 8.9 9.0 12.0 13.0" + - CUDAARCHS: "80;86;89;90;120" + - Override via `extras/configs/build.env` or environment variables. +- Auto-patch on container start (idempotent, CRLF-safe): + - 0001-cumem-alloc-env-fallback.diff β€” prefer PYTORCH_ALLOC_CONF + - 0002-cub-reduce-to-sum-cuda13.diff β€” CUB Reduce->Sum compatibility +- Setup flow is CRLF/WSL-safe: scripts run from a normalized temp copy. + +## Quick start + +1) Configure (optional): edit `extras/configs/build.env`. +2) Build the image: + - Windows: `./extras/podman/run.ps1 -Build` + - Linux/macOS: `extras/podman/run.sh --build` +3) GPU check: + - Windows: `./extras/podman/run.ps1 -GPUCheck` + - Linux/macOS: `extras/podman/run.sh --gpu-check` +4) Install vLLM in editable mode (compiles extensions): + - Windows: `./extras/podman/run.ps1 -Setup -WorkVolume vllm-work -Progress` + - Linux/macOS: `extras/podman/run.sh --setup --work-volume vllm-work --progress` + +Notes for Windows/WSL + +- The launcher maps /dev/dxg and WSL libraries automatically; NV env vars are set safely (no "void"). 
+- PowerShell quoting for inline Python: + - `./extras/podman/run.ps1 -Command 'python -c "import torch;print(torch.__version__)"'` +- Scripts avoid in-place edits on the mounted repo to prevent permission errors. + +## Patches + +Place `.diff` files in `extras/patches/`. On container start, a helper normalizes CRLF, applies patches, or uses targeted Python fallbacks for known fragile hunks. No source-file changes are committed to the host by design. + +## Storage and caches + +Use a named volume for large builds and cache: + +- `-WorkVolume vllm-work` (PowerShell) +- `--work-volume vllm-work` (bash) + +## Testing + +See `extras/testing/README.md` for matrix and run helpers. + +## Secrets + +See `extras/secrets/README.md` for token handling. diff --git a/extras/configs/README.md b/extras/configs/README.md new file mode 100644 index 000000000000..062170cbd2a6 --- /dev/null +++ b/extras/configs/README.md @@ -0,0 +1,16 @@ +# configs README + +This folder centralizes editable configuration for images/builds: + +- build.env: Bash-exported defaults (CUDA/UBI/Python/vLLM tag, arch list, volumes) +- build.yaml (optional): YAML equivalent for tools that prefer structured configs +- versions.json (optional): Machine-friendly manifest for automation + +Consumers (scripts/Containerfiles) should read values from here and allow runtime overrides via environment variables. + +CUDA 13 arch policy + +- TORCH_CUDA_ARCH_LIST defaults to: "8.0 8.6 8.9 9.0 12.0 13.0" +- CUDAARCHS defaults to: "80;86;89;90;120" + +Both `extras/podman/run.ps1` and `extras/podman/run.sh` read build.env and pass these values into builds and setup runs. diff --git a/extras/configs/build.env b/extras/configs/build.env new file mode 100644 index 000000000000..c2b015526e65 --- /dev/null +++ b/extras/configs/build.env @@ -0,0 +1,38 @@ +# Build configuration +# +# Scripts should source this file to obtain default versions. +# Values can be overridden by environment variables provided at runtime. + +# CUDA / UBI / Python baselines +export CUDA_VERSION=${CUDA_VERSION:-13.0.0} +export UBI_VERSION=${UBI_VERSION:-9} +export PYTHON_VERSION=${PYTHON_VERSION:-3.12} +export BASE_FLAVOR=${BASE_FLAVOR:-rockylinux9} + +# vLLM branch/tag to use inside the container when cloning or referring +export VLLM_TAG=${VLLM_TAG:-main} + +## Architectures (space separated) for PyTorch/NVCC +# CUDA 13+ no longer supports SM70/SM75; default to supported archs only. +# Include Blackwell via sm_120 (13.0) while keeping Hopper/Ada. +export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST:-"8.0 8.6 8.9 9.0 12.0 13.0"} +# Semicolon-separated CUDAARCHS for CMake/NVCC generators +export CUDA_ARCHS=${CUDA_ARCHS:-"80;86;89;90;120"} + +# Named volume for build scratch/work dir (Podman recommended) +export VLLM_WORK_VOLUME=${VLLM_WORK_VOLUME:-vllm-work} +export VLLM_WORK_DIR_CONTAINER=${VLLM_WORK_DIR_CONTAINER:-/opt/work} + +# Image naming +export VLLM_BASE_IMAGE=${VLLM_BASE_IMAGE:-"nvidia/cuda:${CUDA_VERSION}-cudnn-devel-${BASE_FLAVOR}"} +export VLLM_IMAGE_TAG=${VLLM_IMAGE_TAG:-"vllm-cuda${CUDA_VERSION}-ubi${UBI_VERSION}"} + +# Torch family components: wheels only (nightly index). No source build fallbacks. +export INSTALL_TORCHVISION=${INSTALL_TORCHVISION:-1} +export INSTALL_TORCHAUDIO=${INSTALL_TORCHAUDIO:-1} + +# FFMPEG optional enforcement for torchaudio features +# Set to 1 to enable RPM Fusion repos and install ffmpeg/ffmpeg-devel; build will fail if unavailable. 
+# Set to 0 to attempt best-effort install and fallback to building torchaudio without FFMPEG when headers are missing.
+#export REQUIRE_FFMPEG=${REQUIRE_FFMPEG:-0}
+export REQUIRE_FFMPEG=${REQUIRE_FFMPEG:-1}
\ No newline at end of file
diff --git a/extras/configs/build.yaml b/extras/configs/build.yaml
new file mode 100644
index 000000000000..d90e66a116c8
--- /dev/null
+++ b/extras/configs/build.yaml
@@ -0,0 +1,15 @@
+cuda:
+  version: "13.0.0"
+  tag: "latest"
+  base_flavor: "rockylinux9"
+ubi:
+  version: "9"
+  tag: "latest"
+python:
+  version: "3.12"
+  tag: "latest"
+vllm:
+  tag: main
+arch:
+  torch_cuda_arch_list: "8.0 8.6 8.9 9.0 12.0 13.0"
+  cuda_archs: "80;86;89;90;120"
diff --git a/extras/patches/0001-cumem-alloc-env-fallback.diff b/extras/patches/0001-cumem-alloc-env-fallback.diff
new file mode 100644
index 000000000000..c2a322024961
--- /dev/null
+++ b/extras/patches/0001-cumem-alloc-env-fallback.diff
@@ -0,0 +1,14 @@
+diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
+--- a/vllm/device_allocator/cumem.py
++++ b/vllm/device_allocator/cumem.py
+@@ -140,7 +140,9 @@ class CuMemAllocator:
+         return CuMemAllocator.instance
+ 
+     def __init__(self):
+-        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
++        # Prefer new env var; fall back to deprecated one for compatibility
++        conf = os.environ.get("PYTORCH_ALLOC_CONF",
++                              os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""))
+         assert "expandable_segments:True" not in conf, \
+             ("Expandable segments are not compatible with memory pool. "
+              "Please track https://github.com/pytorch/pytorch/issues/147851 "
diff --git a/extras/patches/0002-cub-reduce-to-sum-cuda13.diff b/extras/patches/0002-cub-reduce-to-sum-cuda13.diff
new file mode 100644
index 000000000000..3938f6e880db
--- /dev/null
+++ b/extras/patches/0002-cub-reduce-to-sum-cuda13.diff
@@ -0,0 +1,59 @@
+diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu
+--- a/csrc/layernorm_kernels.cu
++++ b/csrc/layernorm_kernels.cu
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ variance = BlockReduce(reduceStore).Sum(variance);
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ variance = BlockReduce(reduceStore).Sum(variance);
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ variance = BlockReduce(reduceStore).Sum(variance);
+
+diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu
+--- a/csrc/layernorm_quant_kernels.cu
++++ b/csrc/layernorm_quant_kernels.cu
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ variance = BlockReduce(reduceStore).Sum(variance);
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ variance = BlockReduce(reduceStore).Sum(variance);
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ variance = BlockReduce(reduceStore).Sum(variance);
+
+diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh
+--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
++++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ ss = BlockReduce(reduceStore).Sum(ss);
+@@
+ using BlockReduce = cub::BlockReduce<float, 1024>;
+ __shared__ typename BlockReduce::TempStorage reduceStore;
+- ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
++ // CUDA 13's CUB/CCCL may not provide cub::Sum in this context; use Sum()
++ ss = BlockReduce(reduceStore).Sum(ss);
\ No newline at end of file
diff --git a/extras/patches/README.md b/extras/patches/README.md
new file mode 100644
index 000000000000..ff4f662c4588
--- /dev/null
+++ b/extras/patches/README.md
@@ -0,0 +1,5 @@
+# Patches and plugins scaffolding
+
+- Place unified diffs (*.diff) here.
+- Use `apply_patches.sh` to apply them before building.
+- Optionally, add Python plugins under `plugin/` and load dynamically at runtime.
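For reference, a typical manual invocation of the patch helper from the repository root (inside the container the entrypoint already does this via `apply-vllm-patches`; the paths below match this PR's layout):

```bash
# Manual run from the repository root (the container entrypoint normally
# does this for you via apply-vllm-patches).
cd /workspace   # or your checkout root
bash extras/patches/apply_patches.sh

# Verify the rewrite took effect on one of the touched kernels:
grep -n "BlockReduce(reduceStore)" csrc/layernorm_kernels.cu | head -3
```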
ls "$PATCH_DIR"/*.diff >/dev/null 2>&1; then + PATCH_DIR="$SCRIPT_DIR" +fi + +pushd "$ROOT_DIR" >/dev/null + +shopt -s nullglob +PATCHES=("${PATCH_DIR}"/*.diff) +shopt -u nullglob + +echo "[patches] Using ROOT_DIR=$ROOT_DIR" +echo "[patches] Scanning ${PATCH_DIR} for .diff files" +echo "[patches] Found ${#PATCHES[@]} .diff file(s) in ${PATCH_DIR}" +for pp in "${PATCHES[@]}"; do echo " - $(basename "$pp")"; done + +for p in "${PATCHES[@]}"; do + echo "[patches] Applying ${p}" + # Normalize EOL to a temp patch file + TMP_PATCH=$(mktemp /tmp/patch.XXXXXX.diff) + tr -d '\r' < "$p" > "$TMP_PATCH" 2>/dev/null || cp "$p" "$TMP_PATCH" + if git apply --check "$TMP_PATCH" 2>/dev/null; then + git apply "$TMP_PATCH" || true + continue + fi + echo "[patches] git apply check failed for $(basename "$p"); attempting fallback if known" + case "$(basename "$p")" in + 0001-cumem-alloc-env-fallback.diff) + echo "[patches] Fallback: update cumem allocator env var preference" + python - <<'PY' +import io, os +path = os.path.join('vllm','device_allocator','cumem.py') +try: + with io.open(path, 'r', encoding='utf-8', newline='') as f: + src = f.read() +except FileNotFoundError: + raise SystemExit(0) +if 'PYTORCH_ALLOC_CONF' in src: + print('[patches] cumem already prefers PYTORCH_ALLOC_CONF; skipping') + raise SystemExit(0) +needle = 'conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")' +if needle in src: + new = src.replace(needle, + 'conf = os.environ.get("PYTORCH_ALLOC_CONF",\n' + ' os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""))') + with io.open(path, 'w', encoding='utf-8', newline='\n') as f: + f.write(new) + print('[patches] Applied cumem env var fallback edit') +else: + print('[patches] cumem pattern not found; skipping') +PY + ;; + 0002-cub-reduce-to-sum-cuda13.diff) + echo "[patches] Fallback will be handled by the post-pass rewrite" + ;; + *) + echo "[patches] Unknown patch; skipping fallback" + ;; + esac +done + +echo "[patches] Post-pass: normalize CUB to Reduce(expr, cub::Op()) across all csrc" +python - <<'PY' +import io, os, re + +files = [] +for root, _, names in os.walk('csrc'): + for n in names: + if n.endswith(('.cu', '.cuh')): + files.append(os.path.join(root, n)) + +# Patterns: +# 1) Convert convenience methods to Reduce with functor: BlockReduce(...).Max(expr) -> BlockReduce(...).Reduce(expr, cub::Max()) +pat_method = re.compile(r"(BlockReduce\([^)]*\))\.(?PSum|Max|Min)\(\s*(?P[^)]+?)\s*\)") + +# 2) Ensure functor form uses parentheses not braces (cub::Op{} -> cub::Op()) +pat_functor_braces = re.compile(r"(BlockReduce\([^)]*\)\.Reduce\(\s*[^,]+,\s*cub::(Sum|Max|Min))\{\}(\s*(?:,[^)]*)?\))") + +changed_any = False +for path in files: + try: + with io.open(path, 'r', encoding='utf-8', newline='') as f: + src = f.read() + except FileNotFoundError: + continue + # Method -> Reduce(functor) + def repl_method(m): + receiver = m.group(1) + op = m.group('op') + expr = m.group('expr').strip() + return f"{receiver}.Reduce({expr}, cub::{op}())" + new_src = pat_method.sub(repl_method, src) + # Braces -> Parens + new_src = pat_functor_braces.sub(r"\1()\3", new_src) + if new_src != src: + with io.open(path, 'w', encoding='utf-8', newline='\n') as f: + f.write(new_src) + print(f"[patches] Normalized CUB Reduce in {path}") + changed_any = True +if not changed_any: + print('[patches] Post-pass: no changes (already applied)') +PY + +popd >/dev/null + +echo "[patches] Done." 
diff --git a/extras/podman/Containerfile b/extras/podman/Containerfile new file mode 100644 index 000000000000..d42bef4b344e --- /dev/null +++ b/extras/podman/Containerfile @@ -0,0 +1,11 @@ +# syntax=docker/dockerfile:1.7-labs + +# Delegator Containerfile. +# Build using the canonical Dockerfile in extras/ to avoid duplication. + +FROM scratch as noop + +# Usage: +# podman build -f extras/Dockerfile -t vllm-dev:latest . +# or from this folder (wrapper script does this for you): +# bash build.sh diff --git a/extras/podman/README.md b/extras/podman/README.md new file mode 100644 index 000000000000..620398fc7895 --- /dev/null +++ b/extras/podman/README.md @@ -0,0 +1,30 @@ +# Podman helpers for vLLM + +These wrappers build and run a CUDA 13 dev container with PyTorch nightlies. + +Key features + +- Windows/WSL and Linux support (PowerShell and bash launchers) +- Auto-apply patches on container start (CRLF-safe, idempotent) +- CUDA arch policy aligned with CUDA 13 (no SM70/SM75) +- Named volume mounting for faster builds (`/opt/work`) + +Launchers + +- Windows: `extras/podman/run.ps1` +- Linux/macOS: `extras/podman/run.sh` + +Common options + +- Build: `-Build` (ps1) / `--build` (sh) +- GPU check: `-GPUCheck` / `--gpu-check` +- Setup (editable install): `-Setup` / `--setup` +- Work volume: `-WorkVolume NAME` / `--work-volume NAME` +- Progress: `-Progress` / `--progress` +- Mirror sources: `-Mirror` / `--mirror` + +Notes + +- Scripts normalize CRLF by running a temp copy to avoid chmod/sed on Windows mounts. +- CUDA arch defaults can be changed in `extras/configs/build.env`. +- The entrypoint is `apply-patches-then-exec.sh`, which runs patching before your command. diff --git a/extras/podman/build.sh b/extras/podman/build.sh new file mode 100644 index 000000000000..f5aefa1b70b9 --- /dev/null +++ b/extras/podman/build.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Why: Back-compat wrapper that sources central config and builds using the canonical Dockerfile. + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +ROOT_DIR=$(cd -- "${SCRIPT_DIR}/../.." &>/dev/null && pwd) +CONFIG_DIR="${SCRIPT_DIR}/../configs" + +# shellcheck source=../configs/build.env +if [ -f "${CONFIG_DIR}/build.env" ]; then + # shellcheck disable=SC1091 + source "${CONFIG_DIR}/build.env" +fi + +CUDA_VERSION=${CUDA_VERSION:-13.0.0} +UBI_VERSION=${UBI_VERSION:-9} +VLLM_IMAGE_TAG=${VLLM_IMAGE_TAG:-"vllm-cuda${CUDA_VERSION}-ubi${UBI_VERSION}"} + +CONTEXT="${ROOT_DIR}" +DOCKERFILE_REL="extras/Dockerfile" + +echo "[podman/build] Building image ${VLLM_IMAGE_TAG} with CUDA=${CUDA_VERSION}, UBI=${UBI_VERSION}" + +podman build \ + --build-arg CUDA_VERSION="${CUDA_VERSION}" \ + --build-arg UBI_VERSION="${UBI_VERSION}" \ + -t "${VLLM_IMAGE_TAG}" \ + -f "${DOCKERFILE_REL}" \ + "${CONTEXT}" + +echo "[podman/build] Done -> ${VLLM_IMAGE_TAG}" diff --git a/extras/podman/dev-setup.sh b/extras/podman/dev-setup.sh new file mode 100644 index 000000000000..abd67da41463 --- /dev/null +++ b/extras/podman/dev-setup.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# Robust setup entrypoint: prefer extras/dev-setup.sh, +# otherwise use the image-provided /home/vllmuser/setup_vllm_dev.sh. +set -euo pipefail + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}" )" &>/dev/null && pwd) +EXTRAS_DIR=$(cd -- "${SCRIPT_DIR}/.." 
&>/dev/null && pwd) + +try_exec() { + local target="$1" + if [[ -f "$target" ]]; then + # Normalize CRLF and avoid chmod on mounted FS + local tmp + tmp="$(mktemp /tmp/dev-setup-target.XXXX.sh)" + tr -d '\r' < "$target" > "$tmp" 2>/dev/null || cp "$target" "$tmp" + chmod +x "$tmp" 2>/dev/null || true + exec "$tmp" "$@" + fi +} + +# 1) Current canonical path +if [[ -f "${EXTRAS_DIR}/dev-setup.sh" ]]; then + try_exec "${EXTRAS_DIR}/dev-setup.sh" "$@" +fi + +# 2) Fallback: perform a minimal editable install inline (avoid chmod on /tmp) +echo "πŸ”§ Setting up vLLM (inline fallback)..." +cd /workspace + +# Ensure patches applied before building +if command -v apply-vllm-patches >/dev/null 2>&1; then + apply-vllm-patches || true +fi + +# Prefer /opt/work/tmp (mounted volume) if available, else /tmp +if [[ -d /opt/work ]]; then + export TMPDIR=/opt/work/tmp +else + export TMPDIR=/tmp +fi +mkdir -p "$TMPDIR" || true + +# Build env knobs +export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:-4} +export VLLM_INSTALL_PUNICA_KERNELS=${VLLM_INSTALL_PUNICA_KERNELS:-0} +export MAX_JOBS=${MAX_JOBS:-4} +# CUDA 13 toolchain dropped SM70/75; ensure we don't pass them to nvcc +export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST:-"8.0 8.6 8.9 9.0 12.0 13.0"} +export CUDAARCHS=${CUDAARCHS:-"80;86;89;90;120"} + +# Install Python deps from repo (torch stack already in image) +if [[ -f requirements/common.txt ]]; then + pip install -r requirements/common.txt || true +fi + +# Avoid slow git describe during setuptools_scm by providing a pretend version +export SETUPTOOLS_SCM_PRETEND_VERSION=${SETUPTOOLS_SCM_PRETEND_VERSION:-0+local} + +FETCHCONTENT_BASE_DIR="$TMPDIR/deps" pip install -e . --no-deps --no-build-isolation --verbose +echo "βœ… vLLM installed in editable mode (fallback)!" +python - <<'PY' +import vllm +print("vLLM version:", getattr(vllm, "__version__", "unknown")) +PY diff --git a/extras/podman/entrypoint/apply-patches-then-exec.sh b/extras/podman/entrypoint/apply-patches-then-exec.sh new file mode 100644 index 000000000000..9db4781c0e6a --- /dev/null +++ b/extras/podman/entrypoint/apply-patches-then-exec.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Apply repo patches if available; best-effort, normalization handled inside helper. +if command -v apply-vllm-patches >/dev/null 2>&1; then + echo "[entrypoint] applying patches..." 
+ apply-vllm-patches || true +fi + +# If first args are `bash -lc ` (single token, no spaces), normalize CRLF then exec +if [[ "${1-}" == "bash" && "${2-}" == "-lc" ]]; then + arg3="${3-}" + # Only handle when it's a single token path ending in .sh with no spaces or shell operators + if [[ -n "$arg3" && "$arg3" != *' '* && "$arg3" != *';'* && "$arg3" != *'&'* && "$arg3" != *'|'* && "$arg3" == *.sh ]]; then + # Resolve to filesystem path if it exists + if [[ -f "$arg3" ]]; then + SRC_SCRIPT="$arg3" + TMP_SCRIPT="$(mktemp /tmp/entry-XXXX.sh)" + tr -d '\r' < "$SRC_SCRIPT" > "$TMP_SCRIPT" 2>/dev/null || cp "$SRC_SCRIPT" "$TMP_SCRIPT" + chmod +x "$TMP_SCRIPT" 2>/dev/null || true + exec bash -lc "$TMP_SCRIPT" + fi + fi +fi + +exec "$@" diff --git a/extras/podman/run.ps1 b/extras/podman/run.ps1 new file mode 100644 index 000000000000..5a2f5d44a32e --- /dev/null +++ b/extras/podman/run.ps1 @@ -0,0 +1,264 @@ +#!/usr/bin/env pwsh +[CmdletBinding()] param( + [switch]$Build, + [switch]$Interactive, + [string]$Command = "", + [switch]$Setup, + [switch]$GPUCheck, + [switch]$Mirror, + [switch]$Recreate, + [string]$WorkVolume = "", + [string]$WorkDirHost = "", + [switch]$Progress, + [switch]$NoCache, + [switch]$Pull, + [switch]$Help +) + +if ($Help) { + Write-Host "Usage: extras/podman/run.ps1 [options]" + Write-Host " -Build Build the dev image (reads extras/configs/build.env)" + Write-Host " -Interactive Start an interactive shell" + Write-Host " -Command Run a command inside the dev container" + Write-Host " -Setup Run project setup inside the container" + Write-Host " -GPUCheck Run a CUDA/Torch sanity check" + Write-Host " -Mirror Use local mirror registries if configured" + Write-Host " -Recreate Recreate the container if running" + Write-Host " -WorkVolume Named volume to mount at /opt/work" + Write-Host " -WorkDirHost Host dir to mount at /opt/work" + Write-Host " -Progress Show progress bars in setup" + Write-Host " -NoCache Build image without using cache" + Write-Host " -Pull Always attempt to pull newer base image" + return +} + +if (-not $Interactive -and [string]::IsNullOrEmpty($Command) -and -not $GPUCheck -and -not $Setup) { $Interactive = $true } + +if (-not (Get-Command podman -ErrorAction SilentlyContinue)) { Write-Host "❌ Podman not found in PATH" -ForegroundColor Red; exit 1 } + +$ContainerName = "vllm-dev" +$ImageTag = "vllm-dev:latest" +$SourceDir = (Get-Location).Path + +Write-Host "πŸ‹ vLLM Dev Container (Podman)" -ForegroundColor Green + +if ($Build) { + Write-Host "πŸ”¨ Building image (honoring extras/configs/build.env)..." 
-ForegroundColor Yellow + $configPath = Join-Path $SourceDir "extras/configs/build.env" + $dockerfilePath = Join-Path $SourceDir "extras/Dockerfile" + $cudaVer = $null + $baseFlavor = $null + $archList = $null + $cudaArchs = $null + $requireFfmpegArg = '1' + $tvRef = $null + $taRef = $null + function Get-DockerArgDefault([string]$name, [string]$fallback) { + if (Test-Path $dockerfilePath) { + $df = Get-Content -Raw -Path $dockerfilePath + $m = [regex]::Match($df, "(?m)^\s*ARG\s+${name}\s*=\s*([^\r\n]+)") + if ($m.Success) { + return $m.Groups[1].Value.Trim() + } + } + return $fallback + } + if (Test-Path $configPath) { + $cfg = Get-Content -Raw -Path $configPath + function Get-EnvDefault([string]$name, [string]$fallback) { + # Match a line like: export NAME=VALUE + $line = [regex]::Match($cfg, "(?m)^\s*export\s+${name}\s*=\s*([^\r\n]+)") + if (-not $line.Success) { return $fallback } + $val = $line.Groups[1].Value.Trim() + # Strip wrapping quotes if present + if (($val.StartsWith('"') -and $val.EndsWith('"')) -or ($val.StartsWith("'") -and $val.EndsWith("'"))) { $val = $val.Substring(1, $val.Length-2) } + # If value is Bash-style ${NAME:-default}, extract default + if ($val.StartsWith('${') -and $val.Contains(':-')) { + $idx = $val.IndexOf(':-'); $end = $val.IndexOf('}', $idx) + if ($idx -ge 0 -and $end -gt $idx) { + $def = $val.Substring($idx+2, $end-($idx+2)).Trim() + if (($def.StartsWith('"') -and $def.EndsWith('"')) -or ($def.StartsWith("'") -and $def.EndsWith("'"))) { $def = $def.Substring(1, $def.Length-2) } + return $def + } + } + return $val + } + $cudaVer = Get-EnvDefault -name 'CUDA_VERSION' -fallback (Get-DockerArgDefault 'CUDA_VERSION' '13.0.0') + $baseFlavor = Get-EnvDefault -name 'BASE_FLAVOR' -fallback (Get-DockerArgDefault 'BASE_FLAVOR' 'rockylinux9') + $archList = Get-EnvDefault -name 'TORCH_CUDA_ARCH_LIST' -fallback (Get-DockerArgDefault 'TORCH_CUDA_ARCH_LIST' '8.0 8.6 8.9 9.0 12.0 13.0') + $cudaArchs = Get-EnvDefault -name 'CUDA_ARCHS' -fallback (Get-DockerArgDefault 'CUDA_ARCHS' '80;86;89;90;120') + # No longer used: wheels-only installs for torchvision/torchaudio + $requireFfmpeg = Get-EnvDefault -name 'REQUIRE_FFMPEG' -fallback (Get-DockerArgDefault 'REQUIRE_FFMPEG' '1') + if ($requireFfmpeg -match '^[01]$') { $requireFfmpegArg = $requireFfmpeg } else { $requireFfmpegArg = '1' } + } + # Derive PyTorch nightly index from CUDA version (e.g., 13.0 -> cu130, 12.9 -> cu129) + $torchCudaIndex = if ($cudaVer -match '^13\.') { 'cu130' } elseif ($cudaVer -match '^12\.9') { 'cu129' } else { + $parts = $cudaVer.Split('.') + if ($parts.Length -ge 2) { 'cu' + $parts[0] + $parts[1] + '0' } else { 'cu129' } + } + Write-Host ("Config: CUDA={0} BASE_FLAVOR={1} TORCH_CUDA_INDEX={2} ARCH_LIST=({3}) CUDA_ARCHS={4}" -f $cudaVer,$baseFlavor,$torchCudaIndex,$archList,$cudaArchs) -ForegroundColor DarkGray + $buildCmd = @("build","-f","extras/Dockerfile", + "--build-arg","CUDA_VERSION=$cudaVer", + "--build-arg","BASE_FLAVOR=$baseFlavor", + "--build-arg","TORCH_CUDA_INDEX=$torchCudaIndex", + "--build-arg","TORCH_CUDA_ARCH_LIST=$archList", + "--build-arg","CUDA_ARCHS=$cudaArchs", + "--build-arg","REQUIRE_FFMPEG=$requireFfmpegArg", + "-t",$ImageTag,".") + # Use cache by default; add --no-cache only when requested + if ($NoCache) { $buildCmd = @($buildCmd[0],"--no-cache") + $buildCmd[1..($buildCmd.Length-1)] } + if ($Pull) { $buildCmd = @($buildCmd[0],"--pull=always") + $buildCmd[1..($buildCmd.Length-1)] } + & podman @buildCmd + if ($LASTEXITCODE -ne 0) { Write-Host "❌ Build failed" 
-ForegroundColor Red; exit 1 } + Write-Host "βœ… Build ok" -ForegroundColor Green +} + +# Already running? +$running = podman ps --filter "name=$ContainerName" --format "{{.Names}}" 2>$null + +if ($Recreate -and $running -eq $ContainerName) { + Write-Host "♻️ Removing existing container '$ContainerName'" -ForegroundColor Yellow + podman rm -f $ContainerName | Out-Null + $running = $null +} + +if ($running -eq $ContainerName) { + if ($GPUCheck) { + Write-Host "πŸ” GPU check (existing container)" -ForegroundColor Yellow + $cmd = @' +source /home/vllmuser/venv/bin/activate && python - <<'PY' +import torch, os +print("PyTorch:", getattr(torch,"__version__","n/a")) +print("CUDA:", torch.cuda.is_available()) +print("Devices:", torch.cuda.device_count() if torch.cuda.is_available() else 0) +print("LD_LIBRARY_PATH:", os.environ.get("LD_LIBRARY_PATH")) +if torch.cuda.is_available(): + try: + print("GPU 0:", torch.cuda.get_device_name(0)) + except Exception as e: + print("GPU name error:", e) +PY +nvidia-smi || true +'@ + $cmd = "export NVIDIA_VISIBLE_DEVICES=all; " + $cmd + podman exec $ContainerName bash -lc $cmd + exit $LASTEXITCODE + } + if ($Setup) { + Write-Host "πŸ”§ Running dev setup in existing container" -ForegroundColor Yellow + $envs = @() + if ($Mirror) { $envs += @('LOCAL_MIRROR=1') } + if ($Progress) { $envs += @('PROGRESS_WATCH=1') } + $envs += @('NVIDIA_VISIBLE_DEVICES=all') + $envStr = ($envs | ForEach-Object { "export $_;" }) -join ' ' + $cmd = "$envStr apply-vllm-patches || true; chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh" + if ($Progress) { podman exec -it $ContainerName bash -lc $cmd } else { podman exec $ContainerName bash -lc $cmd } + exit $LASTEXITCODE + } + if ($Command) { + Write-Host "πŸš€ Running command in existing container" -ForegroundColor Green + $runCmd = "source /home/vllmuser/venv/bin/activate && $Command" + podman exec $ContainerName bash -c $runCmd + exit $LASTEXITCODE + } + $resp = Read-Host "Attach to running container? [Y/n]" + if ($resp -eq "" -or $resp -match '^[Yy]$') { podman exec -it $ContainerName bash; exit $LASTEXITCODE } else { exit 0 } +} + +# Ensure image exists +podman image exists $ImageTag +if ($LASTEXITCODE -ne 0) { Write-Host "❌ Image missing. Use -Build." 
-ForegroundColor Red; exit 1 } + +# Base args (no default /tmp tmpfs; can be enabled via VLLM_TMPFS_TMP_SIZE) +$runArgs = @("run","--rm","--security-opt=label=disable","--shm-size","8g","-v","${SourceDir}:/workspace:Z") +if (-not [string]::IsNullOrWhiteSpace($WorkVolume)) { $runArgs += @('-v',"${WorkVolume}:/opt/work:Z") } +elseif ($WorkDirHost -and (Test-Path $WorkDirHost)) { $runArgs += @('-v',"${WorkDirHost}:/opt/work:Z") } +$runArgs += @('-w','/workspace','--name',"$ContainerName",'--user','vllmuser','--env','ENGINE=podman') +# Use a tiny entrypoint to apply patches before executing the requested command +$runArgs += @('--entrypoint','/workspace/extras/podman/entrypoint/apply-patches-then-exec.sh') + +$tmpfsSize = [Environment]::GetEnvironmentVariable('VLLM_TMPFS_TMP_SIZE') +if (-not [string]::IsNullOrEmpty($tmpfsSize) -and $tmpfsSize -ne '0') { $runArgs += @('--tmpfs',"/tmp:size=$tmpfsSize") } + +if ($true) { # Request GPU via CDI hooks + $runArgs = @("run","--rm","--security-opt=label=disable","--device=nvidia.com/gpu=all") + $runArgs[2..($runArgs.Length-1)] +} + +# WSL GPU: map /dev/dxg and mount WSL libs +$runArgs += @('--device','/dev/dxg','-v','/usr/lib/wsl:/usr/lib/wsl:ro') +if ($Mirror) { $runArgs += @('--env','LOCAL_MIRROR=1') } +foreach ($ev in 'NVIDIA_VISIBLE_DEVICES','NVIDIA_DRIVER_CAPABILITIES','NVIDIA_REQUIRE_CUDA') { + $val = [Environment]::GetEnvironmentVariable($ev) + if ($val) { $runArgs += @('--env',"$ev=$val") } +} +$runArgs += @('--env','ENGINE=podman','--env','NVIDIA_VISIBLE_DEVICES=all','--env','NVIDIA_DRIVER_CAPABILITIES=compute,utility','--env','NVIDIA_REQUIRE_CUDA=') + +if ($GPUCheck) { + $pyDiag = @' +import json, torch, os +out = { + "torch_version": getattr(torch, "__version__", "n/a"), + "torch_cuda_version": getattr(getattr(torch, "version", None), "cuda", "n/a"), + "cuda_available": torch.cuda.is_available(), + "ld_library_path": os.environ.get("LD_LIBRARY_PATH"), +} +try: + out["device_count"] = torch.cuda.device_count() +except Exception as e: + out["device_count_error"] = str(e) +if out["cuda_available"] and out.get("device_count", 0) > 0: + try: + cap = torch.cuda.get_device_capability(0) + out["device_0"] = {"name": torch.cuda.get_device_name(0), "capability": f"sm_{cap[0]}{cap[1]}"} + except Exception as e: + out["device_0_error"] = str(e) +else: + out["diagnostics"] = ["Missing /dev/nvidia* or podman machine without GPU passthrough"] +print(json.dumps(out, indent=2)) +'@ + $pyB64 = [Convert]::ToBase64String([Text.Encoding]::UTF8.GetBytes($pyDiag)) + $gpuScript = @' +echo '=== GPU Check ===' +which nvidia-smi && nvidia-smi || echo 'nvidia-smi unavailable' +echo '--- /dev/nvidia* ---' +ls -l /dev/nvidia* 2>/dev/null || echo 'no /dev/nvidia* nodes' +echo '--- Environment (NVIDIA_*) ---' +env | grep -E '^NVIDIA_' || echo 'no NVIDIA_* env vars' +if [ "$NVIDIA_VISIBLE_DEVICES" = "void" ]; then echo 'WARN: NVIDIA_VISIBLE_DEVICES=void (no GPU mapped)'; fi +echo '--- LD_LIBRARY_PATH ---' +echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" +source /home/vllmuser/venv/bin/activate 2>/dev/null || true +echo __PY_B64__ | base64 -d > /tmp/gpucheck.py +python /tmp/gpucheck.py || true +rm -f /tmp/gpucheck.py +'@ + $gpuScript = "export NVIDIA_VISIBLE_DEVICES=all; export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/lib/wsl/drivers:`$LD_LIBRARY_PATH; " + ($gpuScript -replace '__PY_B64__', $pyB64) -replace "`r","" + $runArgs += @('--user','root', $ImageTag,'bash','-lc',$gpuScript) +} elseif ($Setup) { + # Use robust setup entrypoint that finds the right script (extras/dev-setup.sh or 
+  # Avoid in-place edits on Windows-mounted files; run a CRLF-normalized temp copy instead
+  $prefix = 'TMP_RUN=$(mktemp /tmp/run-dev-setup.XXXX.sh); tr -d "\r" < ./extras/podman/dev-setup.sh > "$TMP_RUN" || cp ./extras/podman/dev-setup.sh "$TMP_RUN"; chmod +x "$TMP_RUN" 2>/dev/null || true; apply-vllm-patches || true; '
+  $envPrefix = ''
+  if ($Mirror) { $envPrefix += 'export LOCAL_MIRROR=1; ' }
+  if ($Progress) { $envPrefix += 'export PROGRESS_WATCH=1; ' }
+  # Pass configured archs from build.env (the Dockerfile already defaults to safe values)
+  if ($archList) { $envPrefix += "export TORCH_CUDA_ARCH_LIST='$archList'; " }
+  if ($cudaArchs) { $envPrefix += "export CUDAARCHS='$cudaArchs'; " }
+  $envPrefix += 'export TMPDIR=/opt/work/tmp; export TMP=/opt/work/tmp; export TEMP=/opt/work/tmp; mkdir -p /opt/work/tmp; '
+  $setupCmd = $prefix + $envPrefix + '"$TMP_RUN"'
+  if ($Progress) { $runArgs += @('-it', $ImageTag, 'bash','-lc', $setupCmd) } else { $runArgs += @($ImageTag, 'bash','-lc', $setupCmd) }
+  Write-Host "πŸ”§ Running dev setup" -ForegroundColor Green
+} elseif ($Interactive -and -not $Command) {
+  $runArgs += @('-it',$ImageTag,'bash')
+  Write-Host "πŸš€ Interactive shell" -ForegroundColor Green
+} elseif ($Command) {
+  $runArgs += @($ImageTag,'bash','-lc',"export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/lib/wsl/drivers:`$LD_LIBRARY_PATH; source /home/vllmuser/venv/bin/activate && $Command")
+  Write-Host "πŸš€ Running command" -ForegroundColor Green
+} else {
+  $runArgs += @($ImageTag)
+}
+
+Write-Host "Command: podman $($runArgs -join ' ')" -ForegroundColor Gray
+& podman @runArgs
+
+if ($LASTEXITCODE -eq 0 -and $Interactive) { Write-Host "Exited cleanly" -ForegroundColor Green }
diff --git a/extras/podman/run.sh b/extras/podman/run.sh
new file mode 100644
index 000000000000..278113440be2
--- /dev/null
+++ b/extras/podman/run.sh
@@ -0,0 +1,188 @@
+#!/usr/bin/env bash
+# Unified lightweight vLLM dev container launcher (Podman-first, Linux/macOS)
+set -euo pipefail
+
+IMAGE_TAG="vllm-dev:latest"
+CONTAINER_NAME="vllm-dev"
+SOURCE_DIR="$(pwd)"
+BUILD_NO_CACHE=0
+BUILD_PULL=0
+BUILD=0
+SETUP=0
+GPU_CHECK=0
+MIRROR=0
+PROGRESS=0
+CMD=""
+WORK_VOLUME=""
+
+show_help() {
+  cat <<EOF
+Usage: $0 [options]
+  --build              Build the dev image before running
+  --no-cache           Build without layer cache
+  --pull               Always pull the base image when building
+  --setup              Run extras/podman/dev-setup.sh inside the container
+  --gpu-check          Print GPU diagnostics and exit
+  --command CMD        Run a one-shot command inside the container
+  --work-volume NAME   Mount the named volume at /opt/work
+  --mirror             Export LOCAL_MIRROR=1 for package installs
+  --progress           Interactive output while setup runs
+  -h, --help           Show this help
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --build) BUILD=1 ;;
+    --no-cache) BUILD_NO_CACHE=1 ;;
+    --pull) BUILD_PULL=1 ;;
+    --setup) SETUP=1 ;;
+    --gpu-check) GPU_CHECK=1 ;;
+    --command) CMD="${2:-}"; shift ;;
+    --work-volume) WORK_VOLUME="${2:-}"; shift ;;
+    --mirror) MIRROR=1 ;;
+    --progress) PROGRESS=1 ;;
+    -h|--help) show_help; exit 0 ;;
+    *) echo "Unknown option: $1" >&2; show_help; exit 1 ;;
+  esac
+  shift
+done
+
+if ! command -v podman >/dev/null 2>&1; then
+  echo "Error: podman not found in PATH" >&2
+  exit 1
+fi
+
+echo "[vLLM] Engine: podman  Image: $IMAGE_TAG  Container: $CONTAINER_NAME"
+
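+# Example invocations (illustrative; see the help text above for the flag set):
+#   ./extras/podman/run.sh --build --setup
+#   ./extras/podman/run.sh --gpu-check
+#   VLLM_TMPFS_TMP_SIZE=16g ./extras/podman/run.sh --command "python -c 'import vllm'"
+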
+if [[ $BUILD -eq 1 ]]; then
+  echo "[vLLM] Building image..."
+  BUILD_ARGS=(-f extras/Dockerfile -t "$IMAGE_TAG")
+  # Load defaults from configs/build.env if present
+  if [[ -f extras/configs/build.env ]]; then
+    # shellcheck disable=SC1091
+    . extras/configs/build.env
+    [[ -n "${CUDA_VERSION:-}" ]] && BUILD_ARGS+=(--build-arg "CUDA_VERSION=$CUDA_VERSION")
+    [[ -n "${BASE_FLAVOR:-}" ]] && BUILD_ARGS+=(--build-arg "BASE_FLAVOR=$BASE_FLAVOR")
+    # Derive torch nightly index from CUDA version when not set
+    if [[ -z "${TORCH_CUDA_INDEX:-}" ]]; then
+      if [[ "${CUDA_VERSION:-}" =~ ^13\. ]]; then TORCH_CUDA_INDEX=cu130; elif [[ "${CUDA_VERSION:-}" =~ ^12\.9 ]]; then TORCH_CUDA_INDEX=cu129; fi
+    fi
+    [[ -n "${TORCH_CUDA_INDEX:-}" ]] && BUILD_ARGS+=(--build-arg "TORCH_CUDA_INDEX=${TORCH_CUDA_INDEX}")
+    [[ -n "${TORCH_CUDA_ARCH_LIST:-}" ]] && BUILD_ARGS+=(--build-arg "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST")
+    [[ -n "${CUDA_ARCHS:-}" ]] && BUILD_ARGS+=(--build-arg "CUDA_ARCHS=$CUDA_ARCHS")
+    [[ -n "${REQUIRE_FFMPEG:-}" ]] && BUILD_ARGS+=(--build-arg "REQUIRE_FFMPEG=$REQUIRE_FFMPEG")
+  fi
+  [[ $BUILD_NO_CACHE -eq 1 ]] && BUILD_ARGS=(--no-cache "${BUILD_ARGS[@]}")
+  [[ $BUILD_PULL -eq 1 ]] && BUILD_ARGS=(--pull=always "${BUILD_ARGS[@]}")
+  if ! podman build "${BUILD_ARGS[@]}" .; then
+    echo "[vLLM] Build failed" >&2
+    exit 1
+  fi
+  echo "[vLLM] Build complete"
+fi
+
+# If container running, attach / exec
+RUNNING=$(podman ps --filter "name=${CONTAINER_NAME}" --format '{{.Names}}' 2>/dev/null || true)
+
+if [[ "$RUNNING" == "$CONTAINER_NAME" ]]; then
+  if [[ $GPU_CHECK -eq 1 ]]; then
+    echo "[vLLM] GPU check (existing container)"
+    exec podman exec "$CONTAINER_NAME" bash -lc 'source /home/vllmuser/venv/bin/activate 2>/dev/null || true; which nvidia-smi && nvidia-smi || true; python - <<PY
+import torch
+print("torch:", torch.__version__)
+print("cuda_available:", torch.cuda.is_available())
+PY'
+  fi
+  if [[ $SETUP -eq 1 ]]; then
+    echo "[vLLM] Dev setup (existing container)"
+    if [[ $PROGRESS -eq 1 ]]; then
+      exec podman exec -it "$CONTAINER_NAME" bash -lc 'chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh'
+    else
+      exec podman exec "$CONTAINER_NAME" bash -lc 'chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh'
+    fi
+  fi
+  if [[ -n "$CMD" ]]; then
+    echo "[vLLM] Exec command in existing container"
+    podman exec "$CONTAINER_NAME" bash -lc "source /home/vllmuser/venv/bin/activate 2>/dev/null || true; $CMD"
+    exit $?
+  fi
+  read -r -p "Attach to running container ${CONTAINER_NAME}? [Y/n] " RESP || true
+  if [[ -z "$RESP" || "$RESP" =~ ^[Yy]$ ]]; then
+    exec podman exec -it "$CONTAINER_NAME" bash
+  else
+    exit 0
+  fi
+fi
+
+# Ensure image exists if not building
+if [[ $BUILD -ne 1 ]]; then
+  if ! podman image exists "$IMAGE_TAG"; then
+    echo "Image $IMAGE_TAG missing. Use --build." >&2; exit 1
+  fi
+fi
+
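+# Note: --device=nvidia.com/gpu=all resolves through CDI. If no CDI spec exists
+# yet, one can usually be generated with the NVIDIA container toolkit
+# (assumption: nvidia-ctk is installed on the host):
+#   sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
+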
+# Base run args (use entrypoint to auto-apply patches before commands)
+RUN_ARGS=(run --rm --device=nvidia.com/gpu=all --security-opt=label=disable --shm-size 8g --name "$CONTAINER_NAME" -v "${SOURCE_DIR}:/workspace:Z" -w /workspace --user vllmuser --env ENGINE=podman --entrypoint /workspace/extras/podman/entrypoint/apply-patches-then-exec.sh)
+
+# Prefer named volume for /opt/work if provided
+if [[ -n "$WORK_VOLUME" ]]; then
+  RUN_ARGS+=(-v "${WORK_VOLUME}:/opt/work:Z")
+fi
+
+# Allow configurable /tmp tmpfs size via VLLM_TMPFS_TMP_SIZE (default 0=disabled)
+TMPFS_SIZE="${VLLM_TMPFS_TMP_SIZE:-0}"
+if [[ -n "$TMPFS_SIZE" && "$TMPFS_SIZE" != "0" ]]; then
+  RUN_ARGS+=(--tmpfs "/tmp:size=${TMPFS_SIZE}")
+fi
+
+# Ensure sane NVIDIA env defaults inside container to avoid 'void' and missing caps
+RUN_ARGS+=(--env "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all}" \
+  --env "NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-compute,utility}" \
+  --env "NVIDIA_REQUIRE_CUDA=")
+
+if [[ $GPU_CHECK -eq 1 ]]; then
+  GPU_SCRIPT=$'echo "=== GPU Check ==="; which nvidia-smi && nvidia-smi || echo "nvidia-smi unavailable"; echo "--- /dev/nvidia* ---"; ls -l /dev/nvidia* 2>/dev/null || echo "no /dev/nvidia* nodes"; echo "--- Environment (NVIDIA_*) ---"; env | grep -E "^NVIDIA_" || echo "no NVIDIA_* env vars"; if [ "$NVIDIA_VISIBLE_DEVICES" = "void" ]; then echo "WARN: NVIDIA_VISIBLE_DEVICES=void (no GPU mapped)"; fi; echo "--- LD_LIBRARY_PATH ---"; echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"; source /home/vllmuser/venv/bin/activate 2>/dev/null || true; python - <<PY\nimport json, torch, os\nout={\'torch_version\':getattr(torch,\'__version__\',\'n/a\'),\'torch_cuda_version\':getattr(getattr(torch,\'version\',None),\'cuda\',\'n/a\'),\'cuda_available\':torch.cuda.is_available(),\'ld_library_path\':os.environ.get(\'LD_LIBRARY_PATH\')}\ntry:\n\tout[\'device_count\']=torch.cuda.device_count()\nexcept Exception as e:\n\tout[\'device_count_error\']=str(e)\nif out[\'cuda_available\'] and out.get(\'device_count\',0)>0:\n\ttry:\n\t\tcap=torch.cuda.get_device_capability(0)\n\t\tout[\'device_0\']={\'name\':torch.cuda.get_device_name(0),\'capability\':f"sm_{cap[0]}{cap[1]}"}\n\texcept Exception as e:\n\t\tout[\'device_0_error\']=str(e)\nelse:\n\tout[\'diagnostics\']=[\'Missing /dev/nvidia* or podman machine without GPU passthrough\']\nprint(json.dumps(out,indent=2))\nPY'
+  RUN_ARGS+=("$IMAGE_TAG" bash -lc "$GPU_SCRIPT")
+elif [[ $SETUP -eq 1 ]]; then
+  # Pass arch policy from configs/build.env if present
+  if [[ -f extras/configs/build.env ]]; then
+    # shellcheck disable=SC1091
+    . extras/configs/build.env
+    [[ -n "${TORCH_CUDA_ARCH_LIST:-}" ]] && RUN_ARGS+=(--env "TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}")
+    [[ -n "${CUDA_ARCHS:-}" ]] && RUN_ARGS+=(--env "CUDAARCHS=${CUDA_ARCHS}")
+  fi
+  [[ $MIRROR -eq 1 ]] && RUN_ARGS+=(--env LOCAL_MIRROR=1)
+  [[ $PROGRESS -eq 1 ]] && RUN_ARGS+=(--env PROGRESS_WATCH=1)
+  SETUP_CMD='TMP_RUN=$(mktemp /tmp/run-dev-setup.XXXX.sh); tr -d "\r" < ./extras/podman/dev-setup.sh > "$TMP_RUN" || cp ./extras/podman/dev-setup.sh "$TMP_RUN"; chmod +x "$TMP_RUN" 2>/dev/null || true; apply-vllm-patches || true; "$TMP_RUN"'
+  if [[ $PROGRESS -eq 1 ]]; then
+    RUN_ARGS+=("-it" "$IMAGE_TAG" bash -lc "$SETUP_CMD")
+  else
+    RUN_ARGS+=("$IMAGE_TAG" bash -lc "$SETUP_CMD")
+  fi
+elif [[ -n "$CMD" ]]; then
+  RUN_ARGS+=("$IMAGE_TAG" bash -lc "source /home/vllmuser/venv/bin/activate 2>/dev/null || true; $CMD")
+else
+  RUN_ARGS+=("-it" "$IMAGE_TAG" bash)
+  echo "[vLLM] Interactive shell. Helpful inside container:"
+  echo "  ./extras/dev-setup.sh   # Build/install editable vLLM"
+  echo "  python -c 'import torch;print(torch.cuda.is_available())'"
+  echo "  python -c 'import vllm'"
+fi
+
+echo "[vLLM] Command: podman ${RUN_ARGS[*]}"
+exec podman "${RUN_ARGS[@]}"
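
A quick sanity pass for the launcher above (illustrative only; the flags follow
the option table in run.sh's help text):

    # Build the image, run the editable install, then drop into a shell
    ./extras/podman/run.sh --build --setup
    ./extras/podman/run.sh

    # One-shot command and GPU diagnostics
    ./extras/podman/run.sh --command "python -c 'import vllm'"
    ./extras/podman/run.sh --gpu-check
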
diff --git a/extras/podman/scripts/gpu_status.sh b/extras/podman/scripts/gpu_status.sh
new file mode 100644
index 000000000000..a50c78b01c03
--- /dev/null
+++ b/extras/podman/scripts/gpu_status.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Helper to show GPU/CDI status under Podman (Linux/WSL)
+
+podman info --format json | jq '.host' || podman info || true
+
+# Show CDI devices if available (CDI specs usually live under /etc/cdi)
+nvidia-ctk cdi list 2>/dev/null || ls /etc/cdi 2>/dev/null || true
diff --git a/extras/secrets/.gitignore b/extras/secrets/.gitignore
new file mode 100644
index 000000000000..d4895ec18947
--- /dev/null
+++ b/extras/secrets/.gitignore
@@ -0,0 +1,4 @@
+# Ensure this directory stays out of git; keep this file only.
+*
+!.gitignore
+!README.md
diff --git a/extras/secrets/README.md b/extras/secrets/README.md
new file mode 100644
index 000000000000..b519087af198
--- /dev/null
+++ b/extras/secrets/README.md
@@ -0,0 +1,12 @@
+# secrets directory
+
+This directory is gitignored and intended for local-only secret material such as model hub tokens.
+
+Files are expected to be simple KEY=VALUE lines that can be sourced by shell scripts.
+
+Examples:
+
+- hf-credentials.env
+- cn-modelhub-credentials.env
+
+Do NOT commit secrets.
diff --git a/extras/storage/README.md b/extras/storage/README.md
new file mode 100644
index 000000000000..d106b6d7378c
--- /dev/null
+++ b/extras/storage/README.md
@@ -0,0 +1,7 @@
+# Storage helpers
+
+Declare and manage external volumes for models and caches.
+
+- storage-config.yaml: Declarative host/container paths
+- setup_local.sh: Helper to prepare a local volume or directory
+- scripts/: Utilities for warmup, cache management, mounts
diff --git a/extras/storage/scripts/warm_cache.sh b/extras/storage/scripts/warm_cache.sh
new file mode 100644
index 000000000000..1d97b7f044f6
--- /dev/null
+++ b/extras/storage/scripts/warm_cache.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Placeholder for cache warmup logic.
+# Example usage: ./warm_cache.sh meta-llama/Llama-3-8B /models
+MODEL_ID=${1:-meta-llama/Llama-3-8B}
+TARGET=${2:-/models}
+mkdir -p "$TARGET"
+echo "(scaffold) Would warm cache for $MODEL_ID under $TARGET"
diff --git a/extras/storage/setup_local.sh b/extras/storage/setup_local.sh
new file mode 100644
index 000000000000..101826bc7396
--- /dev/null
+++ b/extras/storage/setup_local.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Prepare a local directory for models and ensure reasonable permissions.
+TARGET=${1:-/mnt/ml-models}
+mkdir -p "$TARGET"
+chmod 775 "$TARGET" || true
+
+echo "Model storage prepared at: $TARGET"
diff --git a/extras/storage/storage-config.yaml b/extras/storage/storage-config.yaml
new file mode 100644
index 000000000000..90310b572b3c
--- /dev/null
+++ b/extras/storage/storage-config.yaml
@@ -0,0 +1,4 @@
+model_volume:
+  path_host: "/mnt/ml-models"
+  path_container: "/models"
+  shared: true
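
The secrets and storage pieces are meant to compose at `podman run` time. A
minimal sketch (the file name and the HF_TOKEN key are illustrative, not a
fixed contract):

    # extras/secrets/hf-credentials.env (local only, never committed):
    #   HF_TOKEN=hf_xxx

    # Prepare the host directory declared in storage-config.yaml, then mount it
    ./extras/storage/setup_local.sh /mnt/ml-models

    set -a; source extras/secrets/hf-credentials.env; set +a
    podman run --rm -v /mnt/ml-models:/models:Z --env HF_TOKEN vllm-dev:latest \
      bash -lc 'echo "token set: ${HF_TOKEN:+yes}"'
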
diff --git a/extras/testing/README.md b/extras/testing/README.md
new file mode 100644
index 000000000000..2c64d538ac97
--- /dev/null
+++ b/extras/testing/README.md
@@ -0,0 +1,7 @@
+# Testing and benchmarking harness
+
+- Define a matrix of models/environments in `test_matrix.yaml`.
+- Run via `python extras/testing/run_tests.py --output-dir extras/testing/results/$(date +%F_%H-%M)`.
+- Store results in `results/` with timestamps for regression tracking.
+
+This scaffolding is intentionally minimal; models and benchmarks can be added incrementally.
diff --git a/extras/testing/compare_results.py b/extras/testing/compare_results.py
new file mode 100644
index 000000000000..f6c91bdd6667
--- /dev/null
+++ b/extras/testing/compare_results.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import argparse
+import json
+
+
+def load(path: str) -> dict:
+    with open(path, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("a")
+    p.add_argument("b")
+    args = p.parse_args()
+
+    result_a = load(args.a)
+    result_b = load(args.b)
+
+    # Placeholder comparison: print keys present in only one of the two results
+    diffs = sorted(set(result_a.keys()) ^ set(result_b.keys()))
+    print(json.dumps({"diff_keys": diffs}))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/extras/testing/run_tests.py b/extras/testing/run_tests.py
new file mode 100644
index 000000000000..1dcea180b6d2
--- /dev/null
+++ b/extras/testing/run_tests.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Minimal, non-destructive test harness that prints a JSON line per test.
+This is a scaffold; integrate with your local launchers or CI as needed.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from datetime import datetime, timezone
+
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("--cuda-version",
+                   default=os.getenv("CUDA_VERSION", "13.0.0"))
+    p.add_argument("--ubi-version", default=os.getenv("UBI_VERSION", "9"))
+    p.add_argument("--models", default="Example-Llama3-8B")
+    # %Y-%m-%d instead of %F: identical on Linux, and it also works on Windows
+    p.add_argument("--output-dir",
+                   default=os.path.join("extras", "testing", "results",
+                                        datetime.now().strftime("%Y-%m-%d_%H-%M")))
+    args = p.parse_args()
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    result = {
+        "ts": datetime.now(timezone.utc).isoformat(),
+        "cuda": args.cuda_version,
+        "ubi": args.ubi_version,
+        "models": args.models.split(","),
+        "status": "scaffold",
+        "notes": "Integrate with vLLM server/client to collect real metrics.",
+    }
+
+    out_path = os.path.join(args.output_dir, "scaffold.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, indent=2)
+
+    print(json.dumps({"written": out_path}))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
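
Example end-to-end use of the scaffold (paths are illustrative):

    # Produce two timestamped result sets, then diff their top-level keys
    python extras/testing/run_tests.py --output-dir extras/testing/results/a
    python extras/testing/run_tests.py --models Example-Llama3-8B \
      --output-dir extras/testing/results/b
    python extras/testing/compare_results.py \
      extras/testing/results/a/scaffold.json extras/testing/results/b/scaffold.json
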
diff --git a/extras/testing/test_matrix.yaml b/extras/testing/test_matrix.yaml
new file mode 100644
index 000000000000..fcd9e878adf9
--- /dev/null
+++ b/extras/testing/test_matrix.yaml
@@ -0,0 +1,16 @@
+models:
+  - name: Example-Llama3-8B
+    id: meta-llama/Llama-3-8B
+    chat_template: chat_templates/llama-3-instruct.jinja
+    params:
+      max_tokens: 64
+      temperature: 0.7
+
+environments:
+  - cuda: 13.0.0
+    ubi: 9
+
+benchmarks:
+  - name: inference_speed
+    input: "Summarize: vLLM extras modularization plan."
+    metrics: [latency_ms, tokens_per_sec]
diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 7963fb15c419..69f38fd0a178 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -143,7 +143,9 @@ def get_instance() -> "CuMemAllocator":
         return CuMemAllocator.instance
 
     def __init__(self):
-        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+        # Prefer new env var; fall back to deprecated one for compatibility
+        conf = os.environ.get("PYTORCH_ALLOC_CONF",
+                              os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""))
         assert "expandable_segments:True" not in conf, \
             ("Expandable segments are not compatible with memory pool. "
              "Please track https://github.com/pytorch/pytorch/issues/147851 "