From 2b5f07df63be41f55943641c69ce6e321ec67373 Mon Sep 17 00:00:00 2001 From: velaraptor-runpod Date: Wed, 4 Mar 2026 16:38:40 -0600 Subject: [PATCH] feat: Update vLLM to 0.16.0; remove NUM_GPU_BLOCKS_OVERRIDE from hub defaults, since a default of 0 breaks startup --- .runpod/hub.json | 9 --------- Dockerfile | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/.runpod/hub.json b/.runpod/hub.json index ece04835..a45aa552 100644 --- a/.runpod/hub.json +++ b/.runpod/hub.json @@ -280,15 +280,6 @@ "advanced": true } }, - { - "key": "NUM_GPU_BLOCKS_OVERRIDE", - "input": { - "name": "Num GPU Blocks Override", - "type": "number", - "description": "If specified, ignore GPU profiling result and use this number of GPU blocks.", - "advanced": true - } - }, { "key": "MAX_NUM_BATCHED_TOKENS", "input": { diff --git a/Dockerfile b/Dockerfile index d4cbeffd..6bb5c465 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ RUN ldconfig /usr/local/cuda-12.9/compat/ # Install vLLM with FlashInfer - use CUDA 12.8 PyTorch wheels (compatible with vLLM 0.15.1) RUN python3 -m pip install --upgrade pip && \ - python3 -m pip install "vllm[flashinfer]==0.15.1" --extra-index-url https://download.pytorch.org/whl/cu129 + python3 -m pip install "vllm[flashinfer]==0.16.0" --extra-index-url https://download.pytorch.org/whl/cu129