k8s/vllm-8b-cache-1-21.yaml (new file, 95 additions)
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: vllm-llama-app
  name: vllm-llama-svc
  namespace: workloads
spec:
  ports:
  - port: 8000
    protocol: TCP
    targetPort: 8000
  selector:
    app: vllm-llama-app
  type: NodePort

---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: vllm-llama-app
  name: vllm-llama-8b
  namespace: workloads
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-llama-app
  template:
    metadata:
      labels:
        app: vllm-llama-app
    spec:
containers:
- image: vault.habana.ai/gaudi-docker/1.21.0/ubuntu24.04/habanalabs/pytorch-installer-2.6.0:latest
name: vllm-llama-openai
imagePullPolicy: Always
workingDir: /root
env:
- name: HF_HOME
value: /storage/huggingface
- name: LLM_MODEL
value: meta-llama/Llama-3.1-8B-Instruct
- name: HUGGING_FACE_HUB_TOKEN
value: "<HF_TOken>"
- name: HABANA_VISIBLE_DEVICES
value: all
- name: NUM_HPU
value: "1"
- name: OMPI_MCA_btl_vader_single_copy_mechanism
value: none
- name: PT_HPU_ENABLE_LAZY_COLLECTIVES
value: "true"
        command:
        - "/bin/bash"
        - "-c"
        - |
          # Build a fresh recipe cache on shared storage on the first run;
          # on later runs, copy the existing cache to /tmp and point the runtime at that copy.
          if [ ! -d /storage/huggingface/vllm/llama3_8b_recipe_cache ]; then
            export PT_HPU_RECIPE_CACHE_CONFIG='/storage/huggingface/vllm/llama3_8b_recipe_cache/',True,8192
          else
            cp -r /storage/huggingface/vllm/llama3_8b_recipe_cache /tmp
            export PT_HPU_RECIPE_CACHE_CONFIG='/tmp/llama3_8b_recipe_cache/',False,8192
          fi
          # Build the Gaudi vLLM fork from source, then serve the model on port 8000.
          git clone -b v0.7.2+Gaudi-1.21.0 https://github.com/HabanaAI/vllm-fork.git
          cd vllm-fork
          export VLLM_TARGET_DEVICE=hpu
          export PT_HPU_WEIGHT_SHARING=0
          export PT_HPU_MAX_COMPOUND_OP_SIZE=30
          pip install -r requirements-hpu.txt
          python3 setup.py install
          VLLM_PROMPT_BS_BUCKET_MAX=256 VLLM_DECODE_BS_BUCKET_MIN=128 VLLM_DECODE_BS_BUCKET_STEP=128 \
            VLLM_DECODE_BS_BUCKET_MAX=128 VLLM_PROMPT_SEQ_BUCKET_MAX=1024 VLLM_DECODE_BLOCK_BUCKET_MAX=1024 \
            PT_HPU_WEIGHT_SHARING=0 PT_HPU_MAX_COMPOUND_OP_SIZE=30 PT_HPU_LAZY_MODE=1 PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
            vllm serve meta-llama/Llama-3.1-8B-Instruct -tp 1 --weights-load-device cpu --max-model-len 8192
        ports:
        - containerPort: 8000
          protocol: TCP
        resources:
          limits:
            habana.ai/gaudi: 1
            cpu: 40
            memory: 400Gi
            hugepages-2Mi: 9800Mi
          requests:
            habana.ai/gaudi: 1
            cpu: 40
            memory: 400Gi
            hugepages-2Mi: 9800Mi
        volumeMounts:
        - name: datasets
          mountPath: /storage
      volumes:
      - name: datasets
        persistentVolumeClaim:
          claimName: shared-model
          readOnly: false
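
Once the manifest is applied and the pod is Ready (the first start can take a while, since vLLM is built from source and the recipe cache is populated), the service can be smoke-tested through vLLM's OpenAI-compatible API. A minimal sketch, assuming kubectl access to the workloads namespace and the service and model names used above:

kubectl apply -f k8s/vllm-8b-cache-1-21.yaml
kubectl -n workloads port-forward svc/vllm-llama-svc 8000:8000
# In another shell: query the OpenAI-compatible chat completions endpoint.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64
      }'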