diff --git a/k8s/vllm-8b-cache-1-21.yaml b/k8s/vllm-8b-cache-1-21.yaml
new file mode 100644
index 00000000..726d7d30
--- /dev/null
+++ b/k8s/vllm-8b-cache-1-21.yaml
@@ -0,0 +1,95 @@
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app: vllm-llama-app
+  name: vllm-llama-svc
+  namespace: workloads
+spec:
+  ports:
+  - port: 8000
+    protocol: TCP
+    targetPort: 8000
+  selector:
+    app: vllm-llama-app
+  type: NodePort
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app: vllm-llama-app
+  name: vllm-llama-8b
+  namespace: workloads
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-llama-app
+  template:
+    metadata:
+      labels:
+        app: vllm-llama-app
+    spec:
+      containers:
+      - image: vault.habana.ai/gaudi-docker/1.21.0/ubuntu24.04/habanalabs/pytorch-installer-2.6.0:latest
+        name: vllm-llama-openai
+        imagePullPolicy: Always
+        workingDir: /root
+        env:
+        - name: HF_HOME
+          value: /storage/huggingface
+        - name: LLM_MODEL
+          value: meta-llama/Llama-3.1-8B-Instruct
+        - name: HUGGING_FACE_HUB_TOKEN
+          value: ""
+        - name: HABANA_VISIBLE_DEVICES
+          value: all
+        - name: NUM_HPU
+          value: "1"
+        - name: OMPI_MCA_btl_vader_single_copy_mechanism
+          value: none
+        - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
+          value: "true"
+        command:
+        - "/bin/bash"
+        - "-c"
+        - |
+          if [ ! -d /storage/huggingface/vllm/llama3_8b_recipe_cache ]; then
+            export PT_HPU_RECIPE_CACHE_CONFIG='/storage/huggingface/vllm/llama3_8b_recipe_cache/',True,8192
+          else
+            cp -r /storage/huggingface/vllm/llama3_8b_recipe_cache /tmp
+            export PT_HPU_RECIPE_CACHE_CONFIG='/tmp/llama3_8b_recipe_cache/',False,8192
+          fi
+          git clone -b v0.7.2+Gaudi-1.21.0 https://github.com/HabanaAI/vllm-fork.git
+          cd vllm-fork
+          export VLLM_TARGET_DEVICE=hpu
+          export PT_HPU_WEIGHT_SHARING=0
+          export PT_HPU_MAX_COMPOUND_OP_SIZE=30
+          pip install -r requirements-hpu.txt
+          python3 setup.py install
+          VLLM_PROMPT_BS_BUCKET_MAX=256 VLLM_DECODE_BS_BUCKET_MIN=128 VLLM_DECODE_BS_BUCKET_STEP=128 VLLM_DECODE_BS_BUCKET_MAX=128 VLLM_PROMPT_SEQ_BUCKET_MAX=1024 VLLM_DECODE_BLOCK_BUCKET_MAX=1024 PT_HPU_WEIGHT_SHARING=0 PT_HPU_MAX_COMPOUND_OP_SIZE=30 PT_HPU_LAZY_MODE=1 PT_HPU_ENABLE_LAZY_COLLECTIVES=true vllm serve meta-llama/Llama-3.1-8B-Instruct -tp 1 --weights-load-device cpu --max-model-len 8192
+        ports:
+        - containerPort: 8000
+          protocol: TCP
+        resources:
+          limits:
+            habana.ai/gaudi: 1
+            cpu: 40
+            memory: 400Gi
+            hugepages-2Mi: 9800Mi
+          requests:
+            habana.ai/gaudi: 1
+            cpu: 40
+            memory: 400Gi
+            hugepages-2Mi: 9800Mi
+        volumeMounts:
+        - name: datasets
+          mountPath: /storage
+      volumes:
+      - name: datasets
+        persistentVolumeClaim:
+          claimName: shared-model
+          readOnly: false
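A minimal smoke test for this manifest, assuming kubectl access to the workloads namespace and that the container has finished building the vLLM fork and started serving (there is no readiness probe, so allow time for the in-container build; the NodePort is cluster-assigned, so port-forwarding is used here):

# Apply the manifest and wait for the deployment to come up
kubectl apply -f k8s/vllm-8b-cache-1-21.yaml
kubectl -n workloads rollout status deployment/vllm-llama-8b

# Forward the service locally and query the OpenAI-compatible completions endpoint
kubectl -n workloads port-forward svc/vllm-llama-svc 8000:8000 &
curl http://localhost:8000/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "meta-llama/Llama-3.1-8B-Instruct", "prompt": "Hello", "max_tokens": 32}'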