diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
index e84b680..3c21815 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -129,7 +129,7 @@ data:
- name: dshm
emptyDir:
medium: Memory
- sizeLimit: 1Gi
+ sizeLimit: 16Gi
{{ `{{- if .HFModelName }}` }}
- name: model-cache
emptyDir: {}
@@ -185,6 +185,8 @@ data:
- name: VLLM_LOGGING_LEVEL
value: {{ .Values.modelservice.vllm.logLevel }}
{{- end }}
+ - name: VLLM_IS_PREFILL
+ value: "1"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5557"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
@@ -230,7 +232,7 @@ data:
- name: dshm
emptyDir:
medium: Memory
- sizeLimit: 1Gi
+ sizeLimit: 16Gi
{{ `{{ if .HFModelName }}` }}
- name: model-cache
emptyDir: {}
diff --git a/charts/llm-d/templates/sample-application/modelservice.yaml b/charts/llm-d/templates/sample-application/modelservice.yaml
index 6ba5c22..913a086 100644
--- a/charts/llm-d/templates/sample-application/modelservice.yaml
+++ b/charts/llm-d/templates/sample-application/modelservice.yaml
@@ -30,7 +30,7 @@ spec:
{{- range .Values.sampleApplication.decode.extraArgs }}
- {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }}
{{- end }}
- resources: {{ .Values.sampleApplication.resources | toYaml | nindent 8 }}
+ resources: {{ .Values.sampleApplication.decode.resources | toYaml | nindent 8 }}
env:
{{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }}
- name: HF_TOKEN
@@ -49,7 +49,7 @@ spec:
{{- range .Values.sampleApplication.prefill.extraArgs }}
- {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }}
{{- end }}
- resources: {{ .Values.sampleApplication.resources | toYaml | nindent 8 }}
+ resources: {{ .Values.sampleApplication.prefill.resources | toYaml | nindent 8 }}
env:
{{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }}
- name: HF_TOKEN
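
With resources now read per role, prefill and decode pods can be sized independently. A hedged install-time sketch (release name and values are illustrative; note the escaped dot in the resource key, which `--set` requires):

    helm upgrade --install llm-d ./charts/llm-d \
      --set sampleApplication.prefill.resources.limits.'nvidia\.com/gpu'=1 \
      --set sampleApplication.decode.resources.limits.'nvidia\.com/gpu'=4
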
diff --git a/charts/llm-d/values.schema.json b/charts/llm-d/values.schema.json
index a1910e9..b405e0a 100644
--- a/charts/llm-d/values.schema.json
+++ b/charts/llm-d/values.schema.json
@@ -10471,6 +10471,65 @@
"description": "number of desired decode replicas",
"required": [],
"title": "replicas"
+ },
+ "resources": {
+ "description": "ResourceRequirements describes the compute resource requirements.",
+ "properties": {
+ "claims": {
+ "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.",
+ "items": {
+ "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.",
+ "properties": {
+ "name": {
+ "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
+ "type": "string"
+ },
+ "request": {
+ "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.",
+ "type": "string"
+ }
+ },
+ "required": [
+ "name"
+ ],
+ "type": "object"
+ },
+ "type": "array",
+ "x-kubernetes-list-map-keys": [
+ "name"
+ ],
+ "x-kubernetes-list-type": "map"
+ },
+ "limits": {
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "number"
+ }
+ ]
+ },
+ "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
+ "type": "object"
+ },
+ "requests": {
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "number"
+ }
+ ]
+ },
+ "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
+ "type": "object"
+ }
+ },
+ "type": "object"
}
},
"required": [],
@@ -10688,69 +10747,69 @@
"description": "number of desired prefill replicas",
"required": [],
"title": "replicas"
- }
- },
- "required": [],
- "title": "prefill",
- "type": "object"
- },
- "resources": {
- "description": "ResourceRequirements describes the compute resource requirements.",
- "properties": {
- "claims": {
- "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.",
- "items": {
- "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.",
- "properties": {
- "name": {
- "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
- "type": "string"
+ },
+ "resources": {
+ "description": "ResourceRequirements describes the compute resource requirements.",
+ "properties": {
+ "claims": {
+ "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.",
+ "items": {
+ "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.",
+ "properties": {
+ "name": {
+ "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
+ "type": "string"
+ },
+ "request": {
+ "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.",
+ "type": "string"
+ }
+ },
+ "required": [
+ "name"
+ ],
+ "type": "object"
},
- "request": {
- "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.",
- "type": "string"
- }
+ "type": "array",
+ "x-kubernetes-list-map-keys": [
+ "name"
+ ],
+ "x-kubernetes-list-type": "map"
},
- "required": [
- "name"
- ],
- "type": "object"
- },
- "type": "array",
- "x-kubernetes-list-map-keys": [
- "name"
- ],
- "x-kubernetes-list-type": "map"
- },
- "limits": {
- "additionalProperties": {
- "oneOf": [
- {
- "type": "string"
+ "limits": {
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "number"
+ }
+ ]
},
- {
- "type": "number"
- }
- ]
- },
- "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
- "type": "object"
- },
- "requests": {
- "additionalProperties": {
- "oneOf": [
- {
- "type": "string"
+ "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
+ "type": "object"
+ },
+ "requests": {
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "number"
+ }
+ ]
},
- {
- "type": "number"
- }
- ]
+ "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
+ "type": "object"
+ }
},
- "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
"type": "object"
}
},
+ "required": [],
+ "title": "prefill",
"type": "object"
}
},
diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml
index 0d9e000..d0aa57a 100644
--- a/charts/llm-d/values.yaml
+++ b/charts/llm-d/values.yaml
@@ -125,22 +125,22 @@ sampleApplication:
# -- Key within the secret under which the token is located
key: HF_TOKEN
- # @schema
- # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements
- # @schema
- # -- Modify resource limits/requests available to the pods
- # -- Resource requests/limits
- # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
- resources:
- limits:
- nvidia.com/gpu: "1"
- requests:
- nvidia.com/gpu: "1"
-
# -- InferencePool port configuration
inferencePoolPort: 8000
prefill:
+ # @schema
+ # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements
+ # @schema
+ # -- Modify resource limits/requests available to the pods
+ # -- Resource requests/limits
+ # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
+ resources:
+ limits:
+ nvidia.com/gpu: "1"
+ requests:
+ nvidia.com/gpu: "1"
+
# -- number of desired prefill replicas
replicas: 1
@@ -152,6 +152,18 @@ sampleApplication:
extraArgs: []
decode:
+ # @schema
+ # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements
+ # @schema
+ # -- Modify resource limits/requests available to the pods
+ # -- Resource requests/limits
+ # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
+ resources:
+ limits:
+ nvidia.com/gpu: "1"
+ requests:
+ nvidia.com/gpu: "1"
+
# -- number of desired decode replicas
replicas: 1
diff --git a/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
new file mode 100644
index 0000000..c3b13a1
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
@@ -0,0 +1,77 @@
+sampleApplication:
+ baseConfigMapRefName: basic-gpu-with-nixl-preset
+ model:
+ modelArtifactURI: hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
+ modelName: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+ auth:
+ hfToken:
+ name: llm-d-hf-token
+ key: HF_TOKEN
+ prefill:
+ replicas: 4
+ resources:
+ limits:
+ nvidia.com/gpu: 1
+ rdma/ib: 1
+ requests:
+ nvidia.com/gpu: 1
+ rdma/ib: 1
+ cpu: "32"
+ memory: 128Gi
+ extraArgs:
+ - "--tensor-parallel-size"
+ - "1"
+ - "--disable-log-requests"
+ - "--max-model-len"
+ - "32768"
+ - "--block-size"
+ - "128"
+ decode:
+ replicas: 1
+ resources:
+ limits:
+ nvidia.com/gpu: 4
+ rdma/ib: 1
+ requests:
+ nvidia.com/gpu: 4
+ rdma/ib: 1
+ cpu: "32"
+ memory: 128Gi
+ extraArgs:
+ - "--tensor-parallel-size"
+ - "4"
+ - "--disable-log-requests"
+ - "--max-model-len"
+ - "32768"
+ - "--block-size"
+ - "128"
+modelservice:
+ vllm:
+ image:
+ registry: docker.io
+ repository: robertgouldshaw2/vllm-nixl
+ tag: nixl-oh-debug-fixed-0.3
+ epp:
+ defaultEnvVarsOverride:
+ - name: ENABLE_KVCACHE_AWARE_SCORER
+ value: "false"
+ - name: ENABLE_PREFIX_AWARE_SCORER
+ value: "false"
+ - name: ENABLE_LOAD_AWARE_SCORER
+ value: "true"
+ - name: ENABLE_SESSION_AWARE_SCORER
+ value: "false"
+ - name: PD_ENABLED
+ value: "true"
+ - name: PD_PROMPT_LEN_THRESHOLD
+ value: "10"
+ - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
+ value: "false"
+ - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
+ value: "true"
+ - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
+ value: "false"
+ - name: PREFILL_ENABLE_SESSION_AWARE_SCORER
+ value: "false"
+redis:
+ enabled: false
\ No newline at end of file
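
This values file encodes the "4p-1d" topology: four prefill replicas at tensor-parallel size 1 and one decode replica at tensor-parallel size 4. To deploy it directly with the quickstart installer (this mirrors the `install` recipe in the Justfile below; namespace and storage flags are environment-specific):

    cd quickstart
    ./llmd-installer.sh \
      --namespace <your-namespace> \
      --storage-class shared-vast --storage-size 300Gi \
      --values-file ./examples/rob-benchmarking/4p-1d-llama-70b.yaml --skip-infra
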
diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile
new file mode 100644
index 0000000..4b2c3f0
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/Justfile
@@ -0,0 +1,40 @@
+NAMESPACE := "pete-davidson"
+
+logs POD:
+ kubectl logs -f {{POD}} -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" | grep -v ".get_finished" | grep -v ".transfer_batched"
+
+logs-stats POD:
+ kubectl logs -f {{POD}} -n {{NAMESPACE}} | grep -e "Engine 000:"
+
+get-pods:
+ kubectl get pods -n {{NAMESPACE}} -o wide
+
+hf-token:
+ kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="$HF_TOKEN" -n {{NAMESPACE}}
+
+[working-directory: '/home/rshaw/llm-d-deployer/quickstart']
+install VALUES:
+ ./llmd-installer.sh \
+ --namespace {{NAMESPACE}} \
+ --storage-class shared-vast --storage-size 300Gi \
+ --values-file ./examples/rob-benchmarking/{{VALUES}} --skip-infra
+
+[working-directory: '/home/rshaw/llm-d-deployer/quickstart']
+uninstall:
+ ./llmd-installer.sh \
+ --namespace {{NAMESPACE}} \
+ --uninstall --skip-infra
+
+gh-token GH_TOKEN:
+ kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}}
+
+# Interactive benchmark commands:
+start-bench:
+ kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml
+
+delete-bench:
+ kubectl delete pod -n {{NAMESPACE}} benchmark-interactive
+
+exec-bench:
+ kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \
+ kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash
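
A typical flow with these recipes, noting that NAMESPACE and the installer path are hardcoded above and will need adjusting for other environments:

    export HF_TOKEN=<your token>
    just hf-token
    just install 4p-1d-llama-70b.yaml
    just get-pods
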
diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote
new file mode 100644
index 0000000..4e3d64f
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/Justfile.remote
@@ -0,0 +1,71 @@
+# Use this Justfile from inside the cluster (it is copied into the benchmark pod by `just exec-bench`).
+
+MODEL := "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
+BASE_URL := "http://llm-d-inference-gateway"
+
+pull:
+ cd vllm && git pull
+
+eval:
+ lm_eval --model local-completions --tasks gsm8k \
+ --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=100,max_retries=0,tokenized_requests=False \
+ --limit 1000
+
+benchmark_one INPUT_LEN:
+ cd vllm && git fetch && git checkout 3c6fd286b40ada67bba98216ed410bb3a0d38b16 && uv pip install pybase64 && \
+ python benchmarks/benchmark_one_concurrent.py \
+ --base-url {{BASE_URL}} \
+ --model {{MODEL}} \
+ --input-len {{INPUT_LEN}} \
+ --output-len 1 \
+ --num-requests 10 \
+ --seed $(date +%s)
+
+benchmark_one_no_pd POD_IP INPUT_LEN:
+ cd vllm && git fetch && git checkout 3c6fd286b40ada67bba98216ed410bb3a0d38b16 && uv pip install pybase64 && \
+ python benchmarks/benchmark_one_concurrent.py \
+ --base-url http://{{POD_IP}}:8000 \
+ --model {{MODEL}} \
+ --input-len {{INPUT_LEN}} \
+ --output-len 1 \
+ --num-requests 10 \
+ --seed $(date +%s)
+
+benchmark CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
+ python vllm/benchmarks/benchmark_serving.py \
+ --base-url {{BASE_URL}} \
+ --model {{MODEL}} \
+ --dataset-name random \
+ --random-input-len {{INPUT_LEN}} \
+ --random-output-len {{OUTPUT_LEN}} \
+ --max-concurrency {{CONCURRENCY}} \
+ --seed $(date +%M%H%M%S) \
+ --num-prompts {{NUM_REQUESTS}} \
+ --percentile-metrics ttft,tpot,itl,e2el \
+ --metric-percentiles 90,95,99 \
+ --ignore-eos
+
+benchmark_no_pd POD_IP CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
+ python vllm/benchmarks/benchmark_serving.py \
+ --base-url http://{{POD_IP}}:8000 \
+ --model {{MODEL}} \
+ --dataset-name random \
+ --random-input-len {{INPUT_LEN}} \
+ --random-output-len {{OUTPUT_LEN}} \
+ --max-concurrency {{CONCURRENCY}} \
+ --num-prompts {{NUM_REQUESTS}} \
+ --seed $(date +%M%H%M%S) \
+ --percentile-metrics ttft,tpot,itl,e2el \
+ --metric-percentiles 90,95,99 \
+ --ignore-eos
+
+
+send_request:
+ curl -X POST {{BASE_URL}}/v1/completions \
+ -H "Content-Type: application/json" \
+ -d '{ \
+ "model": "{{MODEL}}", \
+ "prompt": "Red Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \
+ "max_tokens": 150, \
+ "temperature": 0.7 \
+ }'
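
Example invocations from inside the benchmark pod; MODEL and BASE_URL default to the values at the top of this file:

    just send_request                  # smoke test through the gateway
    just benchmark 32 256 1000 100     # concurrency 32, 256 requests, 1000 in / 100 out tokens
    just eval                          # gsm8k via lm_eval against the gateway
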
diff --git a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml
new file mode 100644
index 0000000..0a0eeb2
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml
@@ -0,0 +1,32 @@
+# benchmark-interactive-pod.yaml
+apiVersion: v1
+kind: Pod
+metadata:
+ name: benchmark-interactive
+ labels:
+ app: benchmark-interactive
+spec:
+ containers:
+ - name: benchmark-runner
+ image: "quay.io/tms/pd-disagg-benchmark:0.0.6"
+ imagePullPolicy: Always
+ stdin: true
+ tty: true
+ resources:
+ requests:
+ cpu: "16"
+ memory: "64Gi"
+ limits:
+ cpu: "16"
+ memory: "64Gi"
+ env:
+ - name: PROXY_HOST
+ value: "custom-llm-proxy-service"
+ - name: PROXY_PORT
+ value: "80"
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: hf-token-secret # created via `just hf-token` (see Justfile)
+ key: HF_TOKEN
+ restartPolicy: Never
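
The pod is driven by the Justfile recipes above; a typical session, assuming the HF token secret already exists in the namespace:

    just start-bench     # apply benchmark-interactive-pod.yaml
    just exec-bench      # copy Justfile.remote to /app/Justfile and open a shell
    # ...run `just benchmark ...` or `just eval` inside the pod...
    just delete-bench    # tear down when finished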