@@ -54,6 +54,15 @@ data:
- "--port=8000"
- "--vllm-port=8001"
- "--connector=nixlv2"
env:
- name: OTEL_TRACING_ENABLED
value: '{{ if and .Values.tracing.enabled .Values.tracing.components.routingProxy }}"true"{{ else }}"false"{{ end }}'
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: '{{ .Values.tracing.otelCollectorEndpoint }}'
- name: OTEL_SERVICE_NAME
value: "llm-d-routing-sidecar"
- name: OTEL_SAMPLING_RATE
value: '{{ .Values.tracing.samplingRate }}'
ports:
- containerPort: 8000
protocol: TCP
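For reference, a sketch of how the block above renders after Helm templating, assuming `tracing.enabled` and `tracing.components.routingProxy` are both true and the endpoint and sampling rate are left at the chart defaults introduced later in this diff:

env:
  - name: OTEL_TRACING_ENABLED
    value: "true"
  - name: OTEL_EXPORTER_OTLP_ENDPOINT
    value: "http://otel-collector:4317"
  - name: OTEL_SERVICE_NAME
    value: "llm-d-routing-sidecar"
  - name: OTEL_SAMPLING_RATE
    value: "0.1"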
64 changes: 64 additions & 0 deletions charts/llm-d/values.schema.json
@@ -6867,6 +6867,70 @@
"required": [],
"title": "nameOverride"
},
"tracing": {
"additionalProperties": false,
"default": {
"enabled": false
},
"description": "Distributed tracing configuration for llm-d components",
"properties": {
"alwaysPropagateContext": {
"default": true,
"description": "Always propagate trace context even when tracing is disabled",
"type": "boolean"
},
"apiToken": {
"default": "",
"description": "API token for trace export (if required by collector)",
"type": "string"
},
"components": {
"additionalProperties": false,
"description": "Per-component tracing configuration",
"properties": {
"eppInferenceScheduler": {
"default": true,
"description": "Enable tracing for EPP inference scheduler (includes kv-cache-manager)",
"type": "boolean"
},
"inferenceGateway": {
"default": true,
"description": "Enable tracing for inference gateway",
"type": "boolean"
},
"routingProxy": {
"default": true,
"description": "Enable tracing for routing proxy (llm-d-routing-sidecar)",
"type": "boolean"
},
"vllm": {
"default": true,
"description": "Enable tracing for vLLM instances",
"type": "boolean"
}
},
"type": "object"
},
"enabled": {
"default": false,
"description": "Global tracing enablement (can be overridden per component)",
"type": "boolean"
},
"otelCollectorEndpoint": {
"default": "http://otel-collector:4317",
"description": "OpenTelemetry collector endpoint",
"type": "string"
},
"samplingRate": {
"default": 0.1,
"description": "Sampling rate for traces (0.0 to 1.0)",
"type": "number"
}
},
"required": [],
"title": "tracing",
"type": "object"
},
"redis": {
"$schema": "http://json-schema.org/schema#",
"properties": {
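Because the schema sets `additionalProperties: false`, misspelled keys under `tracing` fail validation instead of being silently ignored. A hypothetical user values file consistent with the schema above (the collector Service name is illustrative):

tracing:
  enabled: true
  otelCollectorEndpoint: "http://my-collector.observability.svc:4317"
  samplingRate: 0.25
  components:
    vllm: false   # trace everything except the vLLM instances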
37 changes: 37 additions & 0 deletions charts/llm-d/values.yaml
@@ -607,6 +607,14 @@ modelservice:
value: "false"
- name: PREFILL_SESSION_AWARE_SCORER_WEIGHT
value: "1"
- name: OTEL_TRACING_ENABLED
value: '{{ if and .Values.tracing.enabled .Values.tracing.components.eppInferenceScheduler }}"true"{{ else }}"false"{{ end }}'
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: '{{ .Values.tracing.otelCollectorEndpoint }}'
- name: OTEL_SERVICE_NAME
value: "llm-d-kv-cache-manager"
- name: OTEL_SAMPLING_RATE
value: '{{ .Values.tracing.samplingRate }}'

# @schema
# items:
@@ -927,6 +935,35 @@ modelservice:
# -- Enable the creation of RBAC resources
create: true

# -- Distributed tracing configuration for llm-d components
# @default -- Tracing disabled by default
tracing:
# -- Global tracing enablement (can be overridden per component)
enabled: false

# -- OpenTelemetry collector endpoint
otelCollectorEndpoint: "http://otel-collector:4317"

# -- API token for trace export (if required by collector)
apiToken: ""

# -- Sampling rate for traces (0.0 to 1.0)
samplingRate: 0.1

# -- Always propagate trace context even when tracing is disabled
alwaysPropagateContext: true

# -- Per-component tracing configuration
components:
# -- Enable tracing for EPP inference scheduler (includes kv-cache-manager)
eppInferenceScheduler: true

# -- Enable tracing for inference gateway
inferenceGateway: true

# -- Enable tracing for routing proxy (llm-d-routing-sidecar)
routingProxy: true

# -- Enable tracing for vLLM instances
vllm: true

# @schema
# $ref: https://raw.githubusercontent.com/bitnami/charts/refs/tags/redis/20.13.4/bitnami/redis/values.schema.json
# @schema
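Note the gating logic in the env templates: each component exports traces only when the global `tracing.enabled` flag and its own `tracing.components.*` flag are both true (`{{ if and ... }}`). A sketch of selective disablement:

tracing:
  enabled: true
  components:
    eppInferenceScheduler: true   # OTEL_TRACING_ENABLED renders to "true"
    routingProxy: false           # OTEL_TRACING_ENABLED renders to "false"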
83 changes: 83 additions & 0 deletions quickstart/examples/choose-adventure.yaml
@@ -0,0 +1,83 @@
# Tested on AWS g6.12xlarge, minikube setup
# ./llmd-installer.sh --minikube --values-file examples/choose-adventure.yaml \

sampleApplication:
baseConfigMapRefName: basic-gpu-with-nixl-and-redis-lookup-preset
model:
modelArtifactURI: hf://meta-llama/Llama-3.2-3B-Instruct
modelName: "meta-llama/Llama-3.2-3B-Instruct"
resources:
limits:
nvidia.com/gpu: 2
requests:
nvidia.com/gpu: 2
prefill:
replicas: 1
extraArgs:
- "--tensor-parallel-size"
- "2"
- "--distributed-executor-backend"
- "mp"
- "--max-model-len"
- "20000"
- '--enable-auto-tool-choice'
- '--tool-call-parser'
- llama3_json
- '--chat-template'
- /workspace/vllm/examples/tool_chat_template_llama3.2_json.jinja
decode:
replicas: 1
extraArgs:
- "--tensor-parallel-size"
- "2"
- "--distributed-executor-backend"
- "mp"
- '--enable-auto-tool-choice'
- '--tool-call-parser'
- llama3_json
- '--chat-template'
- /workspace/vllm/examples/tool_chat_template_llama3.2_json.jinja
- "--max-model-len"
- "20000"
redis:
enabled: true
modelservice:
routingProxy:
image:
registry: quay.io
repository: sallyom/llm-d-routing-sidecar
tag: tracing-dev
epp:
image:
registry: quay.io
repository: sallyom/llm-d-inference-scheduler
tag: tracing-dev-1
defaultEnvVarsOverride:
- name: ENABLE_KVCACHE_AWARE_SCORER
value: "true"
- name: ENABLE_PREFIX_AWARE_SCORER
value: "true"
- name: ENABLE_LOAD_AWARE_SCORER
value: "true"
- name: ENABLE_SESSION_AWARE_SCORER
value: "true"
- name: PD_ENABLED
value: "true"
- name: PD_PROMPT_LEN_THRESHOLD
value: "10"
- name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
value: "true"
- name: PREFILL_ENABLE_LOAD_AWARE_SCORER
value: "true"
- name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
value: "true"
- name: PREFILL_ENABLE_SESSION_AWARE_SCORER
value: "true"
tracing:
enabled: true
otelCollectorEndpoint: "otel-collector-collector.tracing.svc.cluster.local:4317"
samplingRate: 0.1
alwaysPropagateContext: true
components:
eppInferenceScheduler: true
routingProxy: true
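The endpoint in this example follows the OpenTelemetry Operator's naming convention: a collector CR named `otel-collector` in the `tracing` namespace yields the Service `otel-collector-collector.tracing.svc.cluster.local`. A minimal sketch of such a collector, assuming the operator is installed; the `debug` exporter is a stand-in for a real tracing backend such as Jaeger or Tempo:

apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-collector
  namespace: tracing
spec:
  config:
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
    exporters:
      debug: {}
    service:
      pipelines:
        traces:
          receivers: [otlp]
          exporters: [debug]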