@@ -54,6 +54,15 @@ data:
- "--port=8000"
- "--vllm-port=8001"
- "--connector=nixlv2"
env:
- name: OTEL_TRACING_ENABLED
value: '{{ if and .Values.tracing.enabled .Values.tracing.components.routingProxy }}"true"{{ else }}"false"{{ end }}'
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: '{{ .Values.tracing.otelCollectorEndpoint }}'
- name: OTEL_SERVICE_NAME
value: "llm-d-routing-sidecar"
- name: OTEL_SAMPLING_RATE
value: '{{ .Values.tracing.samplingRate }}'
ports:
- containerPort: 8000
protocol: TCP
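For reference, a sketch of how the block above renders after Helm templating, assuming `tracing.enabled` and `tracing.components.routingProxy` are both true and the endpoint and sampling rate are left at the chart defaults introduced later in this diff:

env:
  - name: OTEL_TRACING_ENABLED
    value: "true"
  - name: OTEL_EXPORTER_OTLP_ENDPOINT
    value: "http://otel-collector:4317"
  - name: OTEL_SERVICE_NAME
    value: "llm-d-routing-sidecar"
  - name: OTEL_SAMPLING_RATE
    value: "0.1"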
64 changes: 64 additions & 0 deletions charts/llm-d/values.schema.json
@@ -6867,6 +6867,70 @@
"required": [],
"title": "nameOverride"
},
"tracing": {
"additionalProperties": false,
"default": {
"enabled": false
},
"description": "Distributed tracing configuration for llm-d components",
"properties": {
"alwaysPropagateContext": {
"default": true,
"description": "Always propagate trace context even when tracing is disabled",
"type": "boolean"
},
"apiToken": {
"default": "",
"description": "API token for trace export (if required by collector)",
"type": "string"
},
"components": {
"additionalProperties": false,
"description": "Per-component tracing configuration",
"properties": {
"eppInferenceScheduler": {
"default": true,
"description": "Enable tracing for EPP inference scheduler (includes kv-cache-manager)",
"type": "boolean"
},
"inferenceGateway": {
"default": true,
"description": "Enable tracing for inference gateway",
"type": "boolean"
},
"routingProxy": {
"default": true,
"description": "Enable tracing for routing proxy (llm-d-routing-sidecar)",
"type": "boolean"
},
"vllm": {
"default": true,
"description": "Enable tracing for vLLM instances",
"type": "boolean"
}
},
"type": "object"
},
"enabled": {
"default": false,
"description": "Global tracing enablement (can be overridden per component)",
"type": "boolean"
},
"otelCollectorEndpoint": {
"default": "http://otel-collector:4317",
"description": "OpenTelemetry collector endpoint",
"type": "string"
},
"samplingRate": {
"default": 0.1,
"description": "Sampling rate for traces (0.0 to 1.0)",
"type": "number"
}
},
"required": [],
"title": "tracing",
"type": "object"
},
"redis": {
"$schema": "http://json-schema.org/schema#",
"properties": {
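Because the schema sets `additionalProperties: false`, misspelled keys under `tracing` fail validation instead of being silently ignored. A hypothetical user values file consistent with the schema above (the collector Service name is illustrative):

tracing:
  enabled: true
  otelCollectorEndpoint: "http://my-collector.observability.svc:4317"
  samplingRate: 0.25
  components:
    vllm: false   # trace everything except the vLLM instances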
37 changes: 37 additions & 0 deletions charts/llm-d/values.yaml
@@ -607,6 +607,14 @@ modelservice:
value: "false"
- name: PREFILL_SESSION_AWARE_SCORER_WEIGHT
value: "1"
- name: OTEL_TRACING_ENABLED
value: '{{ if and .Values.tracing.enabled .Values.tracing.components.eppInferenceScheduler }}"true"{{ else }}"false"{{ end }}'
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: '{{ .Values.tracing.otelCollectorEndpoint }}'
- name: OTEL_SERVICE_NAME
value: "llm-d-kv-cache-manager"
- name: OTEL_SAMPLING_RATE
value: '{{ .Values.tracing.samplingRate }}'

# @schema
# items:
@@ -927,6 +935,35 @@ modelservice:
# -- Enable the creation of RBAC resources
create: true

# -- Distributed tracing configuration for llm-d components
# @default -- Tracing disabled by default
tracing:
# -- Global tracing enablement (can be overridden per component)
enabled: false

# -- OpenTelemetry collector endpoint
otelCollectorEndpoint: "http://otel-collector:4317"

# -- API token for trace export (if required by collector)
apiToken: ""

# -- Sampling rate for traces (0.0 to 1.0)
samplingRate: 0.1

# -- Always propagate trace context even when tracing is disabled
alwaysPropagateContext: true

# -- Per-component tracing configuration
components:
# -- Enable tracing for EPP inference scheduler (includes kv-cache-manager)
eppInferenceScheduler: true

# -- Enable tracing for inference gateway
inferenceGateway: true

# -- Enable tracing for routing proxy (llm-d-routing-sidecar)
routingProxy: true

# -- Enable tracing for vLLM instances
vllm: true

# @schema
# $ref: https://raw.githubusercontent.com/bitnami/charts/refs/tags/redis/20.13.4/bitnami/redis/values.schema.json
# @schema
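Note the gating logic in the env templates: each component exports traces only when the global `tracing.enabled` flag and its own `tracing.components.*` flag are both true (`{{ if and ... }}`). A sketch of selective disablement:

tracing:
  enabled: true
  components:
    eppInferenceScheduler: true   # OTEL_TRACING_ENABLED renders to "true"
    routingProxy: false           # OTEL_TRACING_ENABLED renders to "false"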
83 changes: 83 additions & 0 deletions quickstart/examples/choose-adventure.yaml
@@ -0,0 +1,83 @@
# Tested on AWS g6.12xlarge, minikube setup
# ./llmd-installer.sh --minikube --values-file examples/choose-adventure.yaml \

sampleApplication:
baseConfigMapRefName: basic-gpu-with-nixl-and-redis-lookup-preset
model:
modelArtifactURI: hf://meta-llama/Llama-3.2-3B-Instruct
modelName: "meta-llama/Llama-3.2-3B-Instruct"
resources:
limits:
nvidia.com/gpu: 2
requests:
nvidia.com/gpu: 2
prefill:
replicas: 1
extraArgs:
- "--tensor-parallel-size"
- "2"
- "--distributed-executor-backend"
- "mp"
- "--max-model-len"
- "20000"
- '--enable-auto-tool-choice'
- '--tool-call-parser'
- llama3_json
- '--chat-template'
- /workspace/vllm/examples/tool_chat_template_llama3.2_json.jinja
decode:
replicas: 1
extraArgs:
- "--tensor-parallel-size"
- "2"
- "--distributed-executor-backend"
- "mp"
- '--enable-auto-tool-choice'
- '--tool-call-parser'
- llama3_json
- '--chat-template'
- /workspace/vllm/examples/tool_chat_template_llama3.2_json.jinja
- "--max-model-len"
- "20000"
redis:
enabled: true
modelservice:
routingProxy:
image:
registry: quay.io
repository: sallyom/llm-d-routing-sidecar
tag: tracing-dev
epp:
image:
registry: quay.io
repository: sallyom/llm-d-inference-scheduler
tag: tracing-dev-1
defaultEnvVarsOverride:
- name: ENABLE_KVCACHE_AWARE_SCORER
value: "true"
- name: ENABLE_PREFIX_AWARE_SCORER
value: "true"
- name: ENABLE_LOAD_AWARE_SCORER
value: "true"
- name: ENABLE_SESSION_AWARE_SCORER
value: "true"
- name: PD_ENABLED
value: "true"
- name: PD_PROMPT_LEN_THRESHOLD
value: "10"
- name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
value: "true"
- name: PREFILL_ENABLE_LOAD_AWARE_SCORER
value: "true"
- name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
value: "true"
- name: PREFILL_ENABLE_SESSION_AWARE_SCORER
value: "true"
tracing:
enabled: true
otelCollectorEndpoint: "otel-collector-collector.tracing.svc.cluster.local:4317"
samplingRate: 0.1
alwaysPropagateContext: true
components:
eppInferenceScheduler: true
routingProxy: true
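The endpoint in this example follows the OpenTelemetry Operator's naming convention: a collector CR named `otel-collector` in the `tracing` namespace yields the Service `otel-collector-collector.tracing.svc.cluster.local`. A minimal sketch of such a collector, assuming the operator is installed; the `debug` exporter is a stand-in for a real tracing backend such as Jaeger or Tempo:

apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-collector
  namespace: tracing
spec:
  config:
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
    exporters:
      debug: {}
    service:
      pipelines:
        traces:
          receivers: [otlp]
          exporters: [debug]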