add init models fall back

JaredforReal · JaredforReal · commit f9d1346d9822 · 2025-10-20T15:04:01.000+08:00
Signed-off-by: JaredforReal <w13431838023@gmail.com>
diff --git a/deploy/kubernetes/base/deployment.yaml b/deploy/kubernetes/base/deployment.yaml
@@ -77,7 +77,6 @@ spec:
           env:
             - name: HF_HUB_CACHE
               value: /tmp/hf_cache
-          # Reduced resource requirements for init container
           resources:
             requests:
               memory: "512Mi"
@@ -91,6 +90,7 @@ spec:
       containers:
         - name: semantic-router
           image: ghcr.io/vllm-project/semantic-router/extproc:latest
+          imagePullPolicy: IfNotPresent
           args: ["--secure=true"]
           securityContext:
             runAsNonRoot: false
@@ -128,14 +128,13 @@ spec:
             periodSeconds: 30
             timeoutSeconds: 10
             failureThreshold: 3
-          # Significantly reduced resource requirements for kind cluster
           resources:
             requests:
-              memory: "3Gi" # Reduced from 8Gi
-              cpu: "1" # Reduced from 2
+              memory: "3Gi"
+              cpu: "1"
             limits:
-              memory: "6Gi" # Reduced from 12Gi
-              cpu: "2" # Reduced from 4
+              memory: "6Gi"
+              cpu: "2"
       volumes:
         - name: config-volume
           configMap:
diff --git a/deploy/kubernetes/base/kustomization.yaml b/deploy/kubernetes/base/kustomization.yaml
@@ -5,6 +5,7 @@ resources:
   - ./namespace.yaml
   - ./pvc.yaml
   - ./service.yaml
+  - ./deployment.yaml
 
 configMapGenerator:
   - name: semantic-router-config
@@ -13,7 +14,3 @@ configMapGenerator:
       - ./tools_db.json
 
 namespace: vllm-semantic-router-system
-
-images:
-  - name: ghcr.io/vllm-project/semantic-router/extproc
-    newTag: latest
diff --git a/deploy/kubernetes/base/pv.example.yaml b/deploy/kubernetes/base/pv.example.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: semantic-router-models-pv
+  labels:
+    app: semantic-router
+spec:
+  capacity:
+    storage: 50Gi
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: standard
+  persistentVolumeReclaimPolicy: Retain
+  hostPath:
+    path: /tmp/hostpath-provisioner/models
+    type: DirectoryOrCreate
diff --git a/deploy/kubernetes/deployment.katan.yaml b/deploy/kubernetes/deployment.katan.yaml
diff --git a/deploy/kubernetes/overlays/core/kustomization.yaml b/deploy/kubernetes/overlays/core/kustomization.yaml
@@ -3,4 +3,3 @@ kind: Kustomization
 
 resources:
   - ../../base
-  - ../../deployment.yaml
diff --git a/deploy/kubernetes/overlays/llm-katan/kustomization.yaml b/deploy/kubernetes/overlays/llm-katan/kustomization.yaml
@@ -3,4 +3,9 @@ kind: Kustomization
 
 resources:
   - ../../base
-  - ../../deployment.katan.yaml
+
+patches:
+  - target:
+      kind: Deployment
+      name: semantic-router
+    path: patch-llm-katan.yaml
diff --git a/deploy/kubernetes/overlays/llm-katan/patch-llm-katan.yaml b/deploy/kubernetes/overlays/llm-katan/patch-llm-katan.yaml
@@ -0,0 +1,30 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: semantic-router
+spec:
+  template:
+    spec:
+      containers:
+        - name: semantic-router
+          imagePullPolicy: IfNotPresent
+        - name: llm-katan
+          image: ghcr.io/vllm-project/semantic-router/llm-katan:latest
+          imagePullPolicy: IfNotPresent
+          args:
+            - llm-katan
+            - --model
+            - /app/models/Qwen/Qwen3-0.6B
+            - --served-model-name
+            - qwen3
+            - --host
+            - 0.0.0.0
+            - --port
+            - "8002"
+          ports:
+            - containerPort: 8002
+              name: katan
+              protocol: TCP
+          volumeMounts:
+            - name: models-volume
+              mountPath: /app/models
diff --git a/website/docs/installation/kubernetes.md b/website/docs/installation/kubernetes.md
@@ -42,6 +42,38 @@ Configure the semantic router by editing `deploy/kubernetes/config.yaml`. This f
 - llm-katan: semantic-router + an llm-katan sidecar listening on 8002 and serving model name `qwen3`
   - Path: `deploy/kubernetes/overlays/llm-katan`
 
+### Repository layout (deploy/kubernetes/)
+
+```
+deploy/kubernetes/
+  base/
+    kustomization.yaml        # base kustomize: namespace, PVC, service, deployment
+    namespace.yaml            # Namespace for all resources
+    pvc.yaml                  # PVC for models (storageClass and size adjustable)
+    service.yaml              # Service exposing gRPC/metrics/HTTP ports
+    deployment.yaml           # Semantic Router Deployment (init downloads by default)
+    config.yaml               # Router config (mounted via ConfigMap)
+    tools_db.json             # Tools DB (mounted via ConfigMap)
+    pv.example.yaml           # OPTIONAL: hostPath PV example for local models
+  overlays/
+    core/
+      kustomization.yaml      # Uses only base
+    llm-katan/
+      kustomization.yaml      # Patches base to add llm-katan sidecar
+      patch-llm-katan.yaml    # Strategic-merge patch injecting sidecar
+  kustomization.yaml          # Root points to overlays/core by default
+  README.md                   # Additional notes
+  namespace.yaml, pvc.yaml, service.yaml (top-level shortcuts kept for backward compat)
+```
+
+Notes:
+
+- The base deployment includes an initContainer that downloads required models on first run.
+- If your cluster has limited egress, prefer mounting local models via a PV/PVC and skip downloads:
+  - Copy `base/pv.example.yaml` to `base/pv.yaml`, apply it, and ensure `base/pvc.yaml` is bound to that PV.
+  - Mount point remains `/app/models` in the pod.
+  - See “Network Tips” for details on hostPath PV, image mirrors, and preloading images.
+
 Important notes before you apply manifests:
 
 - `vllm_endpoints.address` must be an IP address (not hostname) reachable from inside the cluster. If your LLM backends run as K8s Services, use the ClusterIP (for example `10.96.0.10`) and set `port` accordingly. Do not include protocol or path.
@@ -69,9 +101,9 @@ To run with the llm-katan overlay instead:
 
 ```bash
 kubectl apply -k deploy/kubernetes/overlays/llm-katan
-````
+```
 
-````
+Note: The llm-katan overlay no longer references parent files directly. It uses a local patch (`deploy/kubernetes/overlays/llm-katan/patch-llm-katan.yaml`) to inject the sidecar, avoiding kustomize parent-directory restrictions.
 
 ## Step 3: Install Envoy Gateway
 
diff --git a/website/docs/troubleshooting/network-tips.md b/website/docs/troubleshooting/network-tips.md

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -3,4 +3,3 @@ kind: Kustomization` |
| `3` | `3` |  |
| `4` | `4` | `resources:` |
| `5` | `5` | `- ../../base` |
| `6` |  | `- - ../../deployment.yaml` |