diff --git a/.gitignore b/.gitignore index b7faf40..7baf72d 100644 --- a/.gitignore +++ b/.gitignore @@ -143,6 +143,7 @@ venv/ ENV/ env.bak/ venv.bak/ +my_env/ # Spyder project settings .spyderproject @@ -205,3 +206,6 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + +#kfp +pipelines/github_rag_pipeline.yaml \ No newline at end of file diff --git a/README.md b/README.md deleted file mode 100644 index 5d3a8e7..0000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# docs-agent -Kubeflow Documentation AI Agent to power the Kubeflow Website diff --git a/manifests/inference-service.yaml b/manifests/inference-service.yaml new file mode 100644 index 0000000..ca61feb --- /dev/null +++ b/manifests/inference-service.yaml @@ -0,0 +1,38 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: llama + namespace: santhosh +spec: + predictor: + model: + modelFormat: + name: huggingface + version: "1" + runtime: llm-runtime + args: + - --model_name=llama3.1-8B + - --model_id=RedHatAI/Llama-3.1-8B-Instruct + - --backend=vllm + - --max-model-len=32768 + - --gpu-memory-utilization=0.90 + - --enable-auto-tool-choice + - --tool-call-parser=llama3_json + - --enable-tool-call-parser + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: huggingface-secret + key: token + - name: CUDA_VISIBLE_DEVICES + value: "0" + resources: + requests: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" + limits: + cpu: "6" + memory: "24Gi" + nvidia.com/gpu: "1" diff --git a/manifests/milvus-deployment.yaml b/manifests/milvus-deployment.yaml new file mode 100644 index 0000000..38c5dd9 --- /dev/null +++ b/manifests/milvus-deployment.yaml @@ -0,0 +1,256 @@ +apiVersion: v1 +kind: Pod +metadata: + annotations: + sidecar.istio.io/inject: "false" + creationTimestamp: "2025-08-03T16:17:18Z" + generateName: milvus-standalone-final-5cb655b8d6- + generation: 1 + labels: + app: milvus-standalone-final + pod-template-hash: 5cb655b8d6 + name: 
milvus-standalone-final-5cb655b8d6-6ngrn + namespace: santhosh + ownerReferences: + - apiVersion: apps/v1 + blockOwnerDeletion: true + controller: true + kind: ReplicaSet + name: milvus-standalone-final-5cb655b8d6 + uid: 7859a7a3-5ff8-41af-8e07-a533a298b141 + resourceVersion: "14225268" + uid: 804d95ff-f4e4-47a4-a7b3-d5c6edaa5042 +spec: + containers: + - command: + - milvus + - run + - standalone + env: + - name: ETCD_ENDPOINTS + value: localhost:2379 + - name: MINIO_ADDRESS + value: localhost:9000 + - name: MINIO_ACCESS_KEY_ID + value: minioadmin + - name: MINIO_SECRET_ACCESS_KEY + value: minioadmin + image: milvusdb/milvus:v2.3.4 + imagePullPolicy: IfNotPresent + name: milvus + ports: + - containerPort: 19530 + protocol: TCP + - containerPort: 9091 + protocol: TCP + readinessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: 9091 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-2hjlp + readOnly: true + - command: + - etcd + - --advertise-client-urls=http://127.0.0.1:2379 + - --listen-client-urls=http://0.0.0.0:2379 + - --data-dir=/etcd-data + image: quay.io/coreos/etcd:v3.5.0 + imagePullPolicy: IfNotPresent + name: etcd + ports: + - containerPort: 2379 + protocol: TCP + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /etcd-data + name: etcd-data + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-2hjlp + readOnly: true + - command: + - minio + - server + - /minio-data + - --console-address + - :9001 + env: + - name: MINIO_ROOT_USER + value: minioadmin + - name: MINIO_ROOT_PASSWORD + value: minioadmin + image: minio/minio:RELEASE.2023-03-20T20-16-18Z + imagePullPolicy: IfNotPresent + name: minio 
+ ports: + - containerPort: 9000 + protocol: TCP + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /minio-data + name: minio-data + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-2hjlp + readOnly: true + dnsPolicy: ClusterFirst + enableServiceLinks: true + nodeName: 10.0.10.183 + preemptionPolicy: PreemptLowerPriority + priority: 0 + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + serviceAccount: default + serviceAccountName: default + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 + volumes: + - emptyDir: {} + name: etcd-data + - emptyDir: {} + name: minio-data + - name: kube-api-access-2hjlp + projected: + defaultMode: 420 + sources: + - serviceAccountToken: + expirationSeconds: 3607 + path: token + - configMap: + items: + - key: ca.crt + path: ca.crt + name: kube-root-ca.crt + - downwardAPI: + items: + - fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + path: namespace +status: + conditions: + - lastProbeTime: null + lastTransitionTime: "2025-08-03T16:17:30Z" + status: "True" + type: PodReadyToStartContainers + - lastProbeTime: null + lastTransitionTime: "2025-08-03T16:17:18Z" + status: "True" + type: Initialized + - lastProbeTime: null + lastTransitionTime: "2025-08-03T16:18:01Z" + status: "True" + type: Ready + - lastProbeTime: null + lastTransitionTime: "2025-08-03T16:18:01Z" + status: "True" + type: ContainersReady + - lastProbeTime: null + lastTransitionTime: "2025-08-03T16:17:18Z" + status: "True" + type: PodScheduled + containerStatuses: + - containerID: cri-o://974cb9b3a881ab0f7965bd5e31621686474764d8d01550ef3710e9b2058e48a7 + image: quay.io/coreos/etcd:v3.5.0 + imageID: 
quay.io/coreos/etcd@sha256:28759af54acd6924b2191dc1a1d096e2fa2e219717a21b9d8edf89717db3631b + lastState: {} + name: etcd + ready: true + resources: {} + restartCount: 0 + started: true + state: + running: + startedAt: "2025-08-03T16:17:27Z" + user: + linux: + gid: 0 + supplementalGroups: + - 0 + uid: 0 + volumeMounts: + - mountPath: /etcd-data + name: etcd-data + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-2hjlp + readOnly: true + recursiveReadOnly: Disabled + - containerID: cri-o://b864b918e771fcb9e1fdb41baeb8f46f5024ecfd08f88679210603506682463b + image: docker.io/milvusdb/milvus:v2.3.4 + imageID: docker.io/milvusdb/milvus@sha256:efd6ef720b6ad0de62d006319996ba18504842ffaa543e3b072aeb5963305907 + lastState: {} + name: milvus + ready: true + resources: {} + restartCount: 0 + started: true + state: + running: + startedAt: "2025-08-03T16:17:25Z" + user: + linux: + gid: 0 + supplementalGroups: + - 0 + uid: 0 + volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-2hjlp + readOnly: true + recursiveReadOnly: Disabled + - containerID: cri-o://4a19f6821fa31b1d3f5db68a0b49f0df03cba496770ba4d46922cb3308ff781e + image: docker.io/minio/minio:RELEASE.2023-03-20T20-16-18Z + imageID: docker.io/minio/minio@sha256:6d770d7f255cda1f18d841ffc4365cb7e0d237f6af6a15fcdb587480cd7c3b93 + lastState: {} + name: minio + ready: true + resources: {} + restartCount: 0 + started: true + state: + running: + startedAt: "2025-08-03T16:17:29Z" + user: + linux: + gid: 0 + supplementalGroups: + - 0 + uid: 0 + volumeMounts: + - mountPath: /minio-data + name: minio-data + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-2hjlp + readOnly: true + recursiveReadOnly: Disabled + hostIP: 10.0.10.183 + hostIPs: + - ip: 10.0.10.183 + phase: Running + podIP: 10.0.10.93 + podIPs: + - ip: 10.0.10.93 + qosClass: BestEffort + startTime: "2025-08-03T16:17:18Z" diff --git 
a/manifests/serving-runtime.yaml b/manifests/serving-runtime.yaml new file mode 100644 index 0000000..5f45810 --- /dev/null +++ b/manifests/serving-runtime.yaml @@ -0,0 +1,23 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +metadata: + name: llm-runtime + namespace: santhosh +spec: + supportedModelFormats: + - name: huggingface + version: "1" + autoSelect: true + containers: + - name: kserve-container + image: kserve/huggingfaceserver:latest-gpu + command: ["python", "-m", "huggingfaceserver"] + resources: + requests: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "1" + limits: + cpu: "6" + memory: "24Gi" + nvidia.com/gpu: "1" \ No newline at end of file diff --git a/pipelines/github_rag_pipeline.yaml b/pipelines/github_rag_pipeline.yaml new file mode 100644 index 0000000..e6c8bc8 --- /dev/null +++ b/pipelines/github_rag_pipeline.yaml @@ -0,0 +1,393 @@ +# PIPELINE DEFINITION +# Name: github-rag-full-build +# Description: RAG pipeline for processing GitHub documentation +# Inputs: +# base_url: str [Default: 'https://www.kubeflow.org/docs'] +# chunk_overlap: int [Default: 100.0] +# chunk_size: int [Default: 1000.0] +# collection_name: str [Default: 'docs_rag'] +# directory_path: str [Default: 'content/en'] +# github_token: str [Default: ''] +# milvus_host: str [Default: 'milvus-standalone-final.santhosh.svc.cluster.local'] +# milvus_port: str [Default: '19530'] +# repo_name: str [Default: 'website'] +# repo_owner: str [Default: 'kubeflow'] +components: + comp-chunk-and-embed: + executorLabel: exec-chunk-and-embed + inputDefinitions: + artifacts: + github_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + parameters: + base_url: + parameterType: STRING + chunk_overlap: + parameterType: NUMBER_INTEGER + chunk_size: + parameterType: NUMBER_INTEGER + repo_name: + parameterType: STRING + outputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + 
comp-download-github-directory: + executorLabel: exec-download-github-directory + inputDefinitions: + parameters: + directory_path: + parameterType: STRING + github_token: + parameterType: STRING + repo_name: + parameterType: STRING + repo_owner: + parameterType: STRING + outputDefinitions: + artifacts: + github_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-store-milvus: + executorLabel: exec-store-milvus + inputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + parameters: + collection_name: + parameterType: STRING + milvus_host: + parameterType: STRING + milvus_port: + parameterType: STRING +deploymentSpec: + executors: + exec-chunk-and-embed: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - chunk_and_embed + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'sentence-transformers'\ + \ 'langchain' && python3 -m pip install --quiet --no-warn-script-location\ + \ 'kfp==2.14.2' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"\ + 3.9\"' && \"$0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef chunk_and_embed(\n github_data: dsl.Input[dsl.Dataset],\n\ + \ repo_name: str,\n base_url: str,\n chunk_size: int,\n chunk_overlap:\ + \ int,\n embedded_data: dsl.Output[dsl.Dataset]\n):\n import json\n\ + \ import os\n import re\n import torch\n from sentence_transformers\ + \ import SentenceTransformer\n from langchain.text_splitter 
import RecursiveCharacterTextSplitter\n\ + \n device = 'cuda' if torch.cuda.is_available() else 'cpu'\n model\ + \ = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)\n\ + \ print(f\"Model loaded on {device}\")\n\n records = []\n\n with\ + \ open(github_data.path, 'r', encoding='utf-8') as f:\n for line\ + \ in f:\n file_data = json.loads(line)\n content =\ + \ file_data['content']\n\n # AGGRESSIVE CLEANING FOR BETTER EMBEDDINGS\n\ + \n # Remove Hugo frontmatter (both --- and +++ styles)\n \ + \ content = re.sub(r'^\\s*[+\\-]{3,}.*?[+\\-]{3,}\\s*', '', content,\ + \ flags=re.DOTALL | re.MULTILINE)\n\n # Remove Hugo template\ + \ syntax\n content = re.sub(r'\\{\\{.*?\\}\\}', '', content,\ + \ flags=re.DOTALL)\n\n # Remove HTML comments and tags\n \ + \ content = re.sub(r'', '', content, flags=re.DOTALL)\n\ + \ content = re.sub(r'<[^>]+>', ' ', content)\n\n #\ + \ Remove navigation/menu artifacts\n content = re.sub(r'\\b(Get\ + \ Started|Contribute|GenAI|Home|Menu|Navigation)\\b', '', content, flags=re.IGNORECASE)\n\ + \n # Clean up URLs and links\n content = re.sub(r'https?://[^\\\ + s]+', '', content)\n content = re.sub(r'\\[([^\\]]+)\\]\\([^\\\ + )]+\\)', r'\\1', content) # Convert [text](url) to text\n\n \ + \ # Remove excessive whitespace and normalize\n content = re.sub(r'\\\ + s+', ' ', content) # Multiple spaces to single\n content = re.sub(r'\\\ + n\\s*\\n\\s*\\n+', '\\n\\n', content) # Multiple newlines to double\n \ + \ content = content.strip()\n\n # Skip files that are\ + \ too short after cleaning\n if len(content) < 50:\n \ + \ print(f\"Skipping file after cleaning: {file_data['path']} ({len(content)}\ + \ chars)\")\n continue\n\n # Build citation URL\ + \ (same as before)\n path_parts = file_data['path'].split('/')\n\ + \ if 'content/en/docs' in file_data['path']:\n \ + \ docs_index = path_parts.index('docs')\n url_path = '/'.join(path_parts[docs_index+1:])\n\ + \ url_path = os.path.splitext(url_path)[0]\n \ + \ 
citation_url = f\"{base_url}/{url_path}\"\n else:\n \ + \ citation_url = f\"{base_url}/{file_data['path']}\"\n\n \ + \ file_unique_id = f\"{repo_name}:{file_data['path']}\"\n\n \ + \ # Create splitter\n text_splitter = RecursiveCharacterTextSplitter(\n\ + \ chunk_size=chunk_size,\n chunk_overlap=chunk_overlap,\n\ + \ length_function=len,\n separators=[\"\\\ + n\\n\", \"\\n\", \". \", \" \", \"\"]\n )\n\n # Split\ + \ into chunks\n chunks = text_splitter.split_text(content)\n\n\ + \ print(f\"File: {file_data['path']} -> {len(chunks)} chunks\ + \ (avg: {sum(len(c) for c in chunks)/len(chunks):.0f} chars)\")\n\n \ + \ # Create embeddings\n for chunk_idx, chunk in enumerate(chunks):\n\ + \ embedding = model.encode(chunk).tolist()\n \ + \ records.append({\n 'file_unique_id': file_unique_id,\n\ + \ 'repo_name': repo_name,\n 'file_path':\ + \ file_data['path'],\n 'file_name': file_data['file_name'],\n\ + \ 'citation_url': citation_url[:1024],\n \ + \ 'chunk_index': chunk_idx,\n 'content_text':\ + \ chunk[:2000],\n 'embedding': embedding\n \ + \ })\n\n print(f\"Created {len(records)} total chunks\")\n\n \ + \ with open(embedded_data.path, 'w', encoding='utf-8') as f:\n for\ + \ record in records:\n f.write(json.dumps(record, ensure_ascii=False)\ + \ + '\\n')\n\n" + image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime + exec-download-github-directory: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - download_github_directory + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'requests' 'beautifulsoup4'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.14.2'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef download_github_directory(\n repo_owner: str,\n repo_name:\ + \ str,\n directory_path: str,\n github_token: str,\n github_data:\ + \ dsl.Output[dsl.Dataset]\n):\n import requests\n import json\n \ + \ import base64\n from bs4 import BeautifulSoup\n\n headers = {\"\ + Authorization\": f\"token {github_token}\"} if github_token else {}\n \ + \ api_url = f\"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{directory_path}\"\ + \n\n def get_files_recursive(url):\n files = []\n try:\n\ + \ response = requests.get(url, headers=headers)\n \ + \ response.raise_for_status()\n items = response.json()\n\n \ + \ for item in items:\n if item['type'] == 'file'\ + \ and (item['name'].endswith('.md') or item['name'].endswith('.html')):\n\ + \ file_response = requests.get(item['url'], headers=headers)\n\ + \ file_response.raise_for_status()\n \ + \ file_data = file_response.json()\n content = base64.b64decode(file_data['content']).decode('utf-8')\n\ + \n # Extract text from HTML files\n \ + \ if item['name'].endswith('.html'):\n soup = BeautifulSoup(content,\ + \ 'html.parser')\n content = soup.get_text(separator='\ + \ ', strip=True)\n\n files.append({\n \ + \ 'path': item['path'],\n 'content': content,\n\ + \ 
'file_name': item['name']\n \ + \ })\n elif item['type'] == 'dir':\n files.extend(get_files_recursive(item['url']))\n\ + \ except Exception as e:\n print(f\"Error fetching {url}:\ + \ {e}\")\n return files\n\n files = get_files_recursive(api_url)\n\ + \ print(f\"Downloaded {len(files)} files\")\n\n with open(github_data.path,\ + \ 'w', encoding='utf-8') as f:\n for file_data in files:\n \ + \ f.write(json.dumps(file_data, ensure_ascii=False) + '\\n')\n\n" + image: python:3.9 + exec-store-milvus: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - store_milvus + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'pymilvus' 'numpy'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.14.2'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef store_milvus(\n embedded_data: dsl.Input[dsl.Dataset],\n \ + \ milvus_host: str,\n milvus_port: str,\n collection_name: str\n\ + ):\n from pymilvus import connections, utility, FieldSchema, CollectionSchema,\ + \ DataType, Collection\n import json\n from datetime import datetime\n\ + \n connections.connect(\"default\", host=milvus_host, port=milvus_port)\n\ + \n # DROP existing collection to fix schema mismatch\n if utility.has_collection(collection_name):\n\ + \ utility.drop_collection(collection_name)\n print(f\"Dropped\ + \ existing collection: {collection_name}\")\n\n # Enhanced schema with\ + \ 768 
dimensions\n fields = [\n FieldSchema(name=\"id\", dtype=DataType.INT64,\ + \ is_primary=True, auto_id=True),\n FieldSchema(name=\"file_unique_id\"\ + , dtype=DataType.VARCHAR, max_length=512),\n FieldSchema(name=\"\ + repo_name\", dtype=DataType.VARCHAR, max_length=256),\n FieldSchema(name=\"\ + file_path\", dtype=DataType.VARCHAR, max_length=512),\n FieldSchema(name=\"\ + file_name\", dtype=DataType.VARCHAR, max_length=256),\n FieldSchema(name=\"\ + citation_url\", dtype=DataType.VARCHAR, max_length=1024),\n FieldSchema(name=\"\ + chunk_index\", dtype=DataType.INT64),\n FieldSchema(name=\"content_text\"\ + , dtype=DataType.VARCHAR, max_length=2000),\n FieldSchema(name=\"\ + vector\", dtype=DataType.FLOAT_VECTOR, dim=768), # Updated for all-mpnet-base-v2\n\ + \ FieldSchema(name=\"last_updated\", dtype=DataType.INT64)\n ]\n\ + \n # Create new collection with correct schema\n schema = CollectionSchema(fields,\ + \ \"RAG collection for documentation\")\n collection = Collection(collection_name,\ + \ schema)\n print(f\"Created new collection: {collection_name}\")\n\n\ + \ # Rest of your existing code remains the same...\n records = []\n\ + \ timestamp = int(datetime.now().timestamp())\n\n with open(embedded_data.path,\ + \ 'r', encoding='utf-8') as f:\n for line in f:\n record\ + \ = json.loads(line)\n records.append({\n \"file_unique_id\"\ + : record[\"file_unique_id\"],\n \"repo_name\": record[\"\ + repo_name\"],\n \"file_path\": record[\"file_path\"],\n \ + \ \"file_name\": record[\"file_name\"],\n \"\ + citation_url\": record[\"citation_url\"],\n \"chunk_index\"\ + : record[\"chunk_index\"],\n \"content_text\": record[\"\ + content_text\"],\n \"vector\": record[\"embedding\"],\n \ + \ \"last_updated\": timestamp\n })\n\n if records:\n\ + \ batch_size = 1000\n for i in range(0, len(records), batch_size):\n\ + \ batch = records[i:i + batch_size]\n collection.insert(batch)\n\ + \n collection.flush()\n\n # Create index\n index_params\ + \ = {\n \"metric_type\": 
\"COSINE\",\n \"index_type\"\ + : \"IVF_FLAT\", \n \"params\": {\"nlist\": min(1024, len(records))}\n\ + \ }\n collection.create_index(\"vector\", index_params)\n\ + \ collection.load()\n print(f\"\u2705 Inserted {len(records)}\ + \ records. Total: {collection.num_entities}\")\n\n" + image: python:3.9 +pipelineInfo: + description: RAG pipeline for processing GitHub documentation + name: github-rag-full-build +root: + dag: + tasks: + chunk-and-embed: + cachingOptions: + enableCache: true + componentRef: + name: comp-chunk-and-embed + dependentTasks: + - download-github-directory + inputs: + artifacts: + github_data: + taskOutputArtifact: + outputArtifactKey: github_data + producerTask: download-github-directory + parameters: + base_url: + componentInputParameter: base_url + chunk_overlap: + componentInputParameter: chunk_overlap + chunk_size: + componentInputParameter: chunk_size + repo_name: + componentInputParameter: repo_name + taskInfo: + name: chunk-and-embed + download-github-directory: + cachingOptions: + enableCache: true + componentRef: + name: comp-download-github-directory + inputs: + parameters: + directory_path: + componentInputParameter: directory_path + github_token: + componentInputParameter: github_token + repo_name: + componentInputParameter: repo_name + repo_owner: + componentInputParameter: repo_owner + taskInfo: + name: download-github-directory + store-milvus: + cachingOptions: + enableCache: true + componentRef: + name: comp-store-milvus + dependentTasks: + - chunk-and-embed + inputs: + artifacts: + embedded_data: + taskOutputArtifact: + outputArtifactKey: embedded_data + producerTask: chunk-and-embed + parameters: + collection_name: + componentInputParameter: collection_name + milvus_host: + componentInputParameter: milvus_host + milvus_port: + componentInputParameter: milvus_port + taskInfo: + name: store-milvus + inputDefinitions: + parameters: + base_url: + defaultValue: https://www.kubeflow.org/docs + isOptional: true + parameterType: 
STRING + chunk_overlap: + defaultValue: 100.0 + isOptional: true + parameterType: NUMBER_INTEGER + chunk_size: + defaultValue: 1000.0 + isOptional: true + parameterType: NUMBER_INTEGER + collection_name: + defaultValue: docs_rag + isOptional: true + parameterType: STRING + directory_path: + defaultValue: content/en + isOptional: true + parameterType: STRING + github_token: + defaultValue: '' + isOptional: true + parameterType: STRING + milvus_host: + defaultValue: milvus-standalone-final.santhosh.svc.cluster.local + isOptional: true + parameterType: STRING + milvus_port: + defaultValue: '19530' + isOptional: true + parameterType: STRING + repo_name: + defaultValue: website + isOptional: true + parameterType: STRING + repo_owner: + defaultValue: kubeflow + isOptional: true + parameterType: STRING +schemaVersion: 2.1.0 +sdkVersion: kfp-2.14.2 diff --git a/pipelines/kubeflow-pipeline.py b/pipelines/kubeflow-pipeline.py new file mode 100644 index 0000000..96fd83f --- /dev/null +++ b/pipelines/kubeflow-pipeline.py @@ -0,0 +1,292 @@ +import kfp +from kfp import dsl +from kfp.dsl import * +from typing import * + +@dsl.component( + base_image="python:3.9", + packages_to_install=["requests", "beautifulsoup4"] +) +def download_github_directory( + repo_owner: str, + repo_name: str, + directory_path: str, + github_token: str, + github_data: dsl.Output[dsl.Dataset] +): + import requests + import json + import base64 + from bs4 import BeautifulSoup + + headers = {"Authorization": f"token {github_token}"} if github_token else {} + api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{directory_path}" + + def get_files_recursive(url): + files = [] + try: + response = requests.get(url, headers=headers) + response.raise_for_status() + items = response.json() + + for item in items: + if item['type'] == 'file' and (item['name'].endswith('.md') or item['name'].endswith('.html')): + file_response = requests.get(item['url'], headers=headers) + 
file_response.raise_for_status() + file_data = file_response.json() + content = base64.b64decode(file_data['content']).decode('utf-8') + + # Extract text from HTML files + if item['name'].endswith('.html'): + soup = BeautifulSoup(content, 'html.parser') + content = soup.get_text(separator=' ', strip=True) + + files.append({ + 'path': item['path'], + 'content': content, + 'file_name': item['name'] + }) + elif item['type'] == 'dir': + files.extend(get_files_recursive(item['url'])) + except Exception as e: + print(f"Error fetching {url}: {e}") + return files + + files = get_files_recursive(api_url) + print(f"Downloaded {len(files)} files") + + with open(github_data.path, 'w', encoding='utf-8') as f: + for file_data in files: + f.write(json.dumps(file_data, ensure_ascii=False) + '\n') + + +@dsl.component( + base_image="pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime", + packages_to_install=["sentence-transformers", "langchain"] +) +def chunk_and_embed( + github_data: dsl.Input[dsl.Dataset], + repo_name: str, + base_url: str, + chunk_size: int, + chunk_overlap: int, + embedded_data: dsl.Output[dsl.Dataset] +): + import json + import os + import re + import torch + from sentence_transformers import SentenceTransformer + from langchain.text_splitter import RecursiveCharacterTextSplitter + + device = 'cuda' if torch.cuda.is_available() else 'cpu' + model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device) + print(f"Model loaded on {device}") + + records = [] + + with open(github_data.path, 'r', encoding='utf-8') as f: + for line in f: + file_data = json.loads(line) + content = file_data['content'] + + # AGGRESSIVE CLEANING FOR BETTER EMBEDDINGS + + # Remove Hugo frontmatter (both --- and +++ styles) + content = re.sub(r'^\s*[+\-]{3,}.*?[+\-]{3,}\s*', '', content, flags=re.DOTALL | re.MULTILINE) + + # Remove Hugo template syntax + content = re.sub(r'\{\{.*?\}\}', '', content, flags=re.DOTALL) + + # Remove HTML comments and tags + content = 
re.sub(r'', '', content, flags=re.DOTALL) + content = re.sub(r'<[^>]+>', ' ', content) + + # Remove navigation/menu artifacts + content = re.sub(r'\b(Get Started|Contribute|GenAI|Home|Menu|Navigation)\b', '', content, flags=re.IGNORECASE) + + # Clean up URLs and links + content = re.sub(r'https?://[^\s]+', '', content) + content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content) # Convert [text](url) to text + + # Remove excessive whitespace and normalize + content = re.sub(r'\s+', ' ', content) # Multiple spaces to single + content = re.sub(r'\n\s*\n\s*\n+', '\n\n', content) # Multiple newlines to double + content = content.strip() + + # Skip files that are too short after cleaning + if len(content) < 50: + print(f"Skipping file after cleaning: {file_data['path']} ({len(content)} chars)") + continue + + # Build citation URL (same as before) + path_parts = file_data['path'].split('/') + if 'content/en/docs' in file_data['path']: + docs_index = path_parts.index('docs') + url_path = '/'.join(path_parts[docs_index+1:]) + url_path = os.path.splitext(url_path)[0] + citation_url = f"{base_url}/{url_path}" + else: + citation_url = f"{base_url}/{file_data['path']}" + + file_unique_id = f"{repo_name}:{file_data['path']}" + + # Create splitter + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + length_function=len, + separators=["\n\n", "\n", ". 
", " ", ""] + ) + + # Split into chunks + chunks = text_splitter.split_text(content) + + print(f"File: {file_data['path']} -> {len(chunks)} chunks (avg: {sum(len(c) for c in chunks)/len(chunks):.0f} chars)") + + # Create embeddings + for chunk_idx, chunk in enumerate(chunks): + embedding = model.encode(chunk).tolist() + records.append({ + 'file_unique_id': file_unique_id, + 'repo_name': repo_name, + 'file_path': file_data['path'], + 'file_name': file_data['file_name'], + 'citation_url': citation_url[:1024], + 'chunk_index': chunk_idx, + 'content_text': chunk[:2000], + 'embedding': embedding + }) + + print(f"Created {len(records)} total chunks") + + with open(embedded_data.path, 'w', encoding='utf-8') as f: + for record in records: + f.write(json.dumps(record, ensure_ascii=False) + '\n') + + +@dsl.component( + base_image="python:3.9", + packages_to_install=["pymilvus", "numpy"] +) +def store_milvus( + embedded_data: dsl.Input[dsl.Dataset], + milvus_host: str, + milvus_port: str, + collection_name: str +): + from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection + import json + from datetime import datetime + + connections.connect("default", host=milvus_host, port=milvus_port) + + # DROP existing collection to fix schema mismatch + if utility.has_collection(collection_name): + utility.drop_collection(collection_name) + print(f"Dropped existing collection: {collection_name}") + + # Enhanced schema with 768 dimensions + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True), + FieldSchema(name="file_unique_id", dtype=DataType.VARCHAR, max_length=512), + FieldSchema(name="repo_name", dtype=DataType.VARCHAR, max_length=256), + FieldSchema(name="file_path", dtype=DataType.VARCHAR, max_length=512), + FieldSchema(name="file_name", dtype=DataType.VARCHAR, max_length=256), + FieldSchema(name="citation_url", dtype=DataType.VARCHAR, max_length=1024), + FieldSchema(name="chunk_index", 
dtype=DataType.INT64), + FieldSchema(name="content_text", dtype=DataType.VARCHAR, max_length=2000), + FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=768), # Updated for all-mpnet-base-v2 + FieldSchema(name="last_updated", dtype=DataType.INT64) + ] + + # Create new collection with correct schema + schema = CollectionSchema(fields, "RAG collection for documentation") + collection = Collection(collection_name, schema) + print(f"Created new collection: {collection_name}") + + # Rest of your existing code remains the same... + records = [] + timestamp = int(datetime.now().timestamp()) + + with open(embedded_data.path, 'r', encoding='utf-8') as f: + for line in f: + record = json.loads(line) + records.append({ + "file_unique_id": record["file_unique_id"], + "repo_name": record["repo_name"], + "file_path": record["file_path"], + "file_name": record["file_name"], + "citation_url": record["citation_url"], + "chunk_index": record["chunk_index"], + "content_text": record["content_text"], + "vector": record["embedding"], + "last_updated": timestamp + }) + + if records: + batch_size = 1000 + for i in range(0, len(records), batch_size): + batch = records[i:i + batch_size] + collection.insert(batch) + + collection.flush() + + # Create index + index_params = { + "metric_type": "COSINE", + "index_type": "IVF_FLAT", + "params": {"nlist": min(1024, len(records))} + } + collection.create_index("vector", index_params) + collection.load() + print(f"✅ Inserted {len(records)} records. 
Total: {collection.num_entities}") + + +@dsl.pipeline( + name="github-rag-full-build", + description="RAG pipeline for processing GitHub documentation" +) +def github_rag_pipeline( + repo_owner: str = "kubeflow", + repo_name: str = "website", + directory_path: str = "content/en", + github_token: str = "", + base_url: str = "https://www.kubeflow.org/docs", + chunk_size: int = 1000, + chunk_overlap: int = 100, + milvus_host: str = "milvus-standalone-final.santhosh.svc.cluster.local", + milvus_port: str = "19530", + collection_name: str = "docs_rag" +): + # Download GitHub directory + download_task = download_github_directory( + repo_owner=repo_owner, + repo_name=repo_name, + directory_path=directory_path, + github_token=github_token + ) + + # Chunk and embed the content + chunk_task = chunk_and_embed( + github_data=download_task.outputs["github_data"], + repo_name=repo_name, + base_url=base_url, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap + ) + + # Store in Milvus + store_task = store_milvus( + embedded_data=chunk_task.outputs["embedded_data"], + milvus_host=milvus_host, + milvus_port=milvus_port, + collection_name=collection_name + ) + +if __name__ == "__main__": + # Compile the pipeline + kfp.compiler.Compiler().compile( + pipeline_func=github_rag_pipeline, + package_path="github_rag_pipeline.yaml" + ) \ No newline at end of file diff --git a/server/Dockerfile b/server/Dockerfile new file mode 100644 index 0000000..0eb7d80 --- /dev/null +++ b/server/Dockerfile @@ -0,0 +1,29 @@ +FROM python:3.11-slim + +# Create non-root user +RUN useradd -m -u 1000 appuser +WORKDIR /app + +# Install deps +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +# Writable caches for HF/Transformers/Sentence-Transformers +ENV HF_HOME=/home/appuser/.cache/huggingface \ + TRANSFORMERS_CACHE=/home/appuser/.cache/huggingface \ + SENTENCE_TRANSFORMERS_HOME=/home/appuser/.cache/sentence_transformers \ + XDG_CACHE_HOME=/home/appuser/.cache \ + PORT=8000 + 
# Ensure directories exist and owned by appuser
RUN mkdir -p $HF_HOME $SENTENCE_TRANSFORMERS_HOME $XDG_CACHE_HOME && \
    chown -R appuser:appuser /home/appuser

# Switch to non-root before running the app
USER appuser

# App
# NOTE(review): COPY after USER still writes a root-owned file (no --chown);
# it is world-readable, which is all the interpreter needs -- confirm intent.
COPY app.py /app/

EXPOSE 8000
CMD ["python", "-u", "app.py"]
diff --git a/server/app.py b/server/app.py
new file mode 100644
index 0000000..8fd4596
--- /dev/null
+++ b/server/app.py
@@ -0,0 +1,247 @@
import os
import json
import asyncio
import httpx
import websockets
from websockets.server import serve
from websockets.exceptions import InvalidMessage, ConnectionClosedError
import logging
from pymilvus import connections, Collection
from sentence_transformers import SentenceTransformer

# Config -- KSERVE_URL/MODEL/PORT are env-overridable; Milvus settings are fixed.
KSERVE_URL = os.getenv("KSERVE_URL", "http://llama.santhosh.svc.cluster.local/openai/v1/chat/completions")
MODEL = os.getenv("MODEL", "llama3.1-8B")
PORT = int(os.getenv("PORT", "8000"))
MILVUS_HOST = "milvus-standalone-final.santhosh.svc.cluster.local"
MILVUS_PORT = "19530"
COLLECTION_NAME = "docs_rag"

# Load embedding model once (module import time -- first request is not slowed).
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def rag_search(query: str) -> str:
    """Search Milvus for the top-3 chunks relevant to `query`.

    Returns a prompt-ready context string, or a string starting with
    "RAG search failed" on any error (handle_chat matches on that prefix).
    """
    try:
        connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
        collection = Collection(COLLECTION_NAME)
        collection.load()

        query_vec = embedding_model.encode(query).tolist()
        results = collection.search(
            data=[query_vec],
            anns_field="vector",
            param={"metric_type": "COSINE", "params": {"nprobe": 32}},
            limit=3,
            output_fields=["file_path", "content_text", "citation_url"]
        )

        # Concatenate the hits into a single block with their citation URLs.
        context = ""
        for hit in results[0]:
            content = hit.entity.get('content_text', '')
            url = hit.entity.get('citation_url', '')
            context += f"Source: {url}\nContent: {content}\n\n"

        # NOTE(review): disconnect is skipped when an exception fires above;
        # presumably harmless because connect() reuses the "default" alias --
        # verify against the pinned pymilvus version.
        connections.disconnect("default")
        return context
    except Exception as e:
        return f"RAG search failed: {e}"

# System prompt to control tool usage
SYSTEM_PROMPT = """You are a helpful AI assistant with access to Kubeflow documentation through a search function.

IMPORTANT: You have two response modes:

1. KUBEFLOW MODE:
   - For questions about: Kubeflow, KServe, Katib
   - Use the search function silently (don't mention it)
   - Start responses with "[KUBEFLOW]"
   - Be detailed but concise

2. GENERAL MODE:
   - For everything else (jokes, math, chat)
   - Don't use search function
   - Start responses with "[GENERAL]"
   - Keep responses short

CRITICAL RULES:
- NEVER describe or mention the search function
- NEVER output function calls or JSON
- NEVER explain what you're going to do
- Just DO it and give the answer

Example good responses:
[KUBEFLOW] Kubeflow can be installed using...
[GENERAL] Here's a joke: Why did the...

Example BAD responses:
"I'll use the search function..."
"The function to use is..."
{any JSON or function calls}"""

# OpenAI-style tool schema advertised to the model; the name must match the
# checks in handle_chat below.
RAG_TOOL = {
    "type": "function",
    "function": {
        "name": "search_kubeflow_docs",
        "description": "Search Kubeflow documentation. ONLY call this function, do not describe it. NEVER output the function call details.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "Search query"}
            },
            "required": ["query"]
        }
    }
}

async def handle_chat(message: str, websocket) -> None:
    """Handle chat with RAG tool support - simple non-streaming approach.

    Flow: one tools-enabled completion; if the model requests
    search_kubeflow_docs, run rag_search and ask again with the retrieved
    context (falling back to a plain completion when retrieval fails);
    otherwise relay the model's direct answer. All replies go out over
    `websocket`; nothing is returned.
    """

    try:
        async with httpx.AsyncClient(timeout=120) as client:
            # Single request with tools - let LLM decide if RAG is needed
            payload = {
                "model": MODEL,
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": message}
                ],
                "tools": [RAG_TOOL],
                "tool_choice": "auto",
                "stream": False,
                "max_tokens": 1500
            }

            response = await client.post(KSERVE_URL, json=payload, headers={"content-type": "application/json"})

            if response.status_code != 200:
                await websocket.send(f"ERROR: HTTP {response.status_code}: {response.text}")
                return

            result = response.json()
            choice = result.get("choices", [{}])[0]
            message_obj = choice.get("message", {})

            # Get direct content and tool calls
            direct_content = message_obj.get("content", "")
            tool_calls = message_obj.get("tool_calls", [])

            # Handle Kubeflow questions with tool calls
            if tool_calls and any(tc["function"]["name"] == "search_kubeflow_docs" for tc in tool_calls):
                try:
                    # Get the search query
                    tool_call = next(tc for tc in tool_calls if tc["function"]["name"] == "search_kubeflow_docs")
                    args = json.loads(tool_call["function"]["arguments"])
                    query = args["query"]

                    # Perform RAG search silently
                    context = rag_search(query)

                    if "RAG search failed" in context:
                        # Fallback to general response
                        fallback_payload = {
                            "model": MODEL,
                            "messages": [
                                {"role": "system", "content": SYSTEM_PROMPT},
                                {"role": "user", "content": message}
                            ],
                            "stream": False,
                            "max_tokens": 800
                        }
                        fallback_response = await client.post(KSERVE_URL, json=fallback_payload, headers={"content-type": "application/json"})

                        if fallback_response.status_code == 200:
                            fallback_result = fallback_response.json()
                            fallback_content = fallback_result.get("choices", [{}])[0].get("message", {}).get("content", "")
                            await websocket.send(f"[GENERAL] {fallback_content}")
                        else:
                            await websocket.send("[ERROR] Could not generate response")
                        return

                    # Generate response with context
                    final_payload = {
                        "model": MODEL,
                        "messages": [
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": message},
                            {"role": "system", "content": f"Use this documentation to answer:\n\n{context}"}
                        ],
                        "stream": False,
                        "max_tokens": 800
                    }

                    final_response = await client.post(KSERVE_URL, json=final_payload, headers={"content-type": "application/json"})

                    if final_response.status_code == 200:
                        final_result = final_response.json()
                        final_content = final_result.get("choices", [{}])[0].get("message", {}).get("content", "")

                        if final_content.strip():
                            # Ensure response starts with [KUBEFLOW]
                            if not final_content.startswith("[KUBEFLOW]"):
                                final_content = f"[KUBEFLOW] {final_content}"
                            await websocket.send(final_content)
                        else:
                            await websocket.send("[ERROR] Empty response from LLM")
                    else:
                        await websocket.send(f"[ERROR] Failed to generate response: {final_response.status_code}")

                except Exception as e:
                    await websocket.send(f"[ERROR] Processing failed: {e}")

            else:
                # Handle direct responses (non-Kubeflow)
                if direct_content.strip():
                    # Ensure response starts with [GENERAL]
                    if not direct_content.startswith("[GENERAL]"):
                        direct_content = f"[GENERAL] {direct_content}"
                    await websocket.send(direct_content)
                else:
                    await websocket.send("[ERROR] No response generated")

    except Exception as e:
        await websocket.send(f"ERROR: Request failed: {e}")

async def handle_websocket(websocket, path):
    """Handle WebSocket connections with proper error handling.

    NOTE(review): the (websocket, path) two-argument handler is the legacy
    websockets API (matching the `websockets.server.serve` import above);
    pin the websockets version in requirements.txt to keep it working.
    """
    try:
        print(f"New WebSocket connection from {websocket.remote_address}")
        async for message in
websocket: + print(f"Received message: {message[:100]}...") + await handle_chat(message, websocket) + except ConnectionClosedError: + print("WebSocket connection closed normally") + except Exception as e: + print(f"WebSocket error: {e}") + try: + await websocket.send(f"ERROR: Connection error: {str(e)}") + except: + pass + +async def health_check(path, request_headers): + """Handle HTTP health checks""" + if path == "/health": + return 200, [("Content-Type", "text/plain")], b"OK" + return None + +async def main(): + print(f"Starting RAG WebSocket server on 0.0.0.0:{PORT}") + print(f"Llama service: {KSERVE_URL}") + print(f"Milvus: {MILVUS_HOST}:{MILVUS_PORT}") + + # Configure logging to reduce noise + logging.getLogger("websockets").setLevel(logging.WARNING) + + async with serve( + handle_websocket, + "0.0.0.0", + PORT, + process_request=health_check, + ping_interval=None, # Disable ping/pong + ping_timeout=None + ): + print("WebSocket server is running...") + await asyncio.Future() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/server/deployment.yaml b/server/deployment.yaml new file mode 100644 index 0000000..dcdec4c --- /dev/null +++ b/server/deployment.yaml @@ -0,0 +1,96 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ws-proxy + labels: + app: ws-proxy + version: v1 +spec: + replicas: 1 + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 + selector: + matchLabels: + app: ws-proxy + template: + metadata: + labels: + app: ws-proxy + version: v1 + spec: + containers: + - name: ws-proxy + image: santhoshtoorpu/ws-proxy:latest + imagePullPolicy: IfNotPresent # actually reduce pulls + resources: + requests: + cpu: "300m" + memory: "1Gi" + ephemeral-storage: "4Gi" + limits: + cpu: "1500m" + memory: "2Gi" + ephemeral-storage: "8Gi" + env: + # Match your app.py variable names + - name: KSERVE_URL + value: "http://llama.santhosh.svc.cluster.local/openai/v1/chat/completions" + - 
name: MODEL
          value: "llama3.1-8B"
        - name: PORT
          value: "8000"
        - name: PYTHONUNBUFFERED
          value: "1"

        # Make cache paths explicit and writable (align with Dockerfile)
        - name: HF_HOME
          value: "/home/appuser/.cache/huggingface"
        # NOTE(review): TRANSFORMERS_CACHE is deprecated in newer
        # transformers releases in favor of HF_HOME -- confirm the installed
        # library version still honors it.
        - name: TRANSFORMERS_CACHE
          value: "/home/appuser/.cache/huggingface"
        - name: SENTENCE_TRANSFORMERS_HOME
          value: "/home/appuser/.cache/sentence_transformers"
        - name: XDG_CACHE_HOME
          value: "/home/appuser/.cache"

        ports:
        - containerPort: 8000
          name: http
          protocol: TCP

        # Probes hit the HTTP /health endpoint served by app.py's
        # process_request hook on the same port as the WebSocket listener.
        livenessProbe:
          httpGet:
            path: /health
            port: 8000
          initialDelaySeconds: 30
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3

        readinessProbe:
          httpGet:
            path: /health
            port: 8000
          initialDelaySeconds: 10
          periodSeconds: 5
          timeoutSeconds: 3
          failureThreshold: 3

        securityContext:
          allowPrivilegeEscalation: false
          runAsNonRoot: true
          runAsUser: 1000
          capabilities:
            drop: ["ALL"]

        # Optional: mount an emptyDir to persist cache across container restarts
        volumeMounts:
        - name: hf-cache
          mountPath: /home/appuser/.cache

      volumes:
      - name: hf-cache
        emptyDir:
          sizeLimit: 5Gi
diff --git a/server/requirements.txt b/server/requirements.txt
new file mode 100644
index 0000000..fba0111
--- /dev/null
+++ b/server/requirements.txt
@@ -0,0 +1,13 @@
# Core dependencies
websockets
httpx

# Milvus client
pymilvus

# Optimized ML dependencies - use lighter alternatives
sentence-transformers
# BUG FIX: pip only honors --extra-index-url as a standalone line in a
# requirements file; appended to a requirement spec ("torch --extra-index-url
# ...") it aborts the install. The CPU-only wheel index now stands alone.
--extra-index-url https://download.pytorch.org/whl/cpu
torch

# Essential only
numpy
diff --git a/server/service.yaml b/server/service.yaml
new file mode 100644
index 0000000..f7a058d
--- /dev/null
+++ b/server/service.yaml
@@ -0,0 +1,28 @@
apiVersion: v1
kind: Service
metadata:
  name: ws-proxy
  labels:
    app: ws-proxy
    version: v1
  annotations:
    # Add service annotations for better observability
    prometheus.io/scrape: "false" # Disable if no metrics endpoint
spec:
  type: ClusterIP # Explicit service type
  selector:
    app: ws-proxy
  ports:
  # NOTE(review): both service ports forward to the same container port 8000;
  # the server speaks WebSocket (plus /health) on either, so port 80 is a
  # convenience alias.
  - name: http
    port: 80
    targetPort: 8000
    protocol: TCP
  - name: websocket
    port: 8000
    targetPort: 8000
    protocol: TCP
  # Session affinity for websocket connections
  sessionAffinity: ClientIP
  sessionAffinityConfig:
    clientIP:
      timeoutSeconds: 3600 # 1 hour timeout for websocket sessions
\ No newline at end of file