Skip to content
Draft
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,10 @@ cscope.*

/bazel-*
*.pyc

# Helm chart dependencies cache
**/Chart.lock
**/charts/*.tgz

# Helm chart output directory
ai/ai-starter-kit/out
64 changes: 64 additions & 0 deletions ai/ai-starter-kit/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Declare every command target as phony so a same-named file in the repo
# can never shadow it. Previously start_gpu, destroy and the validate_*
# targets were missing from this list.
.PHONY: check_hf_token check_OCI_target package_helm lint dep_update install install_gke start start_gpu uninstall destroy push_helm validate_jupyterhub validate_ray
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the usage of the make commands?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You want me to document each?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just in general in README. User can still following the current README to install via helm, so not sure when these make commands should be used.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Documented in commit: 78a03d7


# Guard target: abort unless HF_TOKEN is set (env or command line).
# `ifndef` is evaluated at parse time; when the variable is unset the
# $(error ...) line becomes this target's recipe and aborts make as soon
# as the target is invoked.
check_hf_token:
ifndef HF_TOKEN
	$(error HF_TOKEN is not set)
endif

# Guard target for push_helm: require the OCI registry destination,
# e.g. OCI_HELM_TARGET=registry.example.com/charts
check_OCI_target:
ifndef OCI_HELM_TARGET
	$(error OCI_HELM_TARGET is not set)
endif

# Package the chart into a versioned .tgz under out/.
# NOTE(review): run `make dep_update` first so dependency charts are
# vendored into charts/ before packaging — confirm in CI ordering.
package_helm:
	helm package helm-chart/ai-starter-kit/ --destination out/

# Push the packaged chart to an OCI registry (requires OCI_HELM_TARGET).
# $$ defers expansion to the shell so the environment value is used.
# NOTE(review): the glob assumes a single packaged version exists in out/.
push_helm: check_OCI_target
	helm push out/ai-starter-kit* oci://$$OCI_HELM_TARGET

# Static-check the chart templates and values for errors.
lint:
	helm lint helm-chart/ai-starter-kit

# Fetch/refresh dependency charts declared in Chart.yaml into charts/.
dep_update:
	helm dependency update helm-chart/ai-starter-kit

# Install (or upgrade in place) the ai-starter-kit release into the
# current kubectl context. Requires HF_TOKEN in the environment
# (validated by check_hf_token); $$HF_TOKEN is expanded by the shell,
# not by make, so the env value is passed to helm.
install: check_hf_token
	helm upgrade --install ai-starter-kit helm-chart/ai-starter-kit --set huggingface.token="$$HF_TOKEN" --timeout 10m -f helm-chart/ai-starter-kit/values.yaml

# Start a local minikube cluster with the host model cache mounted into
# the VM at /tmp/models-cache.
# BUG FIX: the recipe previously ran `mkdir -p /tmp/models-cache`, which
# creates the mount *destination* path on the host instead of the mount
# *source* ($HOME/models-cache) that --mount-string actually shares.
start:
	mkdir -p $$HOME/models-cache
	minikube start --cpus 4 --memory 15000 --mount --mount-string="$$HOME/models-cache:/tmp/models-cache"

# Start minikube with the krunkit driver (GPU acceleration on macOS),
# mounting the host model cache at the same path inside the VM.
# BUG FIX: a literal `$HOME` in a recipe is expanded by make as the
# (empty) variable `$H` followed by "OME", so the shell saw
# "OME/models-cache". It must be escaped as `$$HOME`.
start_gpu:
	mkdir -p $$HOME/models-cache
	minikube start --driver krunkit --cpus 4 --memory 15000 --mount --mount-string="$$HOME/models-cache:$$HOME/models-cache"

# Remove the helm release plus leftover resources the chart does not
# garbage-collect (the spawned user pod and the hub's DB PVC).
# --ignore-not-found makes the cleanup idempotent: re-running uninstall
# after a partial teardown no longer fails the target.
uninstall:
	helm uninstall ai-starter-kit
	kubectl delete pod jupyter-user --ignore-not-found
	kubectl delete pvc ai-starter-kit-jupyterhub-hub-db-dir --ignore-not-found

# Delete the entire minikube cluster (irreversible; removes all state).
destroy:
	minikube delete

# Smoke-test JupyterHub: wait for pods, port-forward the public proxy,
# and run the CI probe against it.
# BUG FIX: the original chained everything with `; \` and ended with
# `kill $$PID`, so (a) a failed `kubectl wait` did not stop the recipe
# and (b) the recipe's exit status was kill's, masking a failing test.
# The test status is now captured before the port-forward is killed and
# re-raised with `exit`. (`sleep 5` instead of `5s` for POSIX sleep.)
validate_jupyterhub:
	kubectl get pods
	kubectl wait --for=condition=Ready pods -l 'component!=continuous-image-puller' --timeout=1800s
	kubectl get pods
	kubectl get services
	kubectl port-forward service/ai-starter-kit-jupyterhub-proxy-public 8081:80 & \
	PID=$$!; \
	echo "Port-forward PID=$${PID}"; \
	sleep 5; \
	python3 ./ci/test_hub.py "127.0.0.1:8081"; STATUS=$$?; \
	kill $$PID; \
	exit $$STATUS

# Smoke-test the Ray cluster: wait for kuberay-managed pods, port-forward
# the head service's dashboard port, and submit a trivial job.
# BUG FIX: as in validate_jupyterhub, the trailing `kill $$PID` used to
# determine the recipe's exit status, so a failed `ray job submit` never
# failed the target; the job status is now captured and re-raised.
validate_ray:
	kubectl wait --for=condition=Ready pods -l 'app.kubernetes.io/created-by=kuberay-operator' --timeout=1800s
	kubectl get pods
	kubectl get services
	kubectl port-forward service/ai-starter-kit-kuberay-head-svc 8265:8265 & \
	PID=$$!; \
	sleep 10; \
	ray job submit --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"; STATUS=$$?; \
	kill $$PID; \
	exit $$STATUS
23 changes: 23 additions & 0 deletions ai/ai-starter-kit/helm-chart/ai-starter-kit/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
45 changes: 45 additions & 0 deletions ai/ai-starter-kit/helm-chart/ai-starter-kit/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
apiVersion: v2
name: ai-starter-kit
description: An AI/ML starter environment for Kubernetes bundling JupyterHub, Ray (KubeRay), MLflow, and Ollama

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.1.0"


dependencies:
- name: kuberay-operator
condition: ray-cluster.enabled
version: "1.3.0"
repository: "https://ray-project.github.io/kuberay-helm"
- condition: ray-cluster.enabled
name: ray-cluster
version: "1.3.0"
repository: "https://ray-project.github.io/kuberay-helm"
- name: jupyterhub
version: "4.2.0"
repository: "https://hub.jupyter.org/helm-chart/"
- name: mlflow
version: "0.12.0"
repository: "https://community-charts.github.io/helm-charts"
- name: ollama
condition: ollama.enabled
version: "1.27.0"
repository: "https://helm.otwld.com"
233 changes: 233 additions & 0 deletions ai/ai-starter-kit/helm-chart/ai-starter-kit/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
# AI Starter Kit

A comprehensive Helm chart for deploying a complete AI/ML development environment on Kubernetes. This starter kit provides a ready-to-use platform with JupyterHub notebooks, model serving capabilities, and experiment tracking - perfect for teams starting their AI journey or prototyping AI applications.

## Purpose

The AI Starter Kit simplifies the deployment of AI infrastructure by providing:

- **JupyterHub**: Multi-user notebook environment with pre-configured AI/ML libraries
- **Model Serving**: Support for both Ollama and Ramalama model servers
- **MLflow**: Experiment tracking and model management
- **Model Caching**: Persistent storage for efficient model management
- **Example Notebooks**: Pre-loaded notebooks to get you started immediately

## Prerequisites

### General Requirements
- Kubernetes cluster (minikube, GKE)
- Helm 3.x installed
- kubectl configured to access your cluster
- Hugging Face token for accessing models

### Platform-Specific Requirements

#### Minikube (Local Development)
- Docker Desktop or similar container runtime
- Minimum 4 CPU cores and 16GB RAM available
- 40GB+ free disk space

## Installation

### Quick Start (Minikube)

1. **Create a folder for the persistent storage:**
```bash
mkdir -p $HOME/models-cache
```

2. **Start minikube with persistent storage:**
```bash
minikube start --cpus 4 --memory 15000 \
--mount --mount-string="$HOME/models-cache:/tmp/models-cache"
```

3. **Install the chart:**
```bash
cd ai/ai-starter-kit/helm-chart/ai-starter-kit
helm dependency update
helm install ai-starter-kit . \
--set huggingface.token="YOUR_HF_TOKEN" \
-f values.yaml
```

4. **Access JupyterHub:**
```bash
kubectl port-forward svc/ai-starter-kit-jupyterhub-proxy-public 8080:80
```
Navigate to http://localhost:8080 and login with any username and password `password`

## Configuration

### Key Configuration Options

| Parameter | Description | Default |
|-----------|-------------|---------|
| `huggingface.token` | HuggingFace token for models | `"YOUR_HF_TOKEN"` |
| `ollama.enabled` | Enable Ollama model server | `true` |
| `ramalama.enabled` | Enable Ramalama model server | `true` |
| `modelsCachePvc.size` | Size of model cache storage | `10Gi` |
| `jupyterhub.singleuser.defaultUrl` | Default notebook path | `/lab/tree/welcome.ipynb` |
| `mlflow.enabled` | Enable MLflow tracking server | `true` |

### Storage Configuration

The chart supports different storage configurations:

- **Local Development**: Uses hostPath volumes with minikube mount
- **Custom**: Configure via `modelsCachePvc.storageClassName`

### Using GPUs

In order to use GPUs for AI/ML workloads we need to add the necessary config to the services. Check the dependency charts documentation for the values. For example jupyterhub config would be:

```yaml
jupyterhub:
...
extraResource:
limits:
nvidia.com/gpu: 1
guarantees:
nvidia.com/gpu: 1

nodeSelector:
cloud.google.com/gke-accelerator: nvidia-l4
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's call out in the description above that this is using GKE as an example

Copy link

@alex-akv alex-akv Nov 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Described in commit: ced46e9

```
### Model Servers
#### Ollama
Ollama is enabled by default and provides:
- Easy model management
- REST API for inference
- Support for popular models (Llama, Gemma, Qwen, etc.)
- GPU acceleration support
#### Ramalama
Ramalama provides:
- Alternative model serving solution
- Support for CUDA and Metal (macOS) acceleration
- Lightweight deployment option
## Usage
### Accessing Services
#### JupyterHub
```bash
# Port forward to access JupyterHub
kubectl port-forward svc/ai-starter-kit-jupyterhub-proxy-public 8080:80
# Access at: http://localhost:8080
# Default password: sneakypass
```

#### MLflow
```bash
# Port forward to access MLflow UI
kubectl port-forward svc/ai-starter-kit-mlflow 5000:5000
# Access at: http://localhost:5000
```

#### Ollama/Ramalama API
```bash
# For Ollama
kubectl port-forward svc/ai-starter-kit-ollama 11434:11434

# For Ramalama
kubectl port-forward svc/ai-starter-kit-ramalama 8080:8080
```

### Pre-loaded Example Notebooks

The JupyterHub environment comes with pre-loaded example notebooks:
- `ray.ipynb`: Simple Ray and MLflow example
- `chat_bot.ipynb`: Simple chatbot interface using Ollama for conversational AI.
- `multi-agent.ipynb`: Multi-agent workflow demonstration using Ray.
- `multi-agent-ollama.ipynb`: Similar multi-agent workflow demonstration using Ollama.
- `multi-agent-ramalama.ipynb`: Similar multi-agent workflow using RamaLama runtime for comparison.
- `welcome.ipynb`: Introduction notebook with embedding model examples using Qwen models.

These notebooks are automatically copied to your workspace on first login.

## Architecture

The AI Starter Kit consists of:

1. **JupyterHub**: Multi-user notebook server with persistent storage
2. **Model Serving**: Choice of Ollama or Ramalama for LLM inference
3. **MLflow**: Experiment tracking and model registry
4. **Persistent Storage**: Shared model cache to avoid redundant downloads
5. **Init Containers**: Automated setup of models and notebooks

## Cleanup

### Uninstall the chart
```bash
helm uninstall ai-starter-kit
```

### Delete persistent volumes (optional)
```bash
kubectl delete pvc ai-starter-kit-models-cache-pvc
kubectl delete pvc ai-starter-kit-jupyterhub-hub-db-dir
```

### Delete GKE cluster
```bash
gcloud container clusters delete ${CLUSTER_NAME} --region=${REGION}
```
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed in commit: ced46e9


### Stop minikube
```bash
minikube stop
minikube delete # To completely remove the cluster
```

## Troubleshooting

### Common Issues

#### Pods stuck in Pending state
- Check available resources: `kubectl describe pod <pod-name>`
- Increase cluster resources or reduce resource requests

#### Model download failures
- Verify Hugging Face token is set correctly
- Check internet connectivity from pods
- Increase init container timeout in values

#### GPU not detected
- Verify GPU nodes are available: `kubectl get nodes -o wide`
- Check GPU driver installation
- Ensure correct node selectors and tolerations

#### Storage issues
- Verify PVC is bound: `kubectl get pvc`
- Check storage class availability: `kubectl get storageclass`
- Ensure sufficient disk space

### Debug Commands
```bash
# Check pod status
kubectl get pods -n default

# View pod logs
kubectl logs -f <pod-name>

# Describe pod for events
kubectl describe pod <pod-name>

# Check resource usage
kubectl top nodes
kubectl top pods
```

## Resources

- [JupyterHub Documentation](https://jupyterhub.readthedocs.io/)
- [MLflow Documentation](https://mlflow.org/docs/latest/index.html)
- [Ollama Documentation](https://ollama.ai/docs)
- [Kubernetes Documentation](https://kubernetes.io/docs/)
- [Helm Documentation](https://helm.sh/docs/)
Loading