# run-sdk-pr-e2e-tests #2
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
---
# End-to-end notebook check for SDK pull requests.
#
# Triggered by a `repository_dispatch` event of type `run-sdk-pr-e2e-tests`.
# The event's `client_payload` must provide:
#   sdk_repo - owner/name of the repository holding the SDK revision to test
#   sdk_sha  - commit to check out from that repository
#
# The job creates a Kind cluster, installs Kubeflow Trainer and its runtimes,
# installs the SDK revision in editable mode, and executes the Trainer MNIST
# example notebook with Papermill. The executed notebook is uploaded as an
# artifact whether or not the run succeeded.
name: SDK PR Notebook E2E Runner

on:
  repository_dispatch:
    types: [run-sdk-pr-e2e-tests]

jobs:
  run-notebook-test:
    runs-on: kubeflow-devx-testing
    # No step in this job writes to the repository, and the job executes code
    # selected by the dispatch payload, so the token is kept read-only.
    permissions:
      contents: read
    steps:
      - name: Checkout SDK PR Code
        uses: actions/checkout@v4
        with:
          repository: ${{ github.event.client_payload.sdk_repo }}
          ref: ${{ github.event.client_payload.sdk_sha }}
          path: sdk-repo  # Checkout the SDK code into a subdirectory
          # The checked-out code is executed later in this job; do not leave
          # the token in .git/config where that code could read it.
          persist-credentials: false

      - name: Checkout Trainer Repo (Test Asset)
        uses: actions/checkout@v4
        with:
          repository: opendatahub-io/trainer
          ref: main
          path: trainer-repo  # Checkout the Trainer code into a subdirectory
          persist-credentials: false

      # Kubernetes Cluster Setup
      # This step may not be needed with the larger runner
      - name: Free up disk space
        run: |
          echo "Available disk space before cleanup:"
          df -h
          # Remove unnecessary packages and files to free up space
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          # Clean up Docker
          docker system prune -af
          echo "Available disk space after cleanup:"
          df -h

      - name: Set up Kind Kubernetes Cluster
        uses: helm/kind-action@v1
        with:
          cluster_name: kubeflow-test
          wait: 300s

      - name: Verify Kind Cluster
        run: |
          kubectl cluster-info
          kubectl get nodes
          kubectl get pods -A

      # NOTE(review): the manifests below are pulled from `ref=master`, while
      # the Trainer repo above is checked out at `main`. The notebook and the
      # installed controller may therefore come from different revisions -
      # confirm which branch is intended and align the two.
      - name: Install Kubeflow Trainer
        run: |
          echo "Installing Kubeflow Trainer..."
          # Install the Trainer manager using kubectl with server-side apply to handle large CRDs
          kubectl apply --server-side -k "github.com/opendatahub-io/trainer/manifests/overlays/manager?ref=master"
          # Wait for the trainer deployment to be ready
          echo "Waiting for Trainer controller to be ready..."
          kubectl wait --for=condition=available --timeout=300s \
            deployment/kubeflow-trainer-controller-manager -n kubeflow-system
          # Wait a bit more for webhooks to be fully ready
          echo "Waiting for webhooks to be ready..."
          sleep 30
          # Install the default training runtimes
          echo "Installing Trainer runtimes..."
          kubectl apply --server-side -k "github.com/opendatahub-io/trainer/manifests/overlays/runtimes?ref=master"
          # Verify installation
          echo "Verifying Trainer installation..."
          kubectl get deployment -n kubeflow-system
          kubectl get pods -n kubeflow-system
          # Verify runtimes are installed
          echo "Verifying ClusterTrainingRuntimes..."
          kubectl get clustertrainingruntimes

      # Test Environment Setup
      - name: Set up Python 3.9
        uses: actions/setup-python@v6
        with:
          python-version: '3.9'

      - name: Setup Environment and Install Dependencies
        shell: bash
        run: |
          echo "Installing tools and dependencies..."
          # 1. Install Papermill and core execution tools
          pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
          # 2. Install main dependencies (Manually listed from pyproject.toml [project.dependencies])
          echo "Installing main SDK dependencies..."
          # Each specifier must be quoted: an unquoted `>=X` is parsed by the
          # shell as a redirection of stdout to a file named `=X`, which drops
          # the version constraint and hides pip's output.
          pip install \
            "kubernetes>=27.2.0" \
            "pydantic>=2.10.0" \
            "kubeflow-trainer-api>=2.0.0" \
            "kubeflow-katib-api>=0.19.0"
          # 3. Install the SDK Code itself in editable mode
          cd sdk-repo
          echo "Installing SDK PR code in editable mode..."
          # Quoted so the shell does not treat `[dev]` as a glob character class
          pip install -e ".[dev]"
          # 4. Configure Notebook Kernel
          PYTHON_BIN=$(which python)
          "$PYTHON_BIN" -m ipykernel install --user --name=sdk-test-kernel --display-name "Python (SDK Test)"
          cd ..
          mkdir -p artifacts/notebooks  # Create artifact directory

      - name: Configure Kubernetes Access
        run: |
          # Ensure kubeconfig is accessible
          mkdir -p ~/.kube
          kind get kubeconfig --name kubeflow-test > ~/.kube/config
          chmod 600 ~/.kube/config
          # Verify access
          kubectl get nodes
          kubectl get namespaces

      - name: Run E2E Notebook Test with Papermill
        id: run-test
        run: |
          # Use the python executable where Papermill was installed
          PAPERMILL_BIN=$(which papermill)
          echo "Using Papermill from: $PAPERMILL_BIN"
          # Set the notebook paths
          NOTEBOOK_INPUT="trainer-repo/examples/pytorch/image-classification/mnist.ipynb"
          NOTEBOOK_OUTPUT="artifacts/notebooks/sdk-pr-test-output.ipynb"
          echo "Executing notebook: $NOTEBOOK_INPUT"
          # Ensure the output directory exists
          mkdir -p "$(dirname "$NOTEBOOK_OUTPUT")"
          # Execute the notebook using Papermill
          "$PAPERMILL_BIN" "$NOTEBOOK_INPUT" "$NOTEBOOK_OUTPUT" \
            --kernel "sdk-test-kernel" \
            --log-output \
            --log-level INFO
          echo "Notebook test execution finished successfully."

      # Runs even when the notebook step failed so the partially executed
      # notebook is available for debugging.
      - name: Upload Executed Notebook Artifact
        uses: actions/upload-artifact@v5
        if: always()
        with:
          name: sdk-pr-notebook-result-${{ github.event.client_payload.sdk_sha }}
          path: artifacts/notebooks/sdk-pr-test-output.ipynb
          retention-days: 1

      # Best-effort teardown: `|| true` keeps a failed delete from failing
      # the job.
      - name: Cleanup Kind Cluster
        if: always()
        run: |
          kind delete cluster --name kubeflow-test || true