30 changes: 20 additions & 10 deletions .github/actions/deploy/action.yml
@@ -4,16 +4,16 @@ description: "Step to start and configure KFP on Kind"
inputs:
pipeline_store:
description: "Flag to deploy KFP with K8s Native API"
default: 'database'
default: "database"
required: false
proxy:
description: "If KFP should be deployed with proxy configuration"
required: false
default: 'false'
default: "false"
cache_enabled:
description: "If KFP should be deployed with cache enabled globally"
required: false
default: 'true'
default: "true"
image_tag:
required: true
description: "Provide the image tag your image was tagged with"
@@ -26,26 +26,30 @@ inputs:
multi_user:
description: "If KFP should be deployed in multi-user mode"
required: false
default: 'false'
default: "false"
artifact_proxy:
description: "Enables artifact proxy"
required: false
default: 'false'
default: "false"
storage_backend:
description: "Storage backend to use (minio or seaweedfs)"
required: false
default: 'seaweedfs'
default: "seaweedfs"
argo_version:
required: false
description: "Argo version to use for the cluster"
db_type:
description: "The database to deploy for testing: mysql, or pgx for PostgreSQL."
required: false
default: ""
forward_port:
required: false
default: 'true'
default: "true"
description: "If you want to forward API server port to localhost:8888"
pod_to_pod_tls_enabled:
description: "If KFP should be deployed with TLS pod-to-pod communication."
required: false
default: 'false'
default: "false"

runs:
using: "composite"
@@ -64,13 +68,13 @@ runs:
- name: Load Docker Images
shell: bash
run: |
APPS=("apiserver" "driver" "launcher" "scheduledworkflow" "persistenceagent" "frontend" "metadata-writer")
APPS=("apiserver" "driver" "launcher" "scheduledworkflow" "persistenceagent" "frontend" "metadata-writer" "cache-server")
for app in "${APPS[@]}"; do
docker image load -i ${{ inputs.image_path }}/$app/$app.tar
docker push ${{ inputs.image_registry }}/$app:${{ inputs.image_tag }}
rm ${{ inputs.image_path }}/$app/$app.tar
docker image rm ${{ inputs.image_registry }}/$app:${{ inputs.image_tag }}
done

- name: Configure Args
shell: bash
@@ -115,6 +119,12 @@ runs:
if [ "${{inputs.pod_to_pod_tls_enabled }}" = "true" ]; then
ARGS="${ARGS} --tls-enabled"
fi

if [ -n "${{ inputs.db_type }}" ]; then
echo "Deploying with database type ${{ inputs.db_type }}"
ARGS="${ARGS} --db-type ${{ inputs.db_type }}"
fi

echo "ARGS=$ARGS" >> "$GITHUB_OUTPUT"

- name: Deploy KFP
3 changes: 3 additions & 0 deletions .github/resources/manifests/multiuser/base/kustomization.yaml
@@ -17,3 +17,6 @@ images:
- name: ghcr.io/kubeflow/kfp-frontend
newName: kind-registry:5000/frontend
newTag: latest
- name: ghcr.io/kubeflow/kfp-cache-server
newName: kind-registry:5000/cache-server
newTag: latest
Original file line number Diff line number Diff line change
@@ -17,6 +17,9 @@ images:
- name: ghcr.io/kubeflow/kfp-frontend
newName: kind-registry:5000/frontend
newTag: latest
- name: ghcr.io/kubeflow/kfp-cache-server
newName: kind-registry:5000/cache-server
newTag: latest

patches:
- path: apiserver-env.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: ml-pipeline
spec:
template:
spec:
containers:
- name: ml-pipeline-api-server
env:
- name: V2_DRIVER_IMAGE
value: kind-registry:5000/driver:latest
- name: V2_LAUNCHER_IMAGE
value: kind-registry:5000/launcher:latest
- name: LOG_LEVEL
value: "debug"
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# This CI overlay for PostgreSQL testing does three things:
# 1. It uses `platform-agnostic-postgresql` as its base. This is the project's
# standard way to deploy KFP with PostgreSQL, which correctly includes both
# the KFP core components and the third-party PostgreSQL instance, and
# patches the API server to use the 'pgx' driver.
# 2. It applies an additional patch (`apiserver-env.yaml`) to inject
# CI-specific environment variables, like the V2 image path. This aligns
# with the pattern used in other CI overlays like `minio`.
# 3. It overrides the image names to use the locally built images from the
# Kind registry, which is standard practice for all CI tests.
resources:
- ../../../../../manifests/kustomize/env/platform-agnostic-postgresql

images:
- name: ghcr.io/kubeflow/kfp-api-server
newName: kind-registry:5000/apiserver
newTag: latest
- name: ghcr.io/kubeflow/kfp-cache-server
newName: kind-registry:5000/cache-server
newTag: latest

patches:
- path: apiserver-env.yaml
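To sanity-check what this overlay renders before the deploy script applies it, the manifests can be built locally; a minimal sketch, assuming the overlay lives at the standalone/postgresql path that deploy-kfp.sh selects further down:

# Render the CI PostgreSQL overlay without applying it, to inspect the image
# overrides and the apiserver-env.yaml patch. The directory is an assumption
# inferred from the selection logic in deploy-kfp.sh below.
kubectl kustomize .github/resources/manifests/standalone/postgresql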
Original file line number Diff line number Diff line change
@@ -20,5 +20,8 @@ images:
- name: ghcr.io/kubeflow/kfp-metadata-writer
newName: kind-registry:5000/metadata-writer
newTag: latest
- name: ghcr.io/kubeflow/kfp-cache-server
newName: kind-registry:5000/cache-server
newTag: latest
patches:
- path: apiserver-env.yaml
31 changes: 30 additions & 1 deletion .github/resources/scripts/collect-logs.sh
@@ -51,7 +51,36 @@ function display_pod_info {
kubectl describe pod "${POD_NAME}" -n "${NAMESPACE}" | grep -A 100 Events || echo "No events found for pod ${POD_NAME}."

echo "----- LOGS -----"
kubectl logs "${POD_NAME}" -n "${NAMESPACE}" || echo "No logs found for pod ${POD_NAME}."

# Get all containers (init + regular) from the pod
INIT_CONTAINERS=$(kubectl get pod "${POD_NAME}" -n "${NAMESPACE}" -o jsonpath='{.spec.initContainers[*].name}' 2>/dev/null || echo "")
CONTAINERS=$(kubectl get pod "${POD_NAME}" -n "${NAMESPACE}" -o jsonpath='{.spec.containers[*].name}' 2>/dev/null || echo "")

# Collect logs from init containers
if [[ -n "${INIT_CONTAINERS}" ]]; then
for CONTAINER in ${INIT_CONTAINERS}; do
echo "----- Init Container: ${CONTAINER} (current) -----"
kubectl logs "${POD_NAME}" -c "${CONTAINER}" -n "${NAMESPACE}" 2>&1 || echo "No current logs found for init container ${CONTAINER}."

echo "----- Init Container: ${CONTAINER} (previous) -----"
kubectl logs "${POD_NAME}" -c "${CONTAINER}" -n "${NAMESPACE}" --previous 2>&1 || echo "No previous logs found for init container ${CONTAINER}."
done
fi

# Collect logs from regular containers
if [[ -n "${CONTAINERS}" ]]; then
for CONTAINER in ${CONTAINERS}; do
echo "----- Container: ${CONTAINER} (current) -----"
kubectl logs "${POD_NAME}" -c "${CONTAINER}" -n "${NAMESPACE}" 2>&1 || echo "No current logs found for container ${CONTAINER}."

echo "----- Container: ${CONTAINER} (previous) -----"
kubectl logs "${POD_NAME}" -c "${CONTAINER}" -n "${NAMESPACE}" --previous 2>&1 || echo "No previous logs found for container ${CONTAINER}."
done
else
# Fallback: try to get logs without specifying container (for single-container pods)
echo "----- Default Container -----"
kubectl logs "${POD_NAME}" -n "${NAMESPACE}" 2>&1 || echo "No logs found for pod ${POD_NAME}."
fi

echo "==========================="
echo ""
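For reference, the Postgres workflow further down runs this script once per namespace; a minimal manual invocation against a running cluster might look like this (flags taken from that workflow, output path purely illustrative):

# Collect describe output plus current and previous logs for every container
# of every pod in the kubeflow namespace into a single file.
./.github/resources/scripts/collect-logs.sh --ns kubeflow --output /tmp/pod_log.txt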
43 changes: 33 additions & 10 deletions .github/resources/scripts/deploy-kfp.sh
@@ -31,6 +31,7 @@ CACHE_DISABLED=false
ARTIFACT_PROXY_ENABLED=false
MULTI_USER=false
STORAGE_BACKEND="seaweedfs"
DB_TYPE=""
AWF_VERSION=""
POD_TO_POD_TLS_ENABLED=false
SEAWEEDFS_INIT_TIMEOUT=300s
@@ -64,6 +65,10 @@ while [ "$#" -gt 0 ]; do
STORAGE_BACKEND="$2"
shift 2
;;
--db-type)
DB_TYPE="$2"
shift 2
;;
--argo-version)
shift
if [[ -n "$1" ]]; then
@@ -139,22 +144,40 @@ fi
# Manifests will be deployed according to the flag provided
if [ "${MULTI_USER}" == "false" ] && [ "${PIPELINES_STORE}" != "kubernetes" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/standalone"
if $CACHE_DISABLED; then
TEST_MANIFESTS="${TEST_MANIFESTS}/cache-disabled"
elif $USE_PROXY; then
TEST_MANIFESTS="${TEST_MANIFESTS}/proxy"
elif [ "${STORAGE_BACKEND}" == "minio" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/minio"

# Priority 1: TLS-enabled (mutually exclusive with other options)
if $POD_TO_POD_TLS_ENABLED; then
TEST_MANIFESTS="${TEST_MANIFESTS}/tls-enabled"

# Priority 2: PostgreSQL (mutually exclusive with default MySQL setup)
elif [ "${DB_TYPE}" == "pgx" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/postgresql"

# Priority 3: Check for cache-disabled + proxy + minio combination
elif $CACHE_DISABLED && $USE_PROXY && [ "${STORAGE_BACKEND}" == "minio" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/cache-disabled-proxy-minio"

# Priority 4: Check for cache-disabled + proxy combination
elif $CACHE_DISABLED && $USE_PROXY; then
TEST_MANIFESTS="${TEST_MANIFESTS}/cache-disabled-proxy"

# Priority 5: Check for cache-disabled + minio combination
elif $CACHE_DISABLED && [ "${STORAGE_BACKEND}" == "minio" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/cache-disabled-minio"

# Priority 6: Check for proxy + minio combination
elif $USE_PROXY && [ "${STORAGE_BACKEND}" == "minio" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/proxy-minio"
elif $CACHE_DISABLED && $USE_PROXY && [ "${STORAGE_BACKEND}" == "minio" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/cache-disabled-proxy-minio"
elif $POD_TO_POD_TLS_ENABLED; then
TEST_MANIFESTS="${TEST_MANIFESTS}/tls-enabled"

# Priority 7: Check for single flags (cache-disabled, proxy, or minio)
elif $CACHE_DISABLED; then
TEST_MANIFESTS="${TEST_MANIFESTS}/cache-disabled"
elif $USE_PROXY; then
TEST_MANIFESTS="${TEST_MANIFESTS}/proxy"
elif [ "${STORAGE_BACKEND}" == "minio" ]; then
TEST_MANIFESTS="${TEST_MANIFESTS}/minio"

# Default: seaweedfs with cache enabled
else
TEST_MANIFESTS="${TEST_MANIFESTS}/default"
fi
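As a rough illustration of how the new selection order resolves, limited to flags that are visible in this diff (the script may expect additional arguments that the deploy action normally supplies):

# TLS takes priority over everything else -> standalone/tls-enabled
./.github/resources/scripts/deploy-kfp.sh --tls-enabled
# pgx selects the PostgreSQL overlay shown earlier -> standalone/postgresql
./.github/resources/scripts/deploy-kfp.sh --db-type pgx
# MinIO alone, with cache and proxy at their defaults -> standalone/minio
./.github/resources/scripts/deploy-kfp.sh --storage-backend minio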
111 changes: 111 additions & 0 deletions .github/workflows/api-server-test-Postgres.yml
@@ -0,0 +1,111 @@
name: API Server Tests - Postgres

on:
push:
branches:
- master
pull_request:
paths:
- "backend/**"
- "manifests/kustomize/third-party/postgresql/**"
- ".github/resources/manifests/standalone/**"
- ".github/workflows/api-server-test-Postgres.yml"
- "!**/*.md"
- "!**/OWNERS"
env:
NAMESPACE: kubeflow
POSTGRES_NAMESPACE: kubeflow
DB_TYPE: postgres
DB_DRIVER: pgx
DB_PORT: "5432"
# The IP address for port-forwarding the database. Go tests will connect to this IP.
# This should be kept in sync with other postgres test workflows and local test scripts.
# Using 127.0.0.1 to match MySQL workflow behavior and Kind local development setup.
DB_FORWARD_IP: 127.0.0.1
DB_USER: user
DB_PASSWORD: password
DB_NAME: mlpipeline
jobs:
build:
uses: ./.github/workflows/image-builds-with-cache.yml
postgres-pgx:
runs-on: ubuntu-latest
needs: build
continue-on-error: false
strategy:
matrix:
cache_enabled: [true, false]
fail-fast: false # Ensure all jobs in the matrix run, even if one fails
name: KFP Backend V2 Postgres Tests (Cache ${{ matrix.cache_enabled }})

steps:
- name: Checkout target code
uses: actions/checkout@v5
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: "1.22"
cache: true

- name: Create KFP cluster
uses: ./.github/actions/create-cluster
with:
k8s_version: "v1.30.2"

- name: Deploy KFP with Postgres
uses: ./.github/actions/deploy
with:
db_type: "pgx"
pipeline_store: "database"
cache_enabled: ${{ matrix.cache_enabled }}
image_path: ${{ needs.build.outputs.IMAGE_PATH }}
image_tag: ${{ needs.build.outputs.IMAGE_TAG }}
image_registry: ${{ needs.build.outputs.IMAGE_REGISTRY }}
forward_port: "true"
- name: Port-forward Postgres
run: kubectl -n "$POSTGRES_NAMESPACE" port-forward svc/postgres-service ${{ env.DB_PORT }}:${{ env.DB_PORT }} --address=${{ env.DB_FORWARD_IP }} &

- name: Port-forward ML Metadata service
run: kubectl -n "$NAMESPACE" port-forward svc/metadata-grpc-service 8080:8080 &
# Exclude upgrade tests for the following reasons:
# 1. Responsibility: Upgrade tests are handled by the dedicated `upgrade-test.yml` workflow.
# 2. Incompatibility: This workflow runs tests against a single, clean deployment. It cannot
# accommodate the two-phase nature of upgrade tests (prepare on an old version, then
# verify on the new one).
# 3. No Baseline: As PostgreSQL was not officially supported before, there is no prior
# stable release to serve as a baseline for an upgrade test.
- name: Run v2 api tests
run: |
go run github.com/onsi/ginkgo/v2/ginkgo -r -v --label-filter="!UpgradePreparation && !UpgradeVerification" ./backend/test/v2/api/... -- \
-namespace="$NAMESPACE"
- name: Run v2 integration tests
run: |
# v2/integration tests use testify/suite framework, not Ginkgo, so we must use 'go test' instead of 'ginkgo'
# Build the go test command with appropriate flags
TEST_CMD="go test -v -timeout 30m ./backend/test/v2/integration/..."

# Arguments for the test binary (passed after -args)
TEST_ARGS="-runIntegrationTests=true -namespace=$NAMESPACE -cacheEnabled=${{ matrix.cache_enabled }}"

if [[ "${{ matrix.cache_enabled }}" == "false" ]]; then
# When cache is disabled, we must skip the cache test itself.
# Use Go test's -skip flag to exclude TestCache
TEST_CMD="$TEST_CMD -skip TestCache"
fi

# Execute the test command with arguments
eval "$TEST_CMD -args $TEST_ARGS"

- name: Collect pod logs
if: always()
run: |
mkdir -p /tmp/tmp.kfp /tmp/tmp.postgres
./.github/resources/scripts/collect-logs.sh --ns "$NAMESPACE" --output /tmp/tmp.kfp/pod_log.txt
./.github/resources/scripts/collect-logs.sh --ns "$POSTGRES_NAMESPACE" --output /tmp/tmp.postgres/pod_log.txt

- name: Upload test artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: api-server-postgres-test-artifacts-cache-${{ matrix.cache_enabled }}
path: /tmp/tmp*/*
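With the Postgres port-forward from this workflow in place, the test database can also be inspected by hand using the same env values; an illustrative sketch, not part of the workflow (requires a local psql client):

# List the KFP tables in the forwarded PostgreSQL instance, using the
# DB_USER/DB_PASSWORD/DB_FORWARD_IP/DB_PORT/DB_NAME values defined above.
psql "postgresql://user:password@127.0.0.1:5432/mlpipeline" -c '\dt'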
5 changes: 4 additions & 1 deletion .github/workflows/image-builds-with-cache.yml
@@ -50,6 +50,9 @@ jobs:
- image: metadata-writer
dockerfile: backend/metadata_writer/Dockerfile
context: .
- image: cache-server
dockerfile: backend/Dockerfile.cacheserver
context: .
env:
ARTIFACT_NAME: "${{ matrix.image }}"
ARTIFACTS_PATH: "images_${{ github.sha }}"
@@ -119,4 +122,4 @@ jobs:
path: ${{ env.ARTIFACTS_PATH }}/${{ env.ARTIFACT_NAME }}.tar
retention-days: 1
# Continue the workflow even if the upload failed, because the upload can fail when another job uploads the artifact before this one
continue-on-error: true
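For local reproduction of the new matrix entry, the cache-server image can be built the same way the other backend images are; a sketch using the dockerfile and context declared above, with the tag taken from the kind-registry overrides in the CI kustomizations:

# Build the cache-server image from the repository root, mirroring the new
# matrix entry; the tag matches the override used in the CI overlays.
docker build -t kind-registry:5000/cache-server:latest -f backend/Dockerfile.cacheserver .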