Skip to content

Commit cd1d08b

Browse files
kaikaila and claude committed
debug(cache): Add comprehensive logging for PVC cache debugging
Add detailed logging throughout the cache flow to diagnose the TestCacheSingleRunWithPVC_SameName_Caches failure where Run3 with a different PVC name incorrectly hits the cache (expected COMPLETE, got CACHED).

Logging added to:
- cache_test.go: Log execution states and cached_execution_id
- driver/cache.go: Log PVC names during fingerprint generation
- cacheutils/cache.go: Log cache key creation and query results
- task_store.go: Log SQL queries and returned tasks with fingerprints

This will help identify whether:
1. PVC parameters are being correctly resolved from DAG inputs
2. PVC names are included in cache key generation
3. Different PVC names produce different fingerprints
4. Database queries correctly filter by fingerprint

Related to PostgreSQL migration and V2 cache debugging.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
Signed-off-by: kaikaila <[email protected]>
1 parent 6cf9be1 commit cd1d08b

File tree

24 files changed

+1289
-23
lines changed

24 files changed

+1289
-23
lines changed

.github/resources/scripts/collect-logs.sh

Lines changed: 30 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,36 @@ function display_pod_info {
5151
kubectl describe pod "${POD_NAME}" -n "${NAMESPACE}" | grep -A 100 Events || echo "No events found for pod ${POD_NAME}."
5252

5353
echo "----- LOGS -----"
54-
kubectl logs "${POD_NAME}" -n "${NAMESPACE}" || echo "No logs found for pod ${POD_NAME}."
54+
55+
# Get all containers (init + regular) from the pod
56+
INIT_CONTAINERS=$(kubectl get pod "${POD_NAME}" -n "${NAMESPACE}" -o jsonpath='{.spec.initContainers[*].name}' 2>/dev/null || echo "")
57+
CONTAINERS=$(kubectl get pod "${POD_NAME}" -n "${NAMESPACE}" -o jsonpath='{.spec.containers[*].name}' 2>/dev/null || echo "")
58+
59+
# Collect logs from init containers
60+
if [[ -n "${INIT_CONTAINERS}" ]]; then
61+
for CONTAINER in ${INIT_CONTAINERS}; do
62+
echo "----- Init Container: ${CONTAINER} (current) -----"
63+
kubectl logs "${POD_NAME}" -c "${CONTAINER}" -n "${NAMESPACE}" 2>&1 || echo "No current logs found for init container ${CONTAINER}."
64+
65+
echo "----- Init Container: ${CONTAINER} (previous) -----"
66+
kubectl logs "${POD_NAME}" -c "${CONTAINER}" -n "${NAMESPACE}" --previous 2>&1 || echo "No previous logs found for init container ${CONTAINER}."
67+
done
68+
fi
69+
70+
# Collect logs from regular containers
71+
if [[ -n "${CONTAINERS}" ]]; then
72+
for CONTAINER in ${CONTAINERS}; do
73+
echo "----- Container: ${CONTAINER} (current) -----"
74+
kubectl logs "${POD_NAME}" -c "${CONTAINER}" -n "${NAMESPACE}" 2>&1 || echo "No current logs found for container ${CONTAINER}."
75+
76+
echo "----- Container: ${CONTAINER} (previous) -----"
77+
kubectl logs "${POD_NAME}" -c "${CONTAINER}" -n "${NAMESPACE}" --previous 2>&1 || echo "No previous logs found for container ${CONTAINER}."
78+
done
79+
else
80+
# Fallback: try to get logs without specifying container (for single-container pods)
81+
echo "----- Default Container -----"
82+
kubectl logs "${POD_NAME}" -n "${NAMESPACE}" 2>&1 || echo "No logs found for pod ${POD_NAME}."
83+
fi
5584

5685
echo "==========================="
5786
echo ""

.github/workflows/api-server-test-Postgres.yml

Lines changed: 113 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -152,19 +152,127 @@ jobs:
152152
# Execute the test command with arguments
153153
$TEST_CMD -args $TEST_ARGS
154154
155+
- name: Collect cache-specific diagnostics
156+
if: always()
157+
run: |
158+
mkdir -p /tmp/cache-diagnostics
159+
160+
echo "=== Collecting cache-related pod logs ==="
161+
162+
# Find all driver and launcher pods
163+
DRIVER_PODS=$(kubectl get pods -n "$NAMESPACE" --no-headers -o custom-columns=":metadata.name" | grep -E "container-driver|dag-driver" || echo "")
164+
IMPL_PODS=$(kubectl get pods -n "$NAMESPACE" --no-headers -o custom-columns=":metadata.name" | grep "container-impl" || echo "")
165+
166+
# Collect driver logs (cache queries)
167+
if [[ -n "$DRIVER_PODS" ]]; then
168+
echo "Found driver pods:"
169+
echo "$DRIVER_PODS"
170+
171+
for pod in $DRIVER_PODS; do
172+
echo "Collecting logs from driver pod: $pod"
173+
kubectl logs "$pod" -c main -n "$NAMESPACE" > "/tmp/cache-diagnostics/${pod}_driver.log" 2>&1 || true
174+
done
175+
else
176+
echo "No driver pods found"
177+
fi
178+
179+
# Collect launcher/impl logs (cache creation)
180+
if [[ -n "$IMPL_PODS" ]]; then
181+
echo "Found launcher/impl pods:"
182+
echo "$IMPL_PODS"
183+
184+
for pod in $IMPL_PODS; do
185+
echo "Collecting logs from launcher pod: $pod"
186+
kubectl logs "$pod" -c main -n "$NAMESPACE" > "/tmp/cache-diagnostics/${pod}_launcher.log" 2>&1 || true
187+
done
188+
else
189+
echo "No launcher/impl pods found"
190+
fi
191+
192+
# Extract CACHE-DEBUG messages
193+
echo "=== Extracting CACHE-DEBUG messages ==="
194+
grep -h "CACHE-DEBUG\|cache_fingerprint\|cached_execution_id" /tmp/cache-diagnostics/*.log > /tmp/cache-diagnostics/cache_debug_summary.txt 2>/dev/null || echo "No CACHE-DEBUG messages found" > /tmp/cache-diagnostics/cache_debug_summary.txt
195+
196+
# Show summary
197+
echo ""
198+
echo "=== Cache Debug Summary ==="
199+
cat /tmp/cache-diagnostics/cache_debug_summary.txt
200+
201+
# Query PostgreSQL tasks table for cache entries
202+
echo ""
203+
echo "=== Querying PostgreSQL for cache entries ==="
204+
205+
POSTGRES_POD=$(kubectl get pods -n "$POSTGRES_NAMESPACE" -l app=postgres -o jsonpath='{.items[0].metadata.name}')
206+
207+
if [[ -n "$POSTGRES_POD" ]]; then
208+
# Query tasks table
209+
kubectl exec -n "$POSTGRES_NAMESPACE" "$POSTGRES_POD" -- psql -U "$DB_USER" -d "$DB_NAME" -c \
210+
"SELECT \"UUID\", \"Fingerprint\", \"MLMDExecutionID\", \"PipelineName\", \"Namespace\", \"RunUUID\", to_timestamp(\"CreatedTimestamp\") as created, to_timestamp(\"FinishedTimestamp\") as finished FROM tasks ORDER BY \"CreatedTimestamp\" DESC LIMIT 20;" \
211+
> /tmp/cache-diagnostics/postgres_tasks_table.txt 2>&1 || echo "Failed to query tasks table" > /tmp/cache-diagnostics/postgres_tasks_table.txt
212+
213+
# Count cache entries by fingerprint
214+
kubectl exec -n "$POSTGRES_NAMESPACE" "$POSTGRES_POD" -- psql -U "$DB_USER" -d "$DB_NAME" -c \
215+
"SELECT \"Fingerprint\", COUNT(*) as count FROM tasks GROUP BY \"Fingerprint\" ORDER BY count DESC LIMIT 10;" \
216+
> /tmp/cache-diagnostics/postgres_cache_fingerprints.txt 2>&1 || echo "Failed to query cache fingerprints" > /tmp/cache-diagnostics/postgres_cache_fingerprints.txt
217+
218+
echo "PostgreSQL tasks table (last 20):"
219+
cat /tmp/cache-diagnostics/postgres_tasks_table.txt
220+
221+
echo ""
222+
echo "Cache fingerprint counts:"
223+
cat /tmp/cache-diagnostics/postgres_cache_fingerprints.txt
224+
else
225+
echo "Postgres pod not found" > /tmp/cache-diagnostics/postgres_query_error.txt
226+
fi
227+
155228
- name: Collect pod logs
156229
if: always()
157230
run: |
158231
mkdir -p /tmp/kfp-logs
232+
233+
# Function to collect logs from all containers including init containers
234+
collect_pod_logs() {
235+
local namespace=$1
236+
local pod=$2
237+
local log_file="/tmp/kfp-logs/${pod}.log"
238+
239+
echo "===== Pod: ${pod} in ${namespace} =====" > "$log_file"
240+
241+
# Get init containers
242+
init_containers=$(kubectl get pod "$pod" -n "$namespace" -o jsonpath='{.spec.initContainers[*].name}' 2>/dev/null || echo "")
243+
244+
# Get regular containers
245+
containers=$(kubectl get pod "$pod" -n "$namespace" -o jsonpath='{.spec.containers[*].name}' 2>/dev/null || echo "")
246+
247+
# Collect init container logs
248+
if [[ -n "$init_containers" ]]; then
249+
for container in $init_containers; do
250+
echo "" >> "$log_file"
251+
echo "===== Init Container: $container =====" >> "$log_file"
252+
kubectl logs "$pod" -c "$container" -n "$namespace" >> "$log_file" 2>&1 || echo "No logs for init container $container" >> "$log_file"
253+
done
254+
fi
255+
256+
# Collect regular container logs
257+
if [[ -n "$containers" ]]; then
258+
for container in $containers; do
259+
echo "" >> "$log_file"
260+
echo "===== Container: $container =====" >> "$log_file"
261+
kubectl logs "$pod" -c "$container" -n "$namespace" >> "$log_file" 2>&1 || echo "No logs for container $container" >> "$log_file"
262+
done
263+
fi
264+
}
265+
159266
echo "Collecting logs from all pods in $NAMESPACE namespace..."
160267
for pod in $(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}'); do
161268
echo "Collecting logs for pod: $pod"
162-
kubectl logs -n "$NAMESPACE" "$pod" --all-containers=true > "/tmp/kfp-logs/${pod}.log" 2>&1 || true
269+
collect_pod_logs "$NAMESPACE" "$pod"
163270
done
271+
164272
echo "Collecting logs from all pods in $POSTGRES_NAMESPACE namespace..."
165273
for pod in $(kubectl get pods -n "$POSTGRES_NAMESPACE" -o jsonpath='{.items[*].metadata.name}'); do
166274
echo "Collecting logs for pod: $pod"
167-
kubectl logs -n "$POSTGRES_NAMESPACE" "$pod" --all-containers=true > "/tmp/kfp-logs/${pod}.log" 2>&1 || true
275+
collect_pod_logs "$POSTGRES_NAMESPACE" "$pod"
168276
done
169277
170278
- name: Cleanup
@@ -184,4 +292,6 @@ jobs:
184292
uses: actions/upload-artifact@v4
185293
with:
186294
name: api-server-postgres-test-artifacts-cache-${{ matrix.cache_enabled }}
187-
path: /tmp/kfp-logs/*
295+
path: |
296+
/tmp/kfp-logs/*
297+
/tmp/cache-diagnostics/*

0 commit comments

Comments
 (0)