Skip to content

Commit 0e0a907

Browse files
authored
Add debug info for check_control_plane_status (fluid-cloudnative#5286)
Signed-off-by: cheyang <cheyang@163.com>
1 parent 51689b1 commit 0e0a907

File tree

1 file changed

+70
-4
lines changed

1 file changed

+70
-4
lines changed

.github/scripts/gha-e2e.sh

Lines changed: 70 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,84 @@ function syslog() {
55
}
66

77
#######################################
# Block until every non-Completed pod in the fluid-system namespace is
# reported as Running, printing periodic diagnostics while waiting.
#
# Polls every 5 seconds for at most 360 checks (30 minutes). Every 36 checks
# (3 minutes) a status line is logged and, if some pods are not running,
# those pods and their events are listed. On timeout a diagnostic dump
# (pod status, logs, node usage, PVCs, events) is printed and the function
# calls `exit 1`.
#
# Globals:   none written
# Arguments: none
# Outputs:   progress and diagnostics on stdout (status lines via syslog)
# Returns:   0 once the control plane is ready; calls `exit 1` on timeout
#######################################
function check_control_plane_status() {
    # Poll budget: 360 checks * 5s sleep = 30 minutes.
    local timeout=360
    local counter=0
    # Status report interval: 36 checks * 5s = 3 minutes.
    local status_interval=36
    # Only pods whose phase is neither Running nor Succeeded are "problem"
    # pods; Succeeded (Completed) pods are excluded from the counters below,
    # so they must be excluded from the diagnostics as well.
    local problem_selector='status.phase!=Running,status.phase!=Succeeded'
    local pod_list total_pods running_pods not_running_pods
    local names pod
    local -a problem_pods=()
    local -a all_pods=()

    echo "=== Unique image tags used by Fluid control plane ==="
    # The jsonpath template is kept on a single line: any literal newline in
    # the template is echoed verbatim and shows up as a blank entry.
    kubectl get pod -n fluid-system \
        -o jsonpath='{range .items[*]}{range .spec.containers[*]}{.image}{"\n"}{end}{range .spec.initContainers[*]}{.image}{"\n"}{end}{end}' \
        | sed 's/.*://' \
        | sort -u

    while true; do
        # Take ONE snapshot per iteration so that both counters describe the
        # same moment in time. A failed kubectl call counts as "no pods yet".
        pod_list=$(kubectl get pod -n fluid-system --no-headers) || pod_list=""
        # awk always exits 0 and prints 0 for "no match" (unlike grep -c,
        # whose exit status is 1 when the count is zero).
        total_pods=$(awk 'NF && !/Completed/ { n++ } END { print n + 0 }' <<<"$pod_list")
        running_pods=$(awk '/Running/ { n++ } END { print n + 0 }' <<<"$pod_list")
        not_running_pods=$((total_pods - running_pods))

        # Print status every 3 minutes (including the very first check).
        if ((counter % status_interval == 0)); then
            syslog "[Status Check $((counter / status_interval))] Pod status: $running_pods/$total_pods running ($not_running_pods not ready)"

            # Get details for non-running pods. Everything in this branch is
            # best-effort diagnostics: a failure must not end the wait loop.
            if ((not_running_pods > 0)); then
                echo "=== Not running pods ==="
                kubectl get pods -n fluid-system \
                    --field-selector="$problem_selector" \
                    -o=custom-columns='NAME:.metadata.name,STATUS:.status.phase,REASON:.status.reason' || true

                # Get events for problem pods (names arrive space-separated).
                names=$(kubectl get pods -n fluid-system \
                    --field-selector="$problem_selector" \
                    -o=jsonpath='{.items[*].metadata.name}') || names=""
                read -r -a problem_pods <<<"$names" || true

                for pod in "${problem_pods[@]}"; do
                    echo "--- Events for $pod ---"
                    # Print only the lines between "Events:" and the next
                    # blank line of the pod description.
                    kubectl describe pod -n fluid-system "$pod" \
                        | awk '/Events:/,/^ *$/{if($0!~/^ *$/&&$0!~/Events:/)print}' || true
                done
            fi
        fi

        # Exit loop when at least one pod exists and all of them are running.
        if ((total_pods != 0 && total_pods == running_pods)); then
            break
        fi

        # Handle timeout after 30 minutes.
        if ((counter >= timeout)); then
            syslog "Timeout waiting for control plane after $counter checks!"

            # Every command below is best-effort: one failing diagnostic
            # (e.g. `kubectl top` without metrics-server) must not prevent
            # the remaining ones from being collected.
            echo "=== Final pod status ==="
            kubectl get pods -n fluid-system -o wide || true

            # Container logs (last 100 lines).
            names=$(kubectl get pods -n fluid-system \
                -o jsonpath='{.items[*].metadata.name}') || names=""
            read -r -a all_pods <<<"$names" || true
            for pod in "${all_pods[@]}"; do
                echo "--- Logs for $pod (last 100 lines) ---"
                kubectl logs -n fluid-system "$pod" --all-containers --tail=100 || true
            done

            # Additional diagnostics.
            echo "=== Node resource usage ==="
            kubectl top nodes || true

            echo "=== Persistent volume claims ==="
            kubectl get pvc -n fluid-system || true

            echo "=== Fluid system events ==="
            kubectl get events -n fluid-system --sort-by=.metadata.creationTimestamp || true

            exit 1
        fi

        sleep 5
        # Not ((counter++)): that form returns status 1 when counter is 0.
        counter=$((counter + 1))
    done
    syslog "Fluid control plane is ready after $counter checks!"
}
2187

2288
function alluxio_e2e() {

0 commit comments

Comments
 (0)