@@ -5,18 +5,84 @@ function syslog() {
55}
66
#######################################
# Wait until every non-completed pod in the fluid-system namespace is
# Running, printing periodic diagnostics while waiting.
#
# Arguments (both optional; the defaults keep the previous behavior):
#   $1 - maximum number of polls before giving up (default 360; with the
#        5 second sleep between polls that is about 30 minutes)
#   $2 - emit a status report every N polls (default 36, about 3 minutes)
# Outputs:
#   Progress and diagnostics on stdout and through syslog.
# Exits:
#   Terminates the whole script with status 1 when the timeout is reached.
#######################################
function check_control_plane_status() {
    local timeout=${1:-360}
    local status_interval=${2:-36}
    local counter=0
    local image_template pod_list pod_name pod_status pod all_pods
    local total_pods running_pods not_running_pods
    local -a problem_pods

    # A zero/negative interval would make the modulo below divide by zero.
    (( status_interval > 0 )) || status_interval=36

    echo "=== Unique image tags used by Fluid control plane ==="
    # Keep the template on one line: a newline inside the quotes would be
    # emitted literally by kubectl and show up as a bogus "tag".
    image_template='{range .items[*]}{range .spec.containers[*]}{.image}{"\n"}{end}{range .spec.initContainers[*]}{.image}{"\n"}{end}{end}'
    kubectl get pod -n fluid-system -o jsonpath="$image_template" \
        | sed 's/.*://' \
        | sort -u

    while true; do
        # Take ONE snapshot per poll and derive every number from it, so the
        # totals, the running count and the problem list always agree.
        pod_list=$(kubectl get pod -n fluid-system --no-headers)

        total_pods=0
        running_pods=0
        problem_pods=()
        # Columns of `kubectl get pod --no-headers`: NAME READY STATUS ...
        while read -r pod_name _ pod_status _; do
            # Skip the empty line a here-string yields for an empty snapshot,
            # and finished (Completed) pods, which are not expected to run.
            [[ -n "$pod_name" && "$pod_status" != "Completed" ]] || continue
            total_pods=$(( total_pods + 1 ))
            if [[ "$pod_status" == "Running" ]]; then
                running_pods=$(( running_pods + 1 ))
            else
                problem_pods+=("$pod_name")
            fi
        done <<<"$pod_list"
        not_running_pods=$(( total_pods - running_pods ))

        # Print status every status_interval polls
        if (( counter % status_interval == 0 )); then
            syslog "[Status Check $(( counter / status_interval ))] Pod status: $running_pods/$total_pods running ($not_running_pods not ready)"

            # Get details for non-running pods. The pods are addressed by the
            # names taken from the snapshot rather than by a
            # status.phase!=Running field selector: pods shown as e.g.
            # CrashLoopBackOff still have phase Running and would be missed.
            if (( not_running_pods > 0 )); then
                echo "=== Not running pods ==="
                # Diagnostics are best-effort; a pod may vanish between calls.
                kubectl get pods -n fluid-system "${problem_pods[@]}" \
                    -o=custom-columns='NAME:.metadata.name,STATUS:.status.phase,REASON:.status.reason' || true

                for pod in "${problem_pods[@]}"; do
                    echo "--- Events for $pod ---"
                    # Extract the Events section from the pod description
                    kubectl describe pod -n fluid-system "$pod" \
                        | awk '/Events:/,/^ *$/{if($0!~/^ *$/&&$0!~/Events:/)print}' || true
                done
            fi
        fi

        # Exit loop when all pods are running
        if (( total_pods != 0 && total_pods == running_pods )); then
            break
        fi

        # Handle timeout
        if (( counter >= timeout )); then
            syslog "Timeout waiting for control plane after $counter checks!"

            # Every command below is best-effort (|| true) so that one failing
            # diagnostic cannot suppress the remaining ones.
            echo "=== Final pod status ==="
            kubectl get pods -n fluid-system -o wide || true

            # Container logs (last 100 lines), one pod name per line
            all_pods=$(kubectl get pods -n fluid-system \
                -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') || true
            while IFS= read -r pod; do
                [[ -n "$pod" ]] || continue
                echo "--- Logs for $pod (last 100 lines) ---"
                kubectl logs -n fluid-system "$pod" --all-containers --tail=100 || true
            done <<<"$all_pods"

            # Additional diagnostics
            echo "=== Node resource usage ==="
            kubectl top nodes || true

            echo "=== Persistent volume claims ==="
            kubectl get pvc -n fluid-system || true

            echo "=== Fluid system events ==="
            kubectl get events -n fluid-system --sort-by=.metadata.creationTimestamp || true

            exit 1
        fi

        sleep 5
        # Not (( counter++ )): that returns status 1 when counter is 0, which
        # would abort the script if it runs under `set -e`.
        counter=$(( counter + 1 ))
    done
    syslog "Fluid control plane is ready after $counter checks!"
}
2187
2288function alluxio_e2e() {
0 commit comments