@@ -794,54 +794,72 @@ set_force_new_cluster()
794
794
return $rc
795
795
}
796
796
797
# get_force_new_cluster prints a space-separated list of the nodes that have
# the reboot-lifetime force_new_cluster attribute set.
# Return values:
# - Exit code 0 with non-empty output: one or more nodes have the attribute set
# - Exit code 0 with empty output: no node has the attribute set
# - Exit code 1 with empty output: the list of cluster nodes could not be
#   retrieved (crm_node failed or reported no node)
get_force_new_cluster()
{
	local node nodes node_list value rc
	local holders=""

	# Run crm_node on its own: in a pipeline the exit status is the one of
	# the last command (awk), so a crm_node failure would go unnoticed.
	node_list=$(crm_node -l)
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not get force_new_cluster attribute, crm_node error code: $rc"
		return 1
	fi

	# The node name is the second column of each "crm_node -l" line.
	nodes=$(printf '%s\n' "$node_list" | awk '{print $2}')
	if [ -z "$nodes" ]; then
		ocf_log err "could not get force_new_cluster attribute, the list of nodes is empty"
		return 1
	fi

	for node in $nodes; do
		# The status of this pipeline is tr's, not crm_attribute's, so it
		# cannot be used to detect a failed query: an empty value is the
		# only signal, and it means "not set on this node".
		value=$(crm_attribute --query --lifetime reboot --name "force_new_cluster" --node "$node" 2>/dev/null \
			| awk -F'value=' '{print $2}' \
			| tr -d "'")
		if [ -n "$value" ]; then
			# Join with single spaces, without a trailing separator.
			holders="${holders:+$holders }$node"
		fi
	done

	echo "$holders"
}
801
827
828
+
802
829
# clear_force_new_cluster deletes the reboot-lifetime force_new_cluster
# attribute of the local node ($NODENAME).
# Return values:
# - $OCF_SUCCESS: the attribute was deleted, or it is not set on the local node
# - $OCF_ERR_GENERIC: crm_attribute failed to delete the attribute
clear_force_new_cluster()
{
	local rc

	# only the holder of "force_new_cluster" attribute can delete it
	if ! is_force_new_cluster; then
		ocf_log info "force_new_cluster unset or not owned by $NODENAME"
		return "$OCF_SUCCESS"
	fi

	# Save the status right away: inside the body of an "if ! cmd" statement
	# $? is always 0, which would hide the real error code from the log.
	crm_attribute --delete --lifetime reboot --node "$NODENAME" --name "force_new_cluster"
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not clear force_new_cluster attribute, error code: $rc"
		return "$OCF_ERR_GENERIC"
	fi

	ocf_log info "$NODENAME: force_new_cluster attribute cleared"
	return "$OCF_SUCCESS"
}
827
845
846
+
828
847
# is_force_new_cluster tells whether the force_new_cluster attribute is set
# on the current node ($NODENAME).
# Return values:
# - 0: $NODENAME is one of the holders printed by get_force_new_cluster
# - 1: $NODENAME is not one of the holders
# Exits with $OCF_ERR_GENERIC when get_force_new_cluster fails.
is_force_new_cluster()
{
	local fnc_holders holder

	if ! fnc_holders=$(get_force_new_cluster); then
		ocf_exit_reason "is_force_new_cluster: Failed to get force_new_cluster node holders"
		exit "$OCF_ERR_GENERIC"
	fi

	# Compare whole node names. "grep -w" would read $NODENAME as a regular
	# expression and accept '.' and '-' as word boundaries, so "master-0"
	# would also match a holder named "master-0.example.com".
	for holder in $fnc_holders; do
		if [ "$holder" = "$NODENAME" ]; then
			ocf_log debug "$NODENAME has force_new_cluster set"
			return 0
		fi
	done

	ocf_log debug "force_new_cluster attribute is not set on $NODENAME"
	return 1
}
847
865
@@ -1415,17 +1433,34 @@ podman_start()
1415
1433
return " $OCF_ERR_GENERIC "
1416
1434
fi
1417
1435
1418
- # force-new-cluster property is a runtime-scoped flag that instructs the agent to force a new cluster-of-1.
1419
- # Since this attribute is configured with a reboot-lifetime, it is automatically cleared when the machine reboots.
1420
- # If the agent detects during its start that this property is set, it indicates that the flag was explicitly set
1421
- # during the current node boot session, implying a deliberate request to recover the cluster.
1422
1436
if ocf_is_true " $pod_was_running " ; then
1423
1437
ocf_log info " static pod was running: start normally"
1424
1438
else
1425
- if is_force_new_cluster; then
1426
- ocf_log notice " '$NODENAME ' marked to force-new-cluster"
1439
+ local fnc_holders
1440
+ if ! fnc_holders=$( get_force_new_cluster) ; then
1441
+ ocf_exit_reason " Failed to get force_new_cluster node holders"
1442
+ return " $OCF_ERR_GENERIC "
1443
+ fi
1444
+
1445
+ local fnc_holder_count
1446
+ fnc_holder_count=$( echo " $fnc_holders " | wc -w)
1447
+ if [ " $fnc_holder_count " -gt 1 ]; then
1448
+ ocf_exit_reason " force_new_cluster attribute is set on multiple nodes ($fnc_holders )"
1449
+ return " $OCF_ERR_GENERIC "
1450
+ fi
1451
+
1452
+ if [ " $fnc_holder_count " -eq 1 ]; then
1453
+ if echo " $fnc_holders " | grep -q -w " $NODENAME " ; then
1454
+ # Attribute is set on the local node.
1455
+ ocf_log notice " $NODENAME marked to force-new-cluster"
1456
+ JOIN_AS_LEARNER=false
1457
+ else
1458
+ # Attribute is set on a peer node.
1459
+ ocf_log info " $NODENAME shall join as learner because force_new_cluster is set on peer $fnc_holders "
1460
+ JOIN_AS_LEARNER=true
1461
+ fi
1427
1462
else
1428
- ocf_log info " ' $NODENAME ' is not marked to force-new-cluster"
1463
+ ocf_log info " no node is marked to force-new-cluster"
1429
1464
# When the local agent starts, we can infer the cluster state by counting
1430
1465
# how many agents are starting or already active:
1431
1466
# - 1 active agent: it's the peer (we are just starting)
@@ -1522,7 +1557,7 @@ podman_start()
1522
1557
for try in $( seq $retries ) ; do
1523
1558
learner_node=$( attribute_learner_node get)
1524
1559
if [ " $NODENAME " != " $learner_node " ]; then
1525
- ocf_log info " $learner_node is not in the member list yet. Retry in $poll_interval_sec seconds."
1560
+ ocf_log info " $NODENAME is not in the member list yet. Retry in $poll_interval_sec seconds."
1526
1561
sleep $poll_interval_sec
1527
1562
continue
1528
1563
fi
@@ -1673,6 +1708,8 @@ podman_stop()
1673
1708
{
1674
1709
local timeout=60
1675
1710
local rc
1711
+
1712
+ ocf_log notice " podman-etcd stop"
1676
1713
podman_simple_status
1677
1714
if [ $? -eq $OCF_NOT_RUNNING ]; then
1678
1715
ocf_log info " could not leave members list: etcd container not running"
0 commit comments