Skip to content

Commit 7c74960

Browse files
authored
Merge pull request #2074 from clobrano/enhancement/sync-on-force-new-cluster-attribute-at-start
OCPBUGS-61117: podman-etcd: Add cluster-wide force_new_cluster attribute checking
2 parents 0716142 + 1afdd91 commit 7c74960

File tree

1 file changed

+72
-35
lines changed

1 file changed

+72
-35
lines changed

heartbeat/podman-etcd

Lines changed: 72 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -794,54 +794,72 @@ set_force_new_cluster()
794794
return $rc
795795
}
796796

797+
# get_force_new_cluster prints a space-separated list of the nodes that have
# the reboot-lifetime "force_new_cluster" attribute set.
# Return values:
# - Exit code 0 with non-empty output: One or more nodes have the force_new_cluster attribute set
# - Exit code 0 with empty output: No nodes have the force_new_cluster attribute set
# - Exit code 1: Error occurred while querying the cluster nodes
get_force_new_cluster()
{
	local node nodes node_list value rc
	local holders=""

	# Run crm_node outside of a pipeline: in "crm_node | awk" the exit
	# status would be awk's, hiding any crm_node failure. The status is
	# saved right away because "$?" inside "if ! cmd; then" is always 0.
	node_list=$(crm_node -l)
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not get force_new_cluster attribute, crm_node error code: $rc"
		return 1
	fi

	# The node name is the second column of each "crm_node -l" line.
	nodes=$(echo "$node_list" | awk '{print $2}')
	if [ -z "$nodes" ]; then
		ocf_log err "could not get force_new_cluster attribute, the list of nodes is empty"
		return 1
	fi

	for node in $nodes; do
		# The status checked below is the one of the whole pipeline (its
		# last command), not crm_attribute's own.
		# NOTE(review): crm_attribute presumably exits non-zero when the
		# attribute is simply not set on $node, so its status is left
		# unchecked on purpose and an empty value means "not set" - confirm.
		value=$(crm_attribute --query --lifetime reboot --name "force_new_cluster" --node "$node" 2>/dev/null | awk -F'value=' '{print $2}' | tr -d "'")
		rc=$?
		if [ $rc -ne 0 ]; then
			ocf_log err "could not get force_new_cluster attribute, crm_attribute error code: $rc"
			return 1
		fi
		if [ -n "$value" ]; then
			holders="$holders$node "
		fi
	done
	echo "$holders"
}
801827

828+
802829
# clear_force_new_cluster deletes the reboot-lifetime "force_new_cluster"
# attribute from the local node ($NODENAME).
# Return values:
# - $OCF_SUCCESS: the attribute was deleted, or it is not set on this node
# - $OCF_ERR_GENERIC: crm_attribute failed to delete the attribute
clear_force_new_cluster()
{
	local rc

	# only the holder of "force_new_cluster" attribute can delete it
	if ! is_force_new_cluster; then
		ocf_log info "force_new_cluster unset or not owned by $NODENAME"
		return $OCF_SUCCESS
	fi

	# Save the exit status before testing it: inside "if ! cmd; then" the
	# value of "$?" is the status of the negation, i.e. always 0.
	crm_attribute --delete --lifetime reboot --node "$NODENAME" --name "force_new_cluster"
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not clear force_new_cluster attribute, error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	ocf_log info "$NODENAME: force_new_cluster attribute cleared"
	return $OCF_SUCCESS
}
827845

846+
828847
# is_force_new_cluster tells whether the local node ($NODENAME) is one of the
# holders of the "force_new_cluster" attribute.
# Return 0 if 'force_new_cluster' is set on the current node, 1 otherwise.
# If the list of holders cannot be retrieved, the agent exits with
# $OCF_ERR_GENERIC.
is_force_new_cluster()
{
	local fnc_holders holder

	if ! fnc_holders=$(get_force_new_cluster); then
		ocf_exit_reason "is_force_new_cluster: Failed to get force_new_cluster node holders"
		exit $OCF_ERR_GENERIC
	fi

	# Compare whole names: a regex/word match (grep -w) would also accept
	# holders that merely contain $NODENAME next to a '-' or a '.',
	# e.g. "node1" against "node1-backup".
	for holder in $fnc_holders; do
		if [ "$holder" = "$NODENAME" ]; then
			ocf_log debug "$NODENAME has force_new_cluster set"
			return 0
		fi
	done

	ocf_log debug "force_new_cluster attribute is not set on $NODENAME"
	return 1
}
847865

@@ -1415,17 +1433,34 @@ podman_start()
14151433
return "$OCF_ERR_GENERIC"
14161434
fi
14171435

1418-
# force-new-cluster property is a runtime-scoped flag that instructs the agent to force a new cluster-of-1.
1419-
# Since this attribute is configured with a reboot-lifetime, it is automatically cleared when the machine reboots.
1420-
# If the agent detects during its start that this property is set, it indicates that the flag was explicitly set
1421-
# during the current node boot session, implying a deliberate request to recover the cluster.
14221436
if ocf_is_true "$pod_was_running"; then
14231437
ocf_log info "static pod was running: start normally"
14241438
else
1425-
if is_force_new_cluster; then
1426-
ocf_log notice "'$NODENAME' marked to force-new-cluster"
1439+
local fnc_holders
1440+
if ! fnc_holders=$(get_force_new_cluster); then
1441+
ocf_exit_reason "Failed to get force_new_cluster node holders"
1442+
return "$OCF_ERR_GENERIC"
1443+
fi
1444+
1445+
local fnc_holder_count
1446+
fnc_holder_count=$(echo "$fnc_holders" | wc -w)
1447+
if [ "$fnc_holder_count" -gt 1 ]; then
1448+
ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)"
1449+
return "$OCF_ERR_GENERIC"
1450+
fi
1451+
1452+
if [ "$fnc_holder_count" -eq 1 ]; then
1453+
if echo "$fnc_holders" | grep -q -w "$NODENAME"; then
1454+
# Attribute is set on the local node.
1455+
ocf_log notice "$NODENAME marked to force-new-cluster"
1456+
JOIN_AS_LEARNER=false
1457+
else
1458+
# Attribute is set on a peer node.
1459+
ocf_log info "$NODENAME shall join as learner because force_new_cluster is set on peer $fnc_holders"
1460+
JOIN_AS_LEARNER=true
1461+
fi
14271462
else
1428-
ocf_log info "'$NODENAME' is not marked to force-new-cluster"
1463+
ocf_log info "no node is marked to force-new-cluster"
14291464
# When the local agent starts, we can infer the cluster state by counting
14301465
# how many agents are starting or already active:
14311466
# - 1 active agent: it's the peer (we are just starting)
@@ -1522,7 +1557,7 @@ podman_start()
15221557
for try in $(seq $retries); do
15231558
learner_node=$(attribute_learner_node get)
15241559
if [ "$NODENAME" != "$learner_node" ]; then
1525-
ocf_log info "$learner_node is not in the member list yet. Retry in $poll_interval_sec seconds."
1560+
ocf_log info "$NODENAME is not in the member list yet. Retry in $poll_interval_sec seconds."
15261561
sleep $poll_interval_sec
15271562
continue
15281563
fi
@@ -1673,6 +1708,8 @@ podman_stop()
16731708
{
16741709
local timeout=60
16751710
local rc
1711+
1712+
ocf_log notice "podman-etcd stop"
16761713
podman_simple_status
16771714
if [ $? -eq $OCF_NOT_RUNNING ]; then
16781715
ocf_log info "could not leave members list: etcd container not running"

0 commit comments

Comments
 (0)