Skip to content

Commit a31f151

Browse files
committed
Refactor(podman-etcd): improve peer checking and leadership loss detection
The check_peers function is split into smaller, more manageable functions: detect_cluster_leadership_loss holds the logic for detecting a loss of cluster leadership, and manage_peer_membership holds the logic for managing peer membership. The entry point is renamed from check_peers to check_peer because there is only one peer to check.
1 parent 7c74960 commit a31f151

File tree

1 file changed

+45
-33
lines changed

1 file changed

+45
-33
lines changed

heartbeat/podman-etcd

Lines changed: 45 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,42 +1014,35 @@ get_member_list_json() {
10141014
podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json
10151015
}
10161016

1017-
check_peers()
1017+
# Inspect the endpoint status to detect that the cluster has lost its leader.
#
# Reads the endpoint status JSON from get_endpoint_status_json and logs it.
# When exactly one endpoint is reported, its .Status.errors field is examined:
#   - if it contains "no leader": set_force_new_cluster and set_standalone_node
#     are called, an exit reason is recorded and $OCF_ERR_GENERIC is returned;
#   - any other value different from "null" is only logged at err level.
# In every other case the function returns $OCF_SUCCESS.
#
# NOTE(review): endpoint_status_json, count_endpoints and endpoint_status_errors
# are not declared local here, so they are written in the caller's scope.
detect_cluster_leadership_loss()
10181018
{
1019-
# Check peers endpoint status and locally accessible member list
1020-
local member_list_json
1021-
1022-
if ! container_exists; then
1023-
# we need a running container to execute etcdctl.
1024-
return $OCF_SUCCESS
1019+
endpoint_status_json=$(get_endpoint_status_json)
1020+
ocf_log info "endpoint status: $endpoint_status_json"
1021+
1022+
# jq prints one .Endpoint value per line, so the line count is the number of
# endpoints present in the status JSON.
count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
1023+
if [ "$count_endpoints" -eq 1 ]; then
1024+
ocf_log info "one endpoint only: checking status errors"
1025+
endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors")
1026+
if echo "$endpoint_status_errors" | grep -q "no leader"; then
1027+
set_force_new_cluster
1028+
set_standalone_node
1029+
ocf_exit_reason "$NODENAME must force a new cluster"
1030+
return $OCF_ERR_GENERIC
1031+
fi
1032+
# jq prints the string "null" when .Status.errors is absent; anything else is
# an error this function does not act on, so it is only reported.
if [ "$endpoint_status_errors" != "null" ]; then
1033+
ocf_log err "unmanaged endpoint status error: $endpoint_status_errors"
1034+
fi
10251035
fi
10261036

1027-
member_list_json=$(get_member_list_json)
1028-
rc=$?
1029-
ocf_log debug "member list: $member_list_json"
1030-
if [ $rc -ne 0 ]; then
1031-
ocf_log info "podman failed to get member list, error code: $rc"
1032-
1033-
endpoint_status_json=$(get_endpoint_status_json)
1034-
ocf_log info "endpoint status: $endpoint_status_json"
1035-
1036-
count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
1037-
if [ "$count_endpoints" -eq 1 ]; then
1038-
ocf_log info "one endpoint only: checking status errors"
1039-
endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors")
1040-
if echo "$endpoint_status_errors" | grep -q "no leader"; then
1041-
set_force_new_cluster
1042-
set_standalone_node
1043-
ocf_exit_reason "$NODENAME must force a new cluster"
1044-
return $OCF_ERR_GENERIC
1045-
fi
1046-
if [ "$endpoint_status_errors" != "null" ]; then
1047-
ocf_log err "unmanaged endpoint status error: $endpoint_status_errors"
1048-
fi
1049-
fi
1037+
return $OCF_SUCCESS
1038+
}
10501039

1051-
return $OCF_SUCCESS
1052-
fi
1040+
manage_peer_membership()
1041+
{
1042+
# Read etcd member list to detect the status of the peer member.
1043+
# If the peer is missing from the member list, it will be added back as learner
1044+
# If the peer is back in the member list, we ensure that the related CIB attributes (standalone and learner_node) are reset
1045+
local member_list_json="$1"
10531046

10541047
# Example of .members[] instance fields in member list json format:
10551048
# NOTE that "name" is present in voting members only, while "isLearner" in learner members only
@@ -1083,6 +1076,25 @@ check_peers()
10831076
clear_standalone_and_learner_if_not_learners "$member_list_json"
10841077
fi
10851078
done
1079+
}
1080+
1081+
check_peer()
{
	# Check the peer through the member list reachable from the local etcd
	# container.
	#
	# Returns:
	#   $OCF_SUCCESS when no container exists (etcdctl cannot be executed), or
	#   after manage_peer_membership has processed the member list.
	#   When the member list cannot be read, the return code of
	#   detect_cluster_leadership_loss.
	local member_list_json
	local rc

	# we need a running container to execute etcdctl.
	if ! container_exists; then
		return $OCF_SUCCESS
	fi

	# Capture the exit status right after the assignment: inside
	# "if ! cmd; then", $? holds the status of the negated condition (always
	# 0), not the status of cmd, so the log would always report "error code: 0".
	member_list_json=$(get_member_list_json)
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log info "podman failed to get member list, error code: $rc"
		detect_cluster_leadership_loss
		return $?
	fi

	manage_peer_membership "$member_list_json"
	return $OCF_SUCCESS
}
10881100

@@ -1124,7 +1136,7 @@ podman_monitor()
11241136
# monitor operation to fail.
11251137
# TODO: move this inside check_peer where we already query member list json
11261138
attribute_node_member_id update
1127-
if ! check_peers; then
1139+
if ! check_peer; then
11281140
return $OCF_ERR_GENERIC
11291141
fi
11301142

0 commit comments

Comments
 (0)