Skip to content

Commit de7c73a

Browse files
committed
OCPBUGS-42808: podman-etcd: add automatic learner member promotion
Automatically promote etcd learner members to voting members when detected. Includes refactored member management functions and improved validation.
1 parent a31f151 commit de7c73a

File tree

1 file changed

+79
-29
lines changed

1 file changed

+79
-29
lines changed

heartbeat/podman-etcd

Lines changed: 79 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,22 @@ attribute_node_revision_peer()
712712
crm_attribute --query --type nodes --node "$nodename" --name "revision" | awk -F"value=" '{print $2}'
713713
}
714714

715+
# Converts a positive decimal integer to lowercase hexadecimal
# Args: $1 - decimal number: digits only, no sign, no leading zero
#            (so "0", "", "-5", "0x1f", "012" and multi-line input are all rejected)
# Returns: OCF_SUCCESS on success, OCF_ERR_GENERIC on invalid or out-of-range input
# Outputs: hexadecimal representation (no "0x" prefix, no trailing newline) to stdout
decimal_to_hex() {
	local dec="$1"
	local hex

	# Match against the whole string instead of piping through grep: grep
	# tests each line separately, so a value such as "12<newline>34" would
	# pass an anchored "^[1-9][0-9]*$" check even though it is not a number.
	case "$dec" in
		""|0*|*[!0-9]*)
			ocf_log err "Invalid member ID format: '$dec' (expected decimal number)"
			return $OCF_ERR_GENERIC
			;;
	esac

	# printf saturates values that do not fit the shell's widest unsigned
	# integer (and may only warn about it), so convert the result back to
	# decimal and require that it matches the input exactly.
	hex=$(printf "%x" "$dec" 2>/dev/null)
	if [ -z "$hex" ] || [ "$(printf "%u" "0x$hex" 2>/dev/null)" != "$dec" ]; then
		ocf_log err "Invalid member ID: '$dec' is out of range for hexadecimal conversion"
		return $OCF_ERR_GENERIC
	fi

	printf "%s" "$hex"
	return $OCF_SUCCESS
}
730+
715731
attribute_node_member_id()
716732
{
717733
local action="$1"
@@ -737,16 +753,19 @@ attribute_node_member_id()
737753
return "$rc"
738754
fi
739755

740-
local value
756+
local value value_hex
741757
if ! value=$(echo -n "$member_list_json" | jq -r ".header.member_id"); then
742758
rc=$?
743759
ocf_log err "could not get $attribute from member list JSON, error code: $rc"
744760
return "$rc"
745761
fi
746762

747763
# JSON member_id is decimal, while etcdctl command needs the hex version
748-
value=$(printf "%x" "$value")
749-
if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then
764+
if ! value_hex=$(decimal_to_hex "$value"); then
765+
ocf_log err "could not convert decimal member_id '$value' to hex, error code: $?"
766+
return $OCF_ERR_GENERIC
767+
fi
768+
if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value_hex"; then
750769
rc=$?
751770
ocf_log err "could not update etcd $attribute, error code: $rc"
752771
return "$rc"
@@ -905,42 +924,70 @@ clear_standalone_node()
905924
crm_attribute --name "standalone_node" --delete
906925
}
907926

908-
clear_standalone_and_learner_if_not_learners()
927+
928+
# Promotes an etcd learner member to a voting member
# Args: $1 - learner member ID in decimal format
# Returns: OCF_SUCCESS (even on expected promotion failures), OCF_ERR_GENERIC on conversion errors
# Note: Promotion failures are expected and logged as info (peer may not be up-to-date)
promote_learner_member()
{
	local learner_member_id="$1"
	local learner_member_id_hex
	local rc

	# JSON member_id is decimal, while etcdctl command needs the hex version.
	# The status is captured right after the command: inside "if ! cmd; then"
	# $? holds the negated status (always 0), which made the logged code useless.
	learner_member_id_hex=$(decimal_to_hex "$learner_member_id")
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not convert decimal member_id '$learner_member_id' to hex, error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1
	rc=$?
	if [ $rc -ne 0 ]; then
		# promotion is expected to fail if the peer is not yet up-to-date,
		# so this is reported as info and not treated as an error
		ocf_log info "could not promote member $learner_member_id_hex, error code: $rc"
		return $OCF_SUCCESS
	fi

	ocf_log info "successfully promoted member '$learner_member_id_hex'"
	return $OCF_SUCCESS
}
949+
950+
# Reconciles etcd cluster member states
# If a learner member is present it is promoted; otherwise the standalone_node
# and learner_node attributes are cleared.
# Args: $1 - member list JSON from etcdctl
# Returns: OCF_SUCCESS on completion (including when there is nothing to do),
#          the jq exit code if the learner lookup fails,
#          the result of promote_learner_member when a learner is found,
#          OCF_ERR_GENERIC if an attribute cannot be cleared
# Note: Only operates when exactly 2 started members are present
reconcile_member_state()
{
	local rc
	local member_list_json="$1"
	local number_of_started_members
	local learner_member_id

	# count only the started members, which have the ".name" JSON field
	number_of_started_members=$(printf "%s" "$member_list_json" | jq -r ".members[].name | select(. != null)" | wc -l)
	if [ "$number_of_started_members" -ne 2 ]; then
		ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_started_members members, need 2"
		return $OCF_SUCCESS
	fi

	learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID")
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not get isLearner field from member list, error code: $rc"
		return $rc
	fi

	if [ -n "$learner_member_id" ]; then
		promote_learner_member "$learner_member_id"
		return $?
	fi

	# No learner left: the related CIB attributes can be reset.
	# Each status is saved immediately after the call; inside "if ! cmd; then"
	# $? would be the negated status (always 0) and the log would be misleading.
	clear_standalone_node
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not clear standalone_node attribute, error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	attribute_learner_node clear
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not clear learner_node attribute, error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	return $OCF_SUCCESS
}
945992

946993
attribute_learner_node()
@@ -1019,7 +1066,7 @@ detect_cluster_leadership_loss()
10191066
endpoint_status_json=$(get_endpoint_status_json)
10201067
ocf_log info "endpoint status: $endpoint_status_json"
10211068

1022-
count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
1069+
count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
10231070
if [ "$count_endpoints" -eq 1 ]; then
10241071
ocf_log info "one endpoint only: checking status errors"
10251072
endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors")
@@ -1037,11 +1084,14 @@ detect_cluster_leadership_loss()
10371084
return $OCF_SUCCESS
10381085
}
10391086

1087+
1088+
# Manages etcd peer membership by detecting and handling missing or rejoining peers
1089+
# Adds missing peers as learners and reconciles member states when peers rejoin
1090+
# Args: $1 - member list JSON from etcdctl
1091+
# Returns: OCF_SUCCESS on completion, OCF_ERR_GENERIC on errors
1092+
# Note: Iterates through all peer nodes to ensure proper cluster membership
10401093
manage_peer_membership()
10411094
{
1042-
# Read etcd member list to detect the status of the peer member.
1043-
# If the peer is missing from the member list, it will be added back as learner
1044-
# If the peer is back in the member list, we ensure that the related CIB attributes (standalone and learner_node) are reset
10451095
local member_list_json="$1"
10461096

10471097
# Example of .members[] instance fields in member list json format:
@@ -1066,14 +1116,14 @@ manage_peer_membership()
10661116

10671117
# Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name.
10681118
ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
1069-
id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID")
1070-
if [ -z "$id" ]; then
1119+
peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID")
1120+
if [ -z "$peer_member_id" ]; then
10711121
ocf_log info "$name is not in the members list"
10721122
add_member_as_learner "$name" "$ip"
10731123
set_standalone_node
10741124
else
10751125
ocf_log debug "$name is in the members list by IP: $ip"
1076-
clear_standalone_and_learner_if_not_learners "$member_list_json"
1126+
reconcile_member_state "$member_list_json"
10771127
fi
10781128
done
10791129
}

0 commit comments

Comments
 (0)