@@ -712,6 +712,22 @@ attribute_node_revision_peer()
712
712
crm_attribute --query --type nodes --node " $nodename " --name " revision" | awk -F" value=" ' {print $2}'
713
713
}
714
714
715
# Converts a decimal number to its lowercase hexadecimal representation.
# Args: $1 - decimal number made of digits only, without sign and without a
#            leading zero ("0", "007", "-5", "1a", the empty string and
#            multi-line values are all rejected)
# Returns: OCF_SUCCESS on success, OCF_ERR_GENERIC on invalid input or when
#          printf is unable to convert the value
# Outputs: hexadecimal representation (no "0x" prefix, no trailing newline) to
#          stdout; nothing is written to stdout on failure
decimal_to_hex()
{
	local dec="$1"
	local hex

	# Match against the whole string: a line-oriented matcher (grep) accepts a
	# multi-line value as soon as any single line looks like a number.
	# A leading zero is refused because printf would parse it as octal.
	case "$dec" in
		""|0*|*[!0-9]*)
			ocf_log err "Invalid member ID format: '$dec' (expected decimal number)"
			return $OCF_ERR_GENERIC
			;;
	esac

	# Capture the result first so a failed conversion (e.g. a value too large
	# for printf) is reported as an error instead of leaking partial output.
	if ! hex=$(printf "%x" "$dec" 2>/dev/null); then
		ocf_log err "could not convert '$dec' to hexadecimal (value out of range?)"
		return $OCF_ERR_GENERIC
	fi

	printf "%s" "$hex"
	return $OCF_SUCCESS
}
730
+
715
731
attribute_node_member_id ()
716
732
{
717
733
local action=" $1 "
@@ -737,16 +753,19 @@ attribute_node_member_id()
737
753
return " $rc "
738
754
fi
739
755
740
- local value
756
+ local value value_hex
741
757
if ! value=$( echo -n " $member_list_json " | jq -r " .header.member_id" ) ; then
742
758
rc=$?
743
759
ocf_log err " could not get $attribute from member list JSON, error code: $rc "
744
760
return " $rc "
745
761
fi
746
762
747
763
# JSON member_id is decimal, while etcdctl command needs the hex version
748
- value=$( printf " %x" " $value " )
749
- if ! crm_attribute --type nodes --node " $NODENAME " --name " $attribute " --update " $value " ; then
764
+ if ! value_hex=$( decimal_to_hex " $value " ) ; then
765
+ ocf_log err " could not convert decimal member_id '$value ' to hex, error code: $? "
766
+ return $OCF_ERR_GENERIC
767
+ fi
768
+ if ! crm_attribute --type nodes --node " $NODENAME " --name " $attribute " --update " $value_hex " ; then
750
769
rc=$?
751
770
ocf_log err " could not update etcd $attribute , error code: $rc "
752
771
return " $rc "
@@ -905,42 +924,70 @@ clear_standalone_node()
905
924
crm_attribute --name " standalone_node" --delete
906
925
}
907
926
908
- clear_standalone_and_learner_if_not_learners ()
927
+
928
# Promotes an etcd learner member to a voting member.
# Args: $1 - learner member ID in decimal format
# Returns: OCF_SUCCESS when the promotion succeeds and also when etcdctl
#          refuses it; OCF_ERR_GENERIC when the ID cannot be converted to hex
# Note: a refused promotion is only logged at info level, because it is
#       expected while the peer is not yet up-to-date
promote_learner_member()
{
	local learner_member_id="$1"
	local learner_member_id_hex
	local rc

	# JSON member_id is decimal, while etcdctl command needs the hex version.
	# The exit code is saved right away: inside "if ! cmd; then" the value of
	# $? is the status of the negated pipeline, i.e. always 0.
	learner_member_id_hex=$(decimal_to_hex "$learner_member_id")
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not convert decimal member_id '$learner_member_id' to hex, error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1
	rc=$?
	if [ $rc -ne 0 ]; then
		# promotion is expected to fail if the peer is not yet up-to-date
		ocf_log info "could not promote member $learner_member_id_hex, error code: $rc"
		return $OCF_SUCCESS
	fi

	ocf_log info "successfully promoted member '$learner_member_id_hex'"
	return $OCF_SUCCESS
}
949
+
950
# Reconciles etcd cluster member states.
# When the member list contains a learner, tries to promote it; otherwise
# clears the standalone_node and learner_node attributes.
# Args: $1 - member list JSON from etcdctl
# Returns: OCF_SUCCESS when there is nothing to do or the work completed,
#          the jq exit code when the learner lookup fails,
#          the result of promote_learner_member when a learner is present,
#          OCF_ERR_GENERIC when one of the attributes cannot be cleared
# Note: Only operates when exactly 2 started members are present
reconcile_member_state()
{
	local rc
	local member_list_json="$1"
	local number_of_started_members
	local learner_member_id

	# count only the started members, which have the ".name" JSON field
	number_of_started_members=$(printf "%s" "$member_list_json" | jq -r ".members[].name | select(. != null)" | wc -l)
	if [ "$number_of_started_members" -ne 2 ]; then
		ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_started_members members, need 2"
		return $OCF_SUCCESS
	fi

	learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID")
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not get isLearner field from member list, error code: $rc"
		return $rc
	fi

	# a learner is present: try to turn it into a voting member and stop here
	if [ -n "$learner_member_id" ]; then
		promote_learner_member "$learner_member_id"
		return $?
	fi

	# No learner left: reset both attributes. Exit codes are saved explicitly,
	# since $? inside "if ! cmd; then" is always 0.
	clear_standalone_node
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not clear standalone_node attribute, error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	attribute_learner_node clear
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not clear learner_node attribute, error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	return $OCF_SUCCESS
}
945
992
946
993
attribute_learner_node ()
@@ -1019,7 +1066,7 @@ detect_cluster_leadership_loss()
1019
1066
endpoint_status_json=$( get_endpoint_status_json)
1020
1067
ocf_log info " endpoint status: $endpoint_status_json "
1021
1068
1022
- count_endpoints=$( printf " %s" " $endpoint_status_json " | jq -r " .[].Endpoint" | wc -l)
1069
+ count_endpoints=$( printf " %s" " $endpoint_status_json " | jq -r " .[].Endpoint" | wc -l)
1023
1070
if [ " $count_endpoints " -eq 1 ]; then
1024
1071
ocf_log info " one endpoint only: checking status errors"
1025
1072
endpoint_status_errors=$( printf " %s" " $endpoint_status_json " | jq -r " .[0].Status.errors" )
@@ -1037,11 +1084,14 @@ detect_cluster_leadership_loss()
1037
1084
return $OCF_SUCCESS
1038
1085
}
1039
1086
1087
+
1088
+ # Manages etcd peer membership by detecting and handling missing or rejoining peers
1089
+ # Adds missing peers as learners and reconciles member states when peers rejoin
1090
+ # Args: $1 - member list JSON from etcdctl
1091
+ # Returns: OCF_SUCCESS on completion, OCF_ERR_GENERIC on errors
1092
+ # Note: Iterates through all peer nodes to ensure proper cluster membership
1040
1093
manage_peer_membership ()
1041
1094
{
1042
- # Read etcd member list to detect the status of the peer member.
1043
- # If the peer is missing from the member list, it will be added back as learner
1044
- # If the peer is back in the member list, we ensure that the related CIB attributes (standalone and learner_node) are reset
1045
1095
local member_list_json=" $1 "
1046
1096
1047
1097
# Example of .members[] instance fields in member list json format:
@@ -1066,14 +1116,14 @@ manage_peer_membership()
1066
1116
1067
1117
# Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name.
1068
1118
ip=$( echo " $node " | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
1069
- id =$( printf " %s" " $member_list_json " | jq -r " .members[] | select( .peerURLs | map(test(\" $ip \" )) | any).ID" )
1070
- if [ -z " $id " ]; then
1119
+ peer_member_id =$( printf " %s" " $member_list_json " | jq -r " .members[] | select( .peerURLs | map(test(\" $ip \" )) | any).ID" )
1120
+ if [ -z " $peer_member_id " ]; then
1071
1121
ocf_log info " $name is not in the members list"
1072
1122
add_member_as_learner " $name " " $ip "
1073
1123
set_standalone_node
1074
1124
else
1075
1125
ocf_log debug " $name is in the members list by IP: $ip "
1076
- clear_standalone_and_learner_if_not_learners " $member_list_json "
1126
+ reconcile_member_state " $member_list_json "
1077
1127
fi
1078
1128
done
1079
1129
}
0 commit comments