@@ -641,11 +641,19 @@ etcd_pod_container_exists() {
641
641
642
642
attribute_node_cluster_id ()
643
643
{
644
+ # Get or update local cluster_id from revision.json file.
645
+ # Fails if file is missing, or the cluster_id can not be parsed from it.
644
646
local action=" $1 "
645
647
local value
646
- if ! value=$( jq -r " .clusterId" /var/lib/etcd/revision.json) ; then
648
+
649
+ if [ ! -f " $REV_JSON " ]; then
650
+ ocf_log warn " could not '$action ' cluster_id: revision.json not found"
651
+ return $OCF_ERR_GENERIC
652
+ fi
653
+
654
+ if ! value=$( jq -r " .clusterId" " $REV_JSON " ) ; then
647
655
rc=$?
648
- ocf_log err " could not get cluster_id, error code: $rc "
656
+ ocf_log err " could not parse cluster_id, error code: $rc "
649
657
return " $rc "
650
658
fi
651
659
@@ -654,7 +662,7 @@ attribute_node_cluster_id()
654
662
echo " $value "
655
663
;;
656
664
update)
657
- if ! crm_attribute --type nodes --node " $NODENAME " --name " cluster_id" --update " $value " ; then
665
+ if ! crm_attribute --lifetime reboot -- type nodes --node " $NODENAME " --name " cluster_id" --update " $value " ; then
658
666
rc=$?
659
667
ocf_log err " could not update cluster_id, error code: $rc "
660
668
return " $rc "
@@ -670,20 +678,50 @@ attribute_node_cluster_id()
670
678
attribute_node_cluster_id_peer()
{
	# Print the peer node's cluster_id as read from the CIB node attributes.
	# The query is attempted up to $CIB_MAX_RETRIES times, sleeping
	# $CIB_RETRY_DELAY seconds between attempts.
	# Outputs: the cluster_id on stdout when a usable value is found.
	# Returns: 0 on success, $OCF_ERR_GENERIC when no usable value was
	# obtained after all attempts.
	local nodename
	local value
	local retries=0

	nodename=$(get_peer_node_name)

	while [ "$retries" -lt "$CIB_MAX_RETRIES" ]; do
		# The exit status of this pipeline is awk's, not crm_attribute's, so a
		# failed query is only detectable through the value checks below.
		if value=$(crm_attribute --query --lifetime reboot --type nodes --node "$nodename" --name "cluster_id" 2>/dev/null | awk -F"value=" '{print $2}'); then
			# "null" is what jq prints for a missing key; "(null)" is how
			# crm_attribute may render an attribute that has no value.
			# Neither is a usable cluster_id.
			if [ -n "$value" ] && [ "$value" != "null" ] && [ "$value" != "(null)" ]; then
				echo "$value"
				return 0
			fi
		fi

		retries=$((retries + 1))
		# Only announce and wait if another attempt will actually follow
		if [ "$retries" -lt "$CIB_MAX_RETRIES" ]; then
			ocf_log info "peer cluster_id not available yet, retrying in ${CIB_RETRY_DELAY}s (attempt $retries/$CIB_MAX_RETRIES)"
			sleep "$CIB_RETRY_DELAY"
		fi
	done

	ocf_log warn "peer cluster_id not available after $CIB_MAX_RETRIES retries"
	return "$OCF_ERR_GENERIC"
}
677
704
678
705
attribute_node_revision ()
679
706
{
707
+ # Get or update local revision from revision.json file.
708
+ # Fails if file is missing, or the revision can not be parsed from it.
680
709
local action=" $1 "
681
710
local value
682
- local attribute=" revision"
683
711
684
- if ! value=$( jq -r " .maxRaftIndex" /var/lib/etcd/revision.json) ; then
712
+ if [ " $action " != " get" ] && [ " $action " != " update" ]; then
713
+ ocf_log err " unsupported action: '$action ' for attribute_node_revision"
714
+ return " $OCF_ERR_GENERIC "
715
+ fi
716
+
717
+ if [ ! -f " $REV_JSON " ]; then
718
+ ocf_log warn " could not '$action ' revision: revision.json not found"
719
+ return $OCF_ERR_GENERIC
720
+ fi
721
+
722
+ if ! value=$( jq -r " .maxRaftIndex" " $REV_JSON " ) ; then
685
723
rc=$?
686
- ocf_log err " could not get $attribute , error code: $rc "
724
+ ocf_log err " could not parse maxRaftIndex from existing revision.json , error code: $rc "
687
725
return " $rc "
688
726
fi
689
727
@@ -692,24 +730,40 @@ attribute_node_revision()
692
730
echo " $value "
693
731
;;
694
732
update)
695
- if ! crm_attribute --type nodes --node " $NODENAME " --name " $attribute " --update " $value " ; then
733
+ if ! crm_attribute --lifetime reboot -- type nodes --node " $NODENAME " --name " revision " --update " $value " ; then
696
734
rc=$?
697
- ocf_log err " could not update etcd $ revision , error code: $rc "
735
+ ocf_log err " could not update etcd revision, error code: $rc "
698
736
return " $rc "
699
737
fi
700
738
;;
701
- * )
702
- ocf_log err " unsupported $action for attribute_node_revision"
703
- return " $OCF_ERR_GENERIC "
704
- ;;
705
739
esac
706
740
}
707
741
708
742
attribute_node_revision_peer()
{
	# Print the peer node's etcd revision as read from the CIB node attributes.
	# The query is attempted up to $CIB_MAX_RETRIES times, sleeping
	# $CIB_RETRY_DELAY seconds between attempts.
	# Outputs: the revision on stdout when a usable value is found.
	# Returns: 0 on success, $OCF_ERR_GENERIC when no usable value was
	# obtained after all attempts.
	local nodename
	local value
	local retries=0

	nodename=$(get_peer_node_name)

	while [ "$retries" -lt "$CIB_MAX_RETRIES" ]; do
		# The exit status of this pipeline is awk's, not crm_attribute's, so a
		# failed query is only detectable through the value checks below.
		if value=$(crm_attribute --query --lifetime reboot --type nodes --node "$nodename" --name "revision" 2>/dev/null | awk -F"value=" '{print $2}'); then
			# "null" is what jq prints for a missing key; "(null)" is how
			# crm_attribute may render an attribute that has no value.
			# Neither is a usable revision (callers compare it numerically).
			if [ -n "$value" ] && [ "$value" != "null" ] && [ "$value" != "(null)" ]; then
				echo "$value"
				return 0
			fi
		fi

		retries=$((retries + 1))
		# Only announce and wait if another attempt will actually follow
		if [ "$retries" -lt "$CIB_MAX_RETRIES" ]; then
			ocf_log info "peer revision not available yet, retrying in ${CIB_RETRY_DELAY}s (attempt $retries/$CIB_MAX_RETRIES)"
			sleep "$CIB_RETRY_DELAY"
		fi
	done

	ocf_log warn "peer revision not available after $CIB_MAX_RETRIES retries"
	return "$OCF_ERR_GENERIC"
}
714
768
715
769
attribute_node_member_id ()
@@ -1248,20 +1302,30 @@ run_new_container()
1248
1302
1249
1303
compare_revision ()
1250
1304
{
1251
- # Compare local revision (from disk) against peer revision (from CIB).
1252
- # returns "older", "equal" or "newer"
1305
+ # Compare local revision (from disk) against peer revision (from CIB), returning "older", "equal", or "newer" accordingly.
1306
+ # If local revision is missing, but peer revision exists, returns "older" to allow starting as learner, assuming that
1307
+ # the lack of local revision means the etcd member was a learner in the previous lifecycle.
1308
+ # Fails otherwise.
1253
1309
local revision
1254
1310
local peer_node_name
1255
1311
local peer_revision
1256
1312
1257
- revision=$( attribute_node_revision get)
1258
- peer_revision=$( attribute_node_revision_peer)
1259
-
1260
- if [ " $revision " = " " ] || [ " $revision " = " null" ] || [ " $peer_revision " = " " ] || [ " $peer_revision " = " null" ]; then
1261
- ocf_log err " could not compare revisions: '$NODENAME ' local revision='$revision ', peer revision='$peer_revision '"
1313
+ if ! peer_revision=$( attribute_node_revision_peer) ; then
1314
+ return " $OCF_ERR_GENERIC "
1315
+ elif [ " $peer_revision " = " " ] || [ " $peer_revision " = " null" ]; then
1316
+ ocf_log err " peer revision is empty or null"
1262
1317
return " $OCF_ERR_GENERIC "
1263
1318
fi
1264
1319
1320
+ # Handle the scenario where only the local revision data is missing
1321
+ revision=$( attribute_node_revision get)
1322
+ if [ " $revision " = " " ] || [ " $revision " = " null" ]; then
1323
+ ocf_log info " $NODENAME local revision missing but peer has valid revision '$peer_revision ' - can start as learner"
1324
+ echo " older"
1325
+ return " $OCF_SUCCESS "
1326
+ fi
1327
+
1328
+ # Normal revision comparison when both exist
1265
1329
if [ " $revision " -gt " $peer_revision " ]; then
1266
1330
ocf_log info " $NODENAME revision: '$revision ' is newer than peer revision: '$peer_revision '"
1267
1331
echo " newer"
@@ -1325,7 +1389,7 @@ can_reuse_container() {
1325
1389
1326
1390
1327
1391
# If the container does not exist it cannot be reused
1328
- if ! container_exists; then
1392
+ if ! container_exists; then
1329
1393
OCF_RESKEY_reuse=0
1330
1394
return " $OCF_SUCCESS "
1331
1395
fi
@@ -1336,7 +1400,7 @@ can_reuse_container() {
1336
1400
OCF_RESKEY_reuse=0
1337
1401
return " $OCF_SUCCESS "
1338
1402
fi
1339
-
1403
+
1340
1404
if ! filtered_original_pod_manifest=$( filter_pod_manifest " $OCF_RESKEY_pod_manifest " ) ; then
1341
1405
return $OCF_ERR_GENERIC
1342
1406
fi
@@ -1505,15 +1569,25 @@ podman_start()
1505
1569
fi
1506
1570
;;
1507
1571
2)
1508
- # TODO: can we start "normally", regardless the revisions, if the container-id is the same on both nodes?
1572
+ # TODO: Consider starting normally, regardless revision *IF* the cluster-id is the same.
1573
+ # NOTE: cluster-id does not change during force-new-cluster (https://github.com/openshift/etcd/pull/339)
1509
1574
ocf_log info " peer starting"
1510
1575
if [ " $revision_compare_result " = " newer" ]; then
1511
1576
set_force_new_cluster
1512
1577
elif [ " $revision_compare_result " = " older" ]; then
1513
1578
ocf_log info " $NODENAME shall join as learner"
1514
1579
JOIN_AS_LEARNER=true
1515
1580
else
1516
- if [ " $( attribute_node_cluster_id get) " = " $( attribute_node_cluster_id_peer) " ]; then
1581
+ local local_cluster_id peer_cluster_id
1582
+ if ! local_cluster_id=$( attribute_node_cluster_id get) ; then
1583
+ return " $OCF_ERR_GENERIC "
1584
+ fi
1585
+
1586
+ if ! peer_cluster_id=$( attribute_node_cluster_id_peer) ; then
1587
+ return " $OCF_ERR_GENERIC "
1588
+ fi
1589
+
1590
+ if [ " $local_cluster_id " = " $peer_cluster_id " ]; then
1517
1591
ocf_log info " same cluster_id and revision: start normal"
1518
1592
else
1519
1593
ocf_exit_reason " same revision but different cluster id"
@@ -1860,6 +1934,9 @@ CONTAINER=$OCF_RESKEY_name
1860
1934
POD_MANIFEST_COPY=" ${OCF_RESKEY_config_location} /pod.yaml"
1861
1935
ETCD_CONFIGURATION_FILE=" ${OCF_RESKEY_config_location} /config.yaml"
1862
1936
ETCD_BACKUP_FILE=" ${OCF_RESKEY_backup_location} /config-previous.tar.gz"
1937
+ REV_JSON=" /var/lib/etcd/revision.json"
1938
+ CIB_MAX_RETRIES=2
1939
+ CIB_RETRY_DELAY=2
1863
1940
1864
1941
# Note: we currently monitor podman containers with the "podman exec"
1865
1942
# command, so make sure that invocation is always valid by enforcing the
0 commit comments