From 639e3ed6ca65226d414501ec639b2df1197c6764 Mon Sep 17 00:00:00 2001 From: Muhammad Soliman Date: Wed, 16 Jul 2025 11:02:52 +0200 Subject: [PATCH 1/2] Exit when stopped leading In case of flapping failures that affected the leading pod, such as a flapping network, we noticed that the same pod tried to reacquire the lock after losing it instead of allowing another healthy pod unaffected by the failure to acquire the lock and become leader. This pull request changes the behaviour to work in accordance with other k8s components in leader election configuration, where if the leader stopped leading it exits the process and lets another pod take leadership while k8s creates a new pod to replace the old leader. --- cluster-autoscaler/main.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index 7d090a7416ce..c0e37bab55a3 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -403,7 +403,8 @@ func main() { run(healthCheck, debuggingSnapshotter) }, OnStoppedLeading: func() { - klog.Fatalf("lost master") + klog.Fatalf("lost master. Shutting down.") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) }, }, }) From e9659905d5cb33cd5e9559fb8d7425c438c2ffcf Mon Sep 17 00:00:00 2001 From: Muhammad Soliman Date: Mon, 28 Jul 2025 15:30:55 +0200 Subject: [PATCH 2/2] Use exit instead. --- cluster-autoscaler/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index c0e37bab55a3..ce5885774339 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -403,7 +403,7 @@ func main() { run(healthCheck, debuggingSnapshotter) }, OnStoppedLeading: func() { - klog.Fatalf("lost master. Shutting down.") + klog.Error("lost master. Shutting down.") klog.FlushAndExit(klog.ExitFlushTimeout, 1) }, },