From 3dd583aa414f89d1dbe10a03bb41f34a6b5cfd58 Mon Sep 17 00:00:00 2001
From: Ming Lu <ming.lu@cloud.com>
Date: Thu, 7 Aug 2025 10:37:10 +0800
Subject: [PATCH 1/3] CA-412983: HA doesn't keep trying to start best-effort VM

The issue occurs in a scenario involving a HA-enabled pool. A VM with its
VM.ha_restart_priority set to best-effort is running on a host. The VM's disk
resides on the host's local storage. When the host goes down, the VM cannot be
restarted on other hosts due to the disk's local storage dependency. However,
after the host recovers and comes back online, the VM still does not
automatically start on the original host.

Expected behavior: The VM should automatically start on the original host once
it has recovered. Generally, this behavior should be applied to all non-agile
VMs.

Signed-off-by: Ming Lu <ming.lu@cloud.com>
---
 ocaml/xapi/xapi_ha.ml              |  25 ++++---
 ocaml/xapi/xapi_ha_vm_failover.ml  | 104 ++++++++++++++++++++++++-----
 ocaml/xapi/xapi_ha_vm_failover.mli |   6 +-
 3 files changed, 105 insertions(+), 30 deletions(-)

diff --git a/ocaml/xapi/xapi_ha.ml b/ocaml/xapi/xapi_ha.ml
index e88ecf13769..fda471b1868 100644
--- a/ocaml/xapi/xapi_ha.ml
+++ b/ocaml/xapi/xapi_ha.ml
@@ -508,6 +508,14 @@ module Monitor = struct
               let liveset_uuids =
                 List.sort compare (uuids_of_liveset liveset)
               in
+              let to_refs uuids =
+                List.map
+                  (fun uuid ->
+                    Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid)
+                  )
+                  uuids
+              in
+              let last_live_set = to_refs !last_liveset_uuids in
               if !last_liveset_uuids <> liveset_uuids then (
                 warn
                   "Liveset looks different; assuming we need to rerun the \
@@ -515,17 +523,11 @@ module Monitor = struct
                 plan_out_of_date := true ;
                 last_liveset_uuids := liveset_uuids
               ) ;
-              let liveset_refs =
-                List.map
-                  (fun uuid ->
-                    Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid)
-                  )
-                  liveset_uuids
-              in
+              let live_set = to_refs liveset_uuids in
               if local_failover_decisions_are_ok () then (
                 try
                   Xapi_ha_vm_failover.restart_auto_run_vms ~__context
-                    liveset_refs to_tolerate
+                    ~last_live_set ~live_set to_tolerate
                 with e ->
                   log_backtrace e ;
                   error
@@ -539,9 +541,7 @@ module Monitor = struct
               (* Next update the Host_metrics.live value to spot hosts coming back *)
               let all_hosts = Db.Host.get_all ~__context in
               let livemap =
-                List.map
-                  (fun host -> (host, List.mem host liveset_refs))
-                  all_hosts
+                List.map (fun host -> (host, List.mem host live_set)) all_hosts
               in
               List.iter
                 (fun (host, live) ->
@@ -704,8 +704,7 @@ module Monitor = struct
               in
               if plan_too_old || !plan_out_of_date then (
                 let changed =
-                  Xapi_ha_vm_failover.update_pool_status ~__context
-                    ~live_set:liveset_refs ()
+                  Xapi_ha_vm_failover.update_pool_status ~__context ~live_set ()
                 in
                 (* Extremely bad: something managed to break our careful plan *)
                 if changed && not !plan_out_of_date then
diff --git a/ocaml/xapi/xapi_ha_vm_failover.ml b/ocaml/xapi/xapi_ha_vm_failover.ml
index 5cbb946b150..b5c6b24f022 100644
--- a/ocaml/xapi/xapi_ha_vm_failover.ml
+++ b/ocaml/xapi/xapi_ha_vm_failover.ml
@@ -1259,9 +1259,26 @@ let restart_failed : (API.ref_VM, unit) Hashtbl.t = Hashtbl.create 10
 (* We also limit the rate we attempt to retry starting the VM. *)
 let last_start_attempt : (API.ref_VM, float) Hashtbl.t = Hashtbl.create 10
 
+module VMRefOrd = struct
+  type t = [`VM] Ref.t
+
+  let compare = Ref.compare
+end
+
+module VMMap = Map.Make (VMRefOrd)
+
+(* When a host is up, it will be added in the HA live set. But it may be still
+   in disabled state so that starting best-effort VMs on it would fail.
+   Meanwhile we don't want to retry on starting them forever.
+   This data is to remember the best-effort VMs which failed to start due to
+   this and the key of the map is the VM ref. And its value is the count of the
+   attempts of starting. This is to avoid retrying for ever and can be adjusted
+   according to how hong the host becomes enabled since it is in HA live set. *)
+let tried_best_eff_vms = ref VMMap.empty
+
 (* Takes the current live_set and number of hosts we're planning to handle, updates the host records in the database
    and restarts any offline protected VMs *)
-let restart_auto_run_vms ~__context live_set n =
+let restart_auto_run_vms ~__context ~last_live_set ~live_set n =
   (* ensure we have live=false on the host_metrics for those hosts not in the live_set; and force state to Halted for
      	   all VMs that are "running" or "paused" with resident_on set to one of the hosts that is now dead
   *)
@@ -1566,32 +1583,87 @@ let restart_auto_run_vms ~__context live_set n =
          			   ok since this is 'best-effort'). NOTE we do not use the restart_vm function above as this will mark the
          			   pool as overcommitted if an HA_OPERATION_WOULD_BREAK_FAILOVER_PLAN is received (although this should never
          			   happen it's better safe than sorry) *)
-      map_parallel
-        ~order_f:(fun vm -> order_f (vm, Db.VM.get_record ~__context ~self:vm))
-        (fun vm ->
+      let is_best_effort r =
+        r.API.vM_ha_restart_priority = Constants.ha_restart_best_effort
+        && r.API.vM_power_state = `Halted
+      in
+      let resets =
+        !reset_vms
+        |> List.map (fun self -> (self, Db.VM.get_record ~__context ~self))
+      in
+      let revalidate_tried m =
+        let valid, invalid =
+          VMMap.bindings m
+          |> List.partition_map (fun (self, _) ->
+                 match Db.VM.get_record ~__context ~self with
+                 | r ->
+                     Left (self, r)
+                 | exception _ ->
+                     Right self
+             )
+        in
+        let to_retry, to_remove =
+          List.partition (fun (_, r) -> is_best_effort r) valid
+        in
+        let m' =
+          List.map fst to_remove
+          |> List.rev_append invalid
+          |> List.fold_left (fun acc vm -> VMMap.remove vm acc) m
+        in
+        (to_retry, m')
+      in
+      let best_effort_vms =
+        (* Carefully decide which best-effort VMs should attempt to start. *)
+        let all_prot_is_ok = List.for_all (fun (_, r) -> r = Ok ()) started in
+        let is_better = List.length live_set > List.length last_live_set in
+        ( match (all_prot_is_ok, is_better, last_live_set = live_set) with
+        | true, true, _ ->
+            (* Try to start all the best-effort halted VMs when HA is being
+               enabled or some hosts are transiting to HA live.
+               The DB has been updated by Xapi_vm_lifecycle.force_state_reset.
+               Read again. *)
+            tried_best_eff_vms := VMMap.empty ;
+            Db.VM.get_all_records ~__context
+        | true, false, true ->
+            (* Retry for best-effort VMs which attepmted but failed last time. *)
+            let to_retry, m = revalidate_tried !tried_best_eff_vms in
+            tried_best_eff_vms := m ;
+            List.rev_append to_retry resets
+        | true, false, false | false, _, _ ->
+            (* Try to start only the reset VMs. They were observed as residing
+               on the non-live hosts in this run.
+               Give up starting tried VMs as the HA situation changes. *)
+            tried_best_eff_vms := VMMap.empty ;
+            resets
+        )
+        |> List.filter (fun (_, r) -> is_best_effort r)
+      in
+      map_parallel ~order_f
+        (fun (vm, _) ->
           ( vm
-          , if
-              Db.VM.get_power_state ~__context ~self:vm = `Halted
-              && Db.VM.get_ha_restart_priority ~__context ~self:vm
-                 = Constants.ha_restart_best_effort
-            then
-              TaskChains.task (fun () ->
-                  Client.Client.Async.VM.start ~rpc ~session_id ~vm
-                    ~start_paused:false ~force:true
-              )
-            else
-              TaskChains.ok Rpc.Null
+          , TaskChains.task (fun () ->
+                Client.Client.Async.VM.start ~rpc ~session_id ~vm
+                  ~start_paused:false ~force:true
+            )
           )
         )
-        !reset_vms
+        best_effort_vms
       |> List.iter (fun (vm, result) ->
              match result with
              | Error e ->
+                 tried_best_eff_vms :=
+                   VMMap.update vm
+                     (Option.fold ~none:(Some 1) ~some:(fun n ->
+                          if n < 2 then Some (n + 1) else None
+                      )
+                     )
+                     !tried_best_eff_vms ;
                  error "Failed to restart best-effort VM %s (%s): %s"
                    (Db.VM.get_uuid ~__context ~self:vm)
                    (Db.VM.get_name_label ~__context ~self:vm)
                    (ExnHelper.string_of_exn e)
              | Ok _ ->
+                 tried_best_eff_vms := VMMap.remove vm !tried_best_eff_vms ;
                  ()
          )
   )
diff --git a/ocaml/xapi/xapi_ha_vm_failover.mli b/ocaml/xapi/xapi_ha_vm_failover.mli
index 20eb3b6b844..abf6374822a 100644
--- a/ocaml/xapi/xapi_ha_vm_failover.mli
+++ b/ocaml/xapi/xapi_ha_vm_failover.mli
@@ -18,7 +18,11 @@
 val all_protected_vms : __context:Context.t -> (API.ref_VM * API.vM_t) list
 
 val restart_auto_run_vms :
-  __context:Context.t -> API.ref_host list -> int -> unit
+     __context:Context.t
+  -> last_live_set:API.ref_host list
+  -> live_set:API.ref_host list
+  -> int
+  -> unit
 (** Take a set of live VMs and attempt to restart all protected VMs which have failed *)
 
 val compute_evacuation_plan :

From ea89a26c69ddfd583bf979a58b9cce0775a1d7fb Mon Sep 17 00:00:00 2001
From: Ming Lu <ming.lu@cloud.com>
Date: Mon, 11 Aug 2025 13:29:21 +0800
Subject: [PATCH 2/3] Add Xapi_globs.ha_best_effort_max_retries to eliminate
 hard-coding

Signed-off-by: Ming Lu <ming.lu@cloud.com>
---
 ocaml/xapi/xapi_globs.ml          | 5 +++++
 ocaml/xapi/xapi_ha_vm_failover.ml | 5 ++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml
index 490045f871f..3688478dce8 100644
--- a/ocaml/xapi/xapi_globs.ml
+++ b/ocaml/xapi/xapi_globs.ml
@@ -436,6 +436,10 @@ let xapi_clusterd_port = ref 8896
  *)
 let local_yum_repo_port = ref 8000
 
+(* The maximum number of start attempts for HA best-effort VMs. Each attempt is
+   spaced 20 seconds apart. *)
+let ha_best_effort_max_retries = ref 2
+
 (* When a host is known to be shutting down or rebooting, we add it's reference in here.
    This can be used to force the Host_metrics.live flag to false. *)
 let hosts_which_are_shutting_down : API.ref_host list ref = ref []
@@ -1238,6 +1242,7 @@ let xapi_globs_spec =
   ; ("max_observer_file_size", Int max_observer_file_size)
   ; ("test-open", Int test_open) (* for consistency with xenopsd *)
   ; ("local_yum_repo_port", Int local_yum_repo_port)
+  ; ("ha_best_effort_max_retries", Int ha_best_effort_max_retries)
   ]
 
 let xapi_globs_spec_with_descriptions =
diff --git a/ocaml/xapi/xapi_ha_vm_failover.ml b/ocaml/xapi/xapi_ha_vm_failover.ml
index b5c6b24f022..31dc6e7a2ef 100644
--- a/ocaml/xapi/xapi_ha_vm_failover.ml
+++ b/ocaml/xapi/xapi_ha_vm_failover.ml
@@ -1654,7 +1654,10 @@ let restart_auto_run_vms ~__context ~last_live_set ~live_set n =
                  tried_best_eff_vms :=
                    VMMap.update vm
                      (Option.fold ~none:(Some 1) ~some:(fun n ->
-                          if n < 2 then Some (n + 1) else None
+                          if n < !Xapi_globs.ha_best_effort_max_retries then
+                            Some (n + 1)
+                          else
+                            None
                       )
                      )
                      !tried_best_eff_vms ;

From 34bdb57dd8a2ecc5ab62ed8c9fc612446fc6eaba Mon Sep 17 00:00:00 2001
From: Ming Lu <ming.lu@cloud.com>
Date: Wed, 13 Aug 2025 18:49:09 +0800
Subject: [PATCH 3/3] Optimize with List.compare_lengths

Signed-off-by: Ming Lu <ming.lu@cloud.com>
---
 ocaml/xapi/xapi_ha_vm_failover.ml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ocaml/xapi/xapi_ha_vm_failover.ml b/ocaml/xapi/xapi_ha_vm_failover.ml
index 31dc6e7a2ef..5c43984541c 100644
--- a/ocaml/xapi/xapi_ha_vm_failover.ml
+++ b/ocaml/xapi/xapi_ha_vm_failover.ml
@@ -1615,7 +1615,7 @@ let restart_auto_run_vms ~__context ~last_live_set ~live_set n =
       let best_effort_vms =
         (* Carefully decide which best-effort VMs should attempt to start. *)
         let all_prot_is_ok = List.for_all (fun (_, r) -> r = Ok ()) started in
-        let is_better = List.length live_set > List.length last_live_set in
+        let is_better = List.compare_lengths live_set last_live_set > 0 in
         ( match (all_prot_is_ok, is_better, last_live_set = live_set) with
         | true, true, _ ->
             (* Try to start all the best-effort halted VMs when HA is being