
Commit f5eb08d

CA-412983: HA doesn't keep trying to start best-effort VM (#6619)
The issue occurs in an HA-enabled pool. A VM whose VM.ha_restart_priority is set to best-effort is running on a host, and its disk resides on that host's local storage. When the host goes down, the VM cannot be restarted on any other host because of the local-storage dependency. However, after the host recovers and comes back online, the VM still does not start automatically on the original host. Expected behavior: the VM should start automatically on the original host once it has recovered. More generally, this behavior should apply to all non-agile VMs.
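
For context, best-effort protection is a per-VM setting. A minimal sketch of flagging a VM this way through the XenAPI client bindings that the patch itself uses for VM.start; the helper name and the ~self/~value argument labels are assumptions, not taken from this page:

    (* Hypothetical helper: mark a VM as best-effort, using the same
       constant the patch tests against (Constants.ha_restart_best_effort). *)
    let make_best_effort ~rpc ~session_id ~vm =
      Client.Client.VM.set_ha_restart_priority ~rpc ~session_id ~self:vm
        ~value:Constants.ha_restart_best_effort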
2 parents: 2d17143 + 34bdb57

4 files changed (+113 -30 lines)

ocaml/xapi/xapi_globs.ml

Lines changed: 5 additions & 0 deletions
@@ -436,6 +436,10 @@ let xapi_clusterd_port = ref 8896
    *)
 let local_yum_repo_port = ref 8000
 
+(* The maximum number of start attempts for HA best-effort VMs. Each attempt is
+   spaced 20 seconds apart. *)
+let ha_best_effort_max_retries = ref 2
+
 (* When a host is known to be shutting down or rebooting, we add it's reference in here.
    This can be used to force the Host_metrics.live flag to false. *)
 let hosts_which_are_shutting_down : API.ref_host list ref = ref []

@@ -1238,6 +1242,7 @@ let xapi_globs_spec =
   ; ("max_observer_file_size", Int max_observer_file_size)
   ; ("test-open", Int test_open) (* for consistency with xenopsd *)
   ; ("local_yum_repo_port", Int local_yum_repo_port)
+  ; ("ha_best_effort_max_retries", Int ha_best_effort_max_retries)
   ]
 
 let xapi_globs_spec_with_descriptions =
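
Because the new ref is registered in the xapi_globs_spec table alongside the other integer tunables, it should be overridable the same way those keys are (presumably via a `ha_best_effort_max_retries=<n>` line in xapi's config file; that is an inference from the table, not something this diff shows). In code the cap is read by dereferencing the ref; a minimal sketch, with a hypothetical helper name:

    (* Minimal sketch: the cap is an int ref, read with (!) where the retry
       decision is made. With the default of 2 and attempts spaced ~20 seconds
       apart (per the comment above), a persistently failing best-effort VM is
       given up on after the initial attempt plus two retries. *)
    let may_retry ~attempts = attempts < !Xapi_globs.ha_best_effort_max_retries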

ocaml/xapi/xapi_ha.ml

Lines changed: 12 additions & 13 deletions
@@ -508,24 +508,26 @@ module Monitor = struct
             let liveset_uuids =
               List.sort compare (uuids_of_liveset liveset)
             in
+            let to_refs uuids =
+              List.map
+                (fun uuid ->
+                  Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid)
+                )
+                uuids
+            in
+            let last_live_set = to_refs !last_liveset_uuids in
             if !last_liveset_uuids <> liveset_uuids then (
               warn
                 "Liveset looks different; assuming we need to rerun the \
                  planner" ;
               plan_out_of_date := true ;
               last_liveset_uuids := liveset_uuids
             ) ;
-            let liveset_refs =
-              List.map
-                (fun uuid ->
-                  Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid)
-                )
-                liveset_uuids
-            in
+            let live_set = to_refs liveset_uuids in
             if local_failover_decisions_are_ok () then (
               try
                 Xapi_ha_vm_failover.restart_auto_run_vms ~__context
-                  liveset_refs to_tolerate
+                  ~last_live_set ~live_set to_tolerate
               with e ->
                 log_backtrace e ;
                 error

@@ -539,9 +541,7 @@ module Monitor = struct
           (* Next update the Host_metrics.live value to spot hosts coming back *)
           let all_hosts = Db.Host.get_all ~__context in
           let livemap =
-            List.map
-              (fun host -> (host, List.mem host liveset_refs))
-              all_hosts
+            List.map (fun host -> (host, List.mem host live_set)) all_hosts
           in
           List.iter
             (fun (host, live) ->

@@ -704,8 +704,7 @@ module Monitor = struct
               in
               if plan_too_old || !plan_out_of_date then (
                 let changed =
-                  Xapi_ha_vm_failover.update_pool_status ~__context
-                    ~live_set:liveset_refs ()
+                  Xapi_ha_vm_failover.update_pool_status ~__context ~live_set ()
                 in
                 (* Extremely bad: something managed to break our careful plan *)
                 if changed && not !plan_out_of_date then
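
The subtle point in the first hunk above is ordering: last_live_set is materialised from !last_liveset_uuids before the branch that overwrites that ref, so restart_auto_run_vms receives the previous live set alongside the current one. A standalone sketch of that snapshot-before-update pattern (hypothetical names, not from the diff):

    (* Capture the old value of a mutable ref before overwriting it, so both
       the previous and the current state can be handed downstream. *)
    let last_liveset : string list ref = ref []

    let on_liveset_report (current : string list) =
      let previous = !last_liveset in (* snapshot first... *)
      if previous <> current then
        last_liveset := current ; (* ...then update *)
      (previous, current)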

ocaml/xapi/xapi_ha_vm_failover.ml

Lines changed: 91 additions & 16 deletions
@@ -1259,9 +1259,26 @@ let restart_failed : (API.ref_VM, unit) Hashtbl.t = Hashtbl.create 10
 (* We also limit the rate we attempt to retry starting the VM. *)
 let last_start_attempt : (API.ref_VM, float) Hashtbl.t = Hashtbl.create 10
 
+module VMRefOrd = struct
+  type t = [`VM] Ref.t
+
+  let compare = Ref.compare
+end
+
+module VMMap = Map.Make (VMRefOrd)
+
+(* When a host comes up it is added to the HA live set, but it may still be
+   disabled, so attempts to start best-effort VMs on it would fail.
+   Meanwhile we don't want to keep retrying to start them forever. This map
+   remembers the best-effort VMs which failed to start for that reason: the
+   key is the VM ref and the value is the number of start attempts so far.
+   The cap avoids retrying forever and can be adjusted according to how
+   long the host takes to become enabled after joining the HA live set. *)
+let tried_best_eff_vms = ref VMMap.empty
+
 (* Takes the current live_set and number of hosts we're planning to handle, updates the host records in the database
    and restarts any offline protected VMs *)
-let restart_auto_run_vms ~__context live_set n =
+let restart_auto_run_vms ~__context ~last_live_set ~live_set n =
   (* ensure we have live=false on the host_metrics for those hosts not in the live_set; and force state to Halted for
      all VMs that are "running" or "paused" with resident_on set to one of the hosts that is now dead
   *)
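
To make the bookkeeping concrete, here is a self-contained sketch of the bounded counter that tried_best_eff_vms implements, with string keys standing in for VM refs and a local max_retries standing in for !Xapi_globs.ha_best_effort_max_retries; it mirrors the VMMap.update/Option.fold logic in the next hunk:

    (* Self-contained sketch of the bounded retry counter, with string keys
       standing in for VM refs. *)
    module M = Map.Make (String)

    let max_retries = 2

    (* On a failed start: a first failure records count 1; later failures
       increment the count until the cap is hit, after which the entry is
       dropped so the VM is no longer retried. *)
    let record_failure vm tried =
      M.update vm
        (function
          | None -> Some 1
          | Some n when n < max_retries -> Some (n + 1)
          | Some _ -> None
          )
        tried

    (* On a successful start: forget the VM entirely. *)
    let record_success vm tried = M.remove vm tried

Three consecutive failures with the default cap of 2 leave the map without an entry for the VM, which is exactly the point at which HA stops retrying it.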
@@ -1566,32 +1583,90 @@ let restart_auto_run_vms ~__context live_set n =
               ok since this is 'best-effort'). NOTE we do not use the restart_vm function above as this will mark the
               pool as overcommitted if an HA_OPERATION_WOULD_BREAK_FAILOVER_PLAN is received (although this should never
               happen it's better safe than sorry) *)
-          map_parallel
-            ~order_f:(fun vm -> order_f (vm, Db.VM.get_record ~__context ~self:vm))
-            (fun vm ->
+          let is_best_effort r =
+            r.API.vM_ha_restart_priority = Constants.ha_restart_best_effort
+            && r.API.vM_power_state = `Halted
+          in
+          let resets =
+            !reset_vms
+            |> List.map (fun self -> (self, Db.VM.get_record ~__context ~self))
+          in
+          let revalidate_tried m =
+            let valid, invalid =
+              VMMap.bindings m
+              |> List.partition_map (fun (self, _) ->
+                     match Db.VM.get_record ~__context ~self with
+                     | r ->
+                         Left (self, r)
+                     | exception _ ->
+                         Right self
+                 )
+            in
+            let to_retry, to_remove =
+              List.partition (fun (_, r) -> is_best_effort r) valid
+            in
+            let m' =
+              List.map fst to_remove
+              |> List.rev_append invalid
+              |> List.fold_left (fun acc vm -> VMMap.remove vm acc) m
+            in
+            (to_retry, m')
+          in
+          let best_effort_vms =
+            (* Carefully decide which best-effort VMs we should attempt to start. *)
+            let all_prot_is_ok = List.for_all (fun (_, r) -> r = Ok ()) started in
+            let is_better = List.compare_lengths live_set last_live_set > 0 in
+            ( match (all_prot_is_ok, is_better, last_live_set = live_set) with
+            | true, true, _ ->
+                (* Try to start all the best-effort halted VMs when HA is being
+                   enabled or some hosts are transitioning to HA live.
+                   The DB has been updated by Xapi_vm_lifecycle.force_state_reset.
+                   Read again. *)
+                tried_best_eff_vms := VMMap.empty ;
+                Db.VM.get_all_records ~__context
+            | true, false, true ->
+                (* Retry the best-effort VMs which were attempted but failed last time. *)
+                let to_retry, m = revalidate_tried !tried_best_eff_vms in
+                tried_best_eff_vms := m ;
+                List.rev_append to_retry resets
+            | true, false, false | false, _, _ ->
+                (* Try to start only the reset VMs. They were observed as residing
+                   on the non-live hosts in this run.
+                   Give up starting tried VMs as the HA situation changes. *)
+                tried_best_eff_vms := VMMap.empty ;
+                resets
+            )
+            |> List.filter (fun (_, r) -> is_best_effort r)
+          in
+          map_parallel ~order_f
+            (fun (vm, _) ->
               ( vm
-              , if
-                  Db.VM.get_power_state ~__context ~self:vm = `Halted
-                  && Db.VM.get_ha_restart_priority ~__context ~self:vm
-                     = Constants.ha_restart_best_effort
-                then
-                  TaskChains.task (fun () ->
-                      Client.Client.Async.VM.start ~rpc ~session_id ~vm
-                        ~start_paused:false ~force:true
-                  )
-                else
-                  TaskChains.ok Rpc.Null
+              , TaskChains.task (fun () ->
+                    Client.Client.Async.VM.start ~rpc ~session_id ~vm
+                      ~start_paused:false ~force:true
+                )
               )
             )
-            !reset_vms
+            best_effort_vms
          |> List.iter (fun (vm, result) ->
                 match result with
                 | Error e ->
+                    tried_best_eff_vms :=
+                      VMMap.update vm
+                        (Option.fold ~none:(Some 1) ~some:(fun n ->
+                             if n < !Xapi_globs.ha_best_effort_max_retries then
+                               Some (n + 1)
+                             else
+                               None
+                         )
+                        )
+                        !tried_best_eff_vms ;
                     error "Failed to restart best-effort VM %s (%s): %s"
                       (Db.VM.get_uuid ~__context ~self:vm)
                       (Db.VM.get_name_label ~__context ~self:vm)
                       (ExnHelper.string_of_exn e)
                 | Ok _ ->
+                    tried_best_eff_vms := VMMap.remove vm !tried_best_eff_vms ;
                     ()
             )
        )
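
The three-way match above is the heart of the fix. Restated as a standalone decision function (hypothetical type and names, not part of the patch), it reads as a small decision table:

    (* Which best-effort VMs to (re)start in this monitor pass. *)
    type action =
      | Start_all_halted (* live set grew: start every halted best-effort VM *)
      | Retry_failed_plus_resets (* live set unchanged: retry last round's failures *)
      | Resets_only (* live set shrank or changed, or a protected VM failed *)

    let decide ~all_prot_is_ok ~live_set_grew ~live_set_unchanged =
      match (all_prot_is_ok, live_set_grew, live_set_unchanged) with
      | true, true, _ ->
          Start_all_halted
      | true, false, true ->
          Retry_failed_plus_resets
      | true, false, false | false, _, _ ->
          Resets_only

This is what makes the scenario in the commit message work: when the failed host comes back, the live set grows, so every halted best-effort VM (including a non-agile one pinned to local storage) gets a fresh start attempt on the recovered host.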

ocaml/xapi/xapi_ha_vm_failover.mli

Lines changed: 5 additions & 1 deletion
@@ -18,7 +18,11 @@
 val all_protected_vms : __context:Context.t -> (API.ref_VM * API.vM_t) list
 
 val restart_auto_run_vms :
-  __context:Context.t -> API.ref_host list -> int -> unit
+     __context:Context.t
+  -> last_live_set:API.ref_host list
+  -> live_set:API.ref_host list
+  -> int
+  -> unit
 (** Take a set of live VMs and attempt to restart all protected VMs which have failed *)
 
 val compute_evacuation_plan :
