CA-412983: HA doesn't keep trying to start best-effort VM #6619

Merged: 3 commits, Aug 22, 2025
5 changes: 5 additions & 0 deletions ocaml/xapi/xapi_globs.ml
@@ -436,6 +436,10 @@ let xapi_clusterd_port = ref 8896
*)
let local_yum_repo_port = ref 8000

(* The maximum number of start attempts for HA best-effort VMs, including the
first try. Attempts are spaced 20 seconds apart. *)
let ha_best_effort_max_retries = ref 2

Contributor:
Does this include the first try? I.e. will it try 2 or 3 times in total?

Member (Author):
It includes the first try.
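For reference, this is a condensed restatement of the failure counter added further down in xapi_ha_vm_failover.ml (record_failure is a hypothetical name; the patch performs this update inline):

(* Sketch: record a failed start attempt. A first failure stores 1; later
   failures increment the count until it reaches
   !Xapi_globs.ha_best_effort_max_retries, after which the VM is dropped
   from the map and is not retried again. *)
let record_failure vm m =
  VMMap.update vm
    (Option.fold ~none:(Some 1) ~some:(fun n ->
         if n < !Xapi_globs.ha_best_effort_max_retries then Some (n + 1)
         else None
     )
    )
    m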


(* When a host is known to be shutting down or rebooting, we add its reference in here.
This can be used to force the Host_metrics.live flag to false. *)
let hosts_which_are_shutting_down : API.ref_host list ref = ref []
@@ -1238,6 +1242,7 @@ let xapi_globs_spec =
; ("max_observer_file_size", Int max_observer_file_size)
; ("test-open", Int test_open) (* for consistency with xenopsd *)
; ("local_yum_repo_port", Int local_yum_repo_port)
; ("ha_best_effort_max_retries", Int ha_best_effort_max_retries)
]

let xapi_globs_spec_with_descriptions =
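Because the new ref is registered in xapi_globs_spec, the limit should be tunable from xapi's configuration file without recompiling; a hedged example, assuming the usual key = value format (the /etc/xapi.conf path is an assumption):

# /etc/xapi.conf (assumed path and format): allow five start attempts
ha_best_effort_max_retries = 5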
25 changes: 12 additions & 13 deletions ocaml/xapi/xapi_ha.ml
@@ -508,24 +508,26 @@ module Monitor = struct
let liveset_uuids =
List.sort compare (uuids_of_liveset liveset)
in
let to_refs uuids =

Contributor:
Refs are just strings; should we not use a Set for efficiency?

Member (Author):
This is actually a sorted list, but yes, a Set.mem would be more efficient than List.mem.
I would like to get this fix done first and raise a separate change for this.
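A possible shape for that follow-up (not part of this PR), mirroring the VMMap introduced in xapi_ha_vm_failover.ml below:

(* Hypothetical follow-up: a set keyed on host refs gives O(log n)
   membership tests instead of the O(n) List.mem used here. *)
module HostRefSet = Set.Make (struct
  type t = [`host] Ref.t

  let compare = Ref.compare
end)

let is_live live_set host = HostRefSet.mem host live_set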

List.map
(fun uuid ->
Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid)
)
uuids
in
let last_live_set = to_refs !last_liveset_uuids in
if !last_liveset_uuids <> liveset_uuids then (
warn
"Liveset looks different; assuming we need to rerun the \
planner" ;
plan_out_of_date := true ;
last_liveset_uuids := liveset_uuids
) ;
let liveset_refs =
List.map
(fun uuid ->
Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid)
)
liveset_uuids
in
let live_set = to_refs liveset_uuids in
if local_failover_decisions_are_ok () then (
try
Xapi_ha_vm_failover.restart_auto_run_vms ~__context
liveset_refs to_tolerate
~last_live_set ~live_set to_tolerate
with e ->
log_backtrace e ;
error
@@ -539,9 +541,7 @@ module Monitor = struct
(* Next update the Host_metrics.live value to spot hosts coming back *)
let all_hosts = Db.Host.get_all ~__context in
let livemap =
List.map
(fun host -> (host, List.mem host liveset_refs))
all_hosts
List.map (fun host -> (host, List.mem host live_set)) all_hosts
in
List.iter
(fun (host, live) ->
@@ -704,8 +704,7 @@ module Monitor = struct
in
if plan_too_old || !plan_out_of_date then (
let changed =
Xapi_ha_vm_failover.update_pool_status ~__context
~live_set:liveset_refs ()
Xapi_ha_vm_failover.update_pool_status ~__context ~live_set ()
in
(* Extremely bad: something managed to break our careful plan *)
if changed && not !plan_out_of_date then
107 changes: 91 additions & 16 deletions ocaml/xapi/xapi_ha_vm_failover.ml
@@ -1259,9 +1259,26 @@ let restart_failed : (API.ref_VM, unit) Hashtbl.t = Hashtbl.create 10
(* We also limit the rate we attempt to retry starting the VM. *)
let last_start_attempt : (API.ref_VM, float) Hashtbl.t = Hashtbl.create 10

module VMRefOrd = struct
type t = [`VM] Ref.t

let compare = Ref.compare
end

module VMMap = Map.Make (VMRefOrd)

(* When a host comes up it is added to the HA live set, but it may still be
   in the disabled state, so starting best-effort VMs on it would fail.
   Meanwhile we don't want to retry starting them forever. This map
   remembers the best-effort VMs which failed to start for that reason: the
   key is the VM ref and the value is the number of start attempts so far.
   Bounding the count avoids retrying forever; the limit can be adjusted
   according to how long the host takes to become enabled once it is in the
   HA live set. *)
let tried_best_eff_vms = ref VMMap.empty

(* Takes the current live_set and number of hosts we're planning to handle, updates the host records in the database
and restarts any offline protected VMs *)
let restart_auto_run_vms ~__context live_set n =
let restart_auto_run_vms ~__context ~last_live_set ~live_set n =
(* ensure we have live=false on the host_metrics for those hosts not in the live_set; and force state to Halted for
all VMs that are "running" or "paused" with resident_on set to one of the hosts that is now dead
*)
@@ -1566,32 +1583,90 @@ let restart_auto_run_vms ~__context live_set n =
ok since this is 'best-effort'). NOTE we do not use the restart_vm function above as this will mark the
pool as overcommitted if an HA_OPERATION_WOULD_BREAK_FAILOVER_PLAN is received (although this should never
happen it's better safe than sorry) *)
map_parallel
~order_f:(fun vm -> order_f (vm, Db.VM.get_record ~__context ~self:vm))
(fun vm ->
let is_best_effort r =
r.API.vM_ha_restart_priority = Constants.ha_restart_best_effort
&& r.API.vM_power_state = `Halted
in
let resets =
!reset_vms
|> List.map (fun self -> (self, Db.VM.get_record ~__context ~self))
in
let revalidate_tried m =
let valid, invalid =
VMMap.bindings m
|> List.partition_map (fun (self, _) ->
match Db.VM.get_record ~__context ~self with
| r ->
Left (self, r)
| exception _ ->

Contributor:
This could use an explanation. The reason we can see an exception is that the VM was deleted but we still have a reference to it in our VMMap?

Member (Author):
Yes. This is to ensure that we don't attempt to start an invalid VM.
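In other words, a ref whose VM record has been deleted makes Db.VM.get_record raise, so it is classified Right and purged from the map rather than retried. A condensed restatement of the partition above (classify is a hypothetical name):

let classify ~__context self =
  match Db.VM.get_record ~__context ~self with
  | r -> Either.Left (self, r) (* record still exists: may retry *)
  | exception _ -> Either.Right self (* deleted: drop from the map *)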

Right self
)
in
let to_retry, to_remove =
List.partition (fun (_, r) -> is_best_effort r) valid
in
let m' =
List.map fst to_remove
|> List.rev_append invalid
|> List.fold_left (fun acc vm -> VMMap.remove vm acc) m
in
(to_retry, m')
in
let best_effort_vms =
(* Carefully decide which best-effort VMs should attempt to start. *)
let all_prot_is_ok = List.for_all (fun (_, r) -> r = Ok ()) started in
let is_better = List.compare_lengths live_set last_live_set > 0 in
( match (all_prot_is_ok, is_better, last_live_set = live_set) with
| true, true, _ ->
(* Try to start all the halted best-effort VMs when HA is being
enabled or some hosts are transitioning into the HA live set.
The DB has been updated by Xapi_vm_lifecycle.force_state_reset,
so read it again. *)
tried_best_eff_vms := VMMap.empty ;
Db.VM.get_all_records ~__context

Contributor:
Why return all VM records here? Should it apply a filter for best effort VMs?

Member (Author):
The filter will be applied later.
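Specifically, every branch of the match feeds into the is_best_effort predicate at the end of the selection pipeline, so the full-record fetch is narrowed to halted best-effort VMs before any start is attempted:

(* From the patch above: only halted best-effort VMs survive selection. *)
let is_best_effort r =
  r.API.vM_ha_restart_priority = Constants.ha_restart_best_effort
  && r.API.vM_power_state = `Halted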

| true, false, true ->
(* Retry the best-effort VMs which were attempted but failed last time. *)
let to_retry, m = revalidate_tried !tried_best_eff_vms in
tried_best_eff_vms := m ;
List.rev_append to_retry resets
| true, false, false | false, _, _ ->
(* Try to start only the reset VMs. They were observed as residing
on non-live hosts in this run.
Give up starting previously tried VMs, as the HA situation has changed. *)
tried_best_eff_vms := VMMap.empty ;
resets
)
|> List.filter (fun (_, r) -> is_best_effort r)
in
map_parallel ~order_f
(fun (vm, _) ->
( vm
, if
Db.VM.get_power_state ~__context ~self:vm = `Halted
&& Db.VM.get_ha_restart_priority ~__context ~self:vm
= Constants.ha_restart_best_effort
then
TaskChains.task (fun () ->
Client.Client.Async.VM.start ~rpc ~session_id ~vm
~start_paused:false ~force:true
)
else
TaskChains.ok Rpc.Null
, TaskChains.task (fun () ->
Client.Client.Async.VM.start ~rpc ~session_id ~vm
~start_paused:false ~force:true
)
)
)
!reset_vms
best_effort_vms
|> List.iter (fun (vm, result) ->
match result with
| Error e ->
tried_best_eff_vms :=
VMMap.update vm
(Option.fold ~none:(Some 1) ~some:(fun n ->
if n < !Xapi_globs.ha_best_effort_max_retries then
Some (n + 1)
else
None
)
)
!tried_best_eff_vms ;
error "Failed to restart best-effort VM %s (%s): %s"
(Db.VM.get_uuid ~__context ~self:vm)
(Db.VM.get_name_label ~__context ~self:vm)
(ExnHelper.string_of_exn e)
| Ok _ ->
tried_best_eff_vms := VMMap.remove vm !tried_best_eff_vms ;
()
)
)
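To recap the selection logic in this hunk (a restatement of the match on (all_prot_is_ok, is_better, last_live_set = live_set), not new behaviour):

(* all protected OK | live set grew | live set unchanged -> candidates
   true             | true          | _    -> all VM records; reset retry map
   true             | false         | true -> revalidated tried VMs + reset VMs
   otherwise                              -> reset VMs only; reset retry map
   In every case the is_best_effort filter is applied afterwards. *)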
6 changes: 5 additions & 1 deletion ocaml/xapi/xapi_ha_vm_failover.mli
@@ -18,7 +18,11 @@
val all_protected_vms : __context:Context.t -> (API.ref_VM * API.vM_t) list

val restart_auto_run_vms :
__context:Context.t -> API.ref_host list -> int -> unit
__context:Context.t
-> last_live_set:API.ref_host list
-> live_set:API.ref_host list
-> int
-> unit
(** Take the last and current sets of live hosts and attempt to restart all protected VMs which have failed, and best-effort VMs where appropriate *)

val compute_evacuation_plan :