From 3dd583aa414f89d1dbe10a03bb41f34a6b5cfd58 Mon Sep 17 00:00:00 2001 From: Ming Lu Date: Thu, 7 Aug 2025 10:37:10 +0800 Subject: [PATCH 1/3] CA-412983: HA doesn't keep trying to start best-effort VM The issue occurs in a scenario involving a HA-enabled pool. A VM with its VM.ha_restart_priority set to best-effort is running on a host. The VM's disk resides on the host's local storage. When the host goes down, the VM cannot be restarted on other hosts due to the disk's local storage dependency. However, after the host recovers and comes back online, the VM still does not automatically start on the original host. Expected behavior: The VM should automatically start on the original host once it has recovered. Generally, this behavior should be applied to all non-agile VMs. Signed-off-by: Ming Lu --- ocaml/xapi/xapi_ha.ml | 25 ++++--- ocaml/xapi/xapi_ha_vm_failover.ml | 104 ++++++++++++++++++++++++----- ocaml/xapi/xapi_ha_vm_failover.mli | 6 +- 3 files changed, 105 insertions(+), 30 deletions(-) diff --git a/ocaml/xapi/xapi_ha.ml b/ocaml/xapi/xapi_ha.ml index e88ecf13769..fda471b1868 100644 --- a/ocaml/xapi/xapi_ha.ml +++ b/ocaml/xapi/xapi_ha.ml @@ -508,6 +508,14 @@ module Monitor = struct let liveset_uuids = List.sort compare (uuids_of_liveset liveset) in + let to_refs uuids = + List.map + (fun uuid -> + Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid) + ) + uuids + in + let last_live_set = to_refs !last_liveset_uuids in if !last_liveset_uuids <> liveset_uuids then ( warn "Liveset looks different; assuming we need to rerun the \ @@ -515,17 +523,11 @@ module Monitor = struct plan_out_of_date := true ; last_liveset_uuids := liveset_uuids ) ; - let liveset_refs = - List.map - (fun uuid -> - Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid) - ) - liveset_uuids - in + let live_set = to_refs liveset_uuids in if local_failover_decisions_are_ok () then ( try Xapi_ha_vm_failover.restart_auto_run_vms ~__context - liveset_refs to_tolerate + ~last_live_set ~live_set to_tolerate with e -> log_backtrace e ; error @@ -539,9 +541,7 @@ module Monitor = struct (* Next update the Host_metrics.live value to spot hosts coming back *) let all_hosts = Db.Host.get_all ~__context in let livemap = - List.map - (fun host -> (host, List.mem host liveset_refs)) - all_hosts + List.map (fun host -> (host, List.mem host live_set)) all_hosts in List.iter (fun (host, live) -> @@ -704,8 +704,7 @@ module Monitor = struct in if plan_too_old || !plan_out_of_date then ( let changed = - Xapi_ha_vm_failover.update_pool_status ~__context - ~live_set:liveset_refs () + Xapi_ha_vm_failover.update_pool_status ~__context ~live_set () in (* Extremely bad: something managed to break our careful plan *) if changed && not !plan_out_of_date then diff --git a/ocaml/xapi/xapi_ha_vm_failover.ml b/ocaml/xapi/xapi_ha_vm_failover.ml index 5cbb946b150..b5c6b24f022 100644 --- a/ocaml/xapi/xapi_ha_vm_failover.ml +++ b/ocaml/xapi/xapi_ha_vm_failover.ml @@ -1259,9 +1259,26 @@ let restart_failed : (API.ref_VM, unit) Hashtbl.t = Hashtbl.create 10 (* We also limit the rate we attempt to retry starting the VM. *) let last_start_attempt : (API.ref_VM, float) Hashtbl.t = Hashtbl.create 10 +module VMRefOrd = struct + type t = [`VM] Ref.t + + let compare = Ref.compare +end + +module VMMap = Map.Make (VMRefOrd) + +(* When a host is up, it will be added in the HA live set. But it may be still + in disabled state so that starting best-effort VMs on it would fail. + Meanwhile we don't want to retry on starting them forever. + This data is to remember the best-effort VMs which failed to start due to + this and the key of the map is the VM ref. And its value is the count of the + attempts of starting. This is to avoid retrying for ever and can be adjusted + according to how hong the host becomes enabled since it is in HA live set. *) +let tried_best_eff_vms = ref VMMap.empty + (* Takes the current live_set and number of hosts we're planning to handle, updates the host records in the database and restarts any offline protected VMs *) -let restart_auto_run_vms ~__context live_set n = +let restart_auto_run_vms ~__context ~last_live_set ~live_set n = (* ensure we have live=false on the host_metrics for those hosts not in the live_set; and force state to Halted for all VMs that are "running" or "paused" with resident_on set to one of the hosts that is now dead *) @@ -1566,32 +1583,87 @@ let restart_auto_run_vms ~__context live_set n = ok since this is 'best-effort'). NOTE we do not use the restart_vm function above as this will mark the pool as overcommitted if an HA_OPERATION_WOULD_BREAK_FAILOVER_PLAN is received (although this should never happen it's better safe than sorry) *) - map_parallel - ~order_f:(fun vm -> order_f (vm, Db.VM.get_record ~__context ~self:vm)) - (fun vm -> + let is_best_effort r = + r.API.vM_ha_restart_priority = Constants.ha_restart_best_effort + && r.API.vM_power_state = `Halted + in + let resets = + !reset_vms + |> List.map (fun self -> (self, Db.VM.get_record ~__context ~self)) + in + let revalidate_tried m = + let valid, invalid = + VMMap.bindings m + |> List.partition_map (fun (self, _) -> + match Db.VM.get_record ~__context ~self with + | r -> + Left (self, r) + | exception _ -> + Right self + ) + in + let to_retry, to_remove = + List.partition (fun (_, r) -> is_best_effort r) valid + in + let m' = + List.map fst to_remove + |> List.rev_append invalid + |> List.fold_left (fun acc vm -> VMMap.remove vm acc) m + in + (to_retry, m') + in + let best_effort_vms = + (* Carefully decide which best-effort VMs should attempt to start. *) + let all_prot_is_ok = List.for_all (fun (_, r) -> r = Ok ()) started in + let is_better = List.length live_set > List.length last_live_set in + ( match (all_prot_is_ok, is_better, last_live_set = live_set) with + | true, true, _ -> + (* Try to start all the best-effort halted VMs when HA is being + enabled or some hosts are transiting to HA live. + The DB has been updated by Xapi_vm_lifecycle.force_state_reset. + Read again. *) + tried_best_eff_vms := VMMap.empty ; + Db.VM.get_all_records ~__context + | true, false, true -> + (* Retry for best-effort VMs which attepmted but failed last time. *) + let to_retry, m = revalidate_tried !tried_best_eff_vms in + tried_best_eff_vms := m ; + List.rev_append to_retry resets + | true, false, false | false, _, _ -> + (* Try to start only the reset VMs. They were observed as residing + on the non-live hosts in this run. + Give up starting tried VMs as the HA situation changes. *) + tried_best_eff_vms := VMMap.empty ; + resets + ) + |> List.filter (fun (_, r) -> is_best_effort r) + in + map_parallel ~order_f + (fun (vm, _) -> ( vm - , if - Db.VM.get_power_state ~__context ~self:vm = `Halted - && Db.VM.get_ha_restart_priority ~__context ~self:vm - = Constants.ha_restart_best_effort - then - TaskChains.task (fun () -> - Client.Client.Async.VM.start ~rpc ~session_id ~vm - ~start_paused:false ~force:true - ) - else - TaskChains.ok Rpc.Null + , TaskChains.task (fun () -> + Client.Client.Async.VM.start ~rpc ~session_id ~vm + ~start_paused:false ~force:true + ) ) ) - !reset_vms + best_effort_vms |> List.iter (fun (vm, result) -> match result with | Error e -> + tried_best_eff_vms := + VMMap.update vm + (Option.fold ~none:(Some 1) ~some:(fun n -> + if n < 2 then Some (n + 1) else None + ) + ) + !tried_best_eff_vms ; error "Failed to restart best-effort VM %s (%s): %s" (Db.VM.get_uuid ~__context ~self:vm) (Db.VM.get_name_label ~__context ~self:vm) (ExnHelper.string_of_exn e) | Ok _ -> + tried_best_eff_vms := VMMap.remove vm !tried_best_eff_vms ; () ) ) diff --git a/ocaml/xapi/xapi_ha_vm_failover.mli b/ocaml/xapi/xapi_ha_vm_failover.mli index 20eb3b6b844..abf6374822a 100644 --- a/ocaml/xapi/xapi_ha_vm_failover.mli +++ b/ocaml/xapi/xapi_ha_vm_failover.mli @@ -18,7 +18,11 @@ val all_protected_vms : __context:Context.t -> (API.ref_VM * API.vM_t) list val restart_auto_run_vms : - __context:Context.t -> API.ref_host list -> int -> unit + __context:Context.t + -> last_live_set:API.ref_host list + -> live_set:API.ref_host list + -> int + -> unit (** Take a set of live VMs and attempt to restart all protected VMs which have failed *) val compute_evacuation_plan : From ea89a26c69ddfd583bf979a58b9cce0775a1d7fb Mon Sep 17 00:00:00 2001 From: Ming Lu Date: Mon, 11 Aug 2025 13:29:21 +0800 Subject: [PATCH 2/3] Add Xapi_globs.ha_best_effort_max_retries to eliminate hard-coding Signed-off-by: Ming Lu --- ocaml/xapi/xapi_globs.ml | 5 +++++ ocaml/xapi/xapi_ha_vm_failover.ml | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index 490045f871f..3688478dce8 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -436,6 +436,10 @@ let xapi_clusterd_port = ref 8896 *) let local_yum_repo_port = ref 8000 +(* The maximum number of start attempts for HA best-effort VMs. Each attempt is + spaced 20 seconds apart. *) +let ha_best_effort_max_retries = ref 2 + (* When a host is known to be shutting down or rebooting, we add it's reference in here. This can be used to force the Host_metrics.live flag to false. *) let hosts_which_are_shutting_down : API.ref_host list ref = ref [] @@ -1238,6 +1242,7 @@ let xapi_globs_spec = ; ("max_observer_file_size", Int max_observer_file_size) ; ("test-open", Int test_open) (* for consistency with xenopsd *) ; ("local_yum_repo_port", Int local_yum_repo_port) + ; ("ha_best_effort_max_retries", Int ha_best_effort_max_retries) ] let xapi_globs_spec_with_descriptions = diff --git a/ocaml/xapi/xapi_ha_vm_failover.ml b/ocaml/xapi/xapi_ha_vm_failover.ml index b5c6b24f022..31dc6e7a2ef 100644 --- a/ocaml/xapi/xapi_ha_vm_failover.ml +++ b/ocaml/xapi/xapi_ha_vm_failover.ml @@ -1654,7 +1654,10 @@ let restart_auto_run_vms ~__context ~last_live_set ~live_set n = tried_best_eff_vms := VMMap.update vm (Option.fold ~none:(Some 1) ~some:(fun n -> - if n < 2 then Some (n + 1) else None + if n < !Xapi_globs.ha_best_effort_max_retries then + Some (n + 1) + else + None ) ) !tried_best_eff_vms ; From 34bdb57dd8a2ecc5ab62ed8c9fc612446fc6eaba Mon Sep 17 00:00:00 2001 From: Ming Lu Date: Wed, 13 Aug 2025 18:49:09 +0800 Subject: [PATCH 3/3] Optimize with List.compare_lengths Signed-off-by: Ming Lu --- ocaml/xapi/xapi_ha_vm_failover.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xapi/xapi_ha_vm_failover.ml b/ocaml/xapi/xapi_ha_vm_failover.ml index 31dc6e7a2ef..5c43984541c 100644 --- a/ocaml/xapi/xapi_ha_vm_failover.ml +++ b/ocaml/xapi/xapi_ha_vm_failover.ml @@ -1615,7 +1615,7 @@ let restart_auto_run_vms ~__context ~last_live_set ~live_set n = let best_effort_vms = (* Carefully decide which best-effort VMs should attempt to start. *) let all_prot_is_ok = List.for_all (fun (_, r) -> r = Ok ()) started in - let is_better = List.length live_set > List.length last_live_set in + let is_better = List.compare_lengths live_set last_live_set > 0 in ( match (all_prot_is_ok, is_better, last_live_set = live_set) with | true, true, _ -> (* Try to start all the best-effort halted VMs when HA is being