CA-412983: HA doesn't keep trying to start best-effort VM #6619

Merged: 3 commits, Aug 22, 2025
5 changes: 5 additions & 0 deletions ocaml/xapi/xapi_globs.ml
@@ -436,6 +436,10 @@ let xapi_clusterd_port = ref 8896
*)
let local_yum_repo_port = ref 8000

(* The maximum number of start attempts for HA best-effort VMs, including the
first try. Attempts are spaced 20 seconds apart. *)
let ha_best_effort_max_retries = ref 2

Contributor:
Does this include the first try? I.e. will it try 2 or 3 times in total?

Member (Author):
It includes the first try.
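For reference, this is a condensed restatement of the failure counter added further down in xapi_ha_vm_failover.ml (record_failure is a hypothetical name; the patch performs this update inline):

(* Sketch: record a failed start attempt. A first failure stores 1; later
   failures increment the count until it reaches
   !Xapi_globs.ha_best_effort_max_retries, after which the VM is dropped
   from the map and is not retried again. *)
let record_failure vm m =
  VMMap.update vm
    (Option.fold ~none:(Some 1) ~some:(fun n ->
         if n < !Xapi_globs.ha_best_effort_max_retries then Some (n + 1)
         else None
     )
    )
    m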


(* When a host is known to be shutting down or rebooting, we add its reference in here.
This can be used to force the Host_metrics.live flag to false. *)
let hosts_which_are_shutting_down : API.ref_host list ref = ref []
@@ -1238,6 +1242,7 @@ let xapi_globs_spec =
; ("max_observer_file_size", Int max_observer_file_size)
; ("test-open", Int test_open) (* for consistency with xenopsd *)
; ("local_yum_repo_port", Int local_yum_repo_port)
; ("ha_best_effort_max_retries", Int ha_best_effort_max_retries)
]

let xapi_globs_spec_with_descriptions =
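Because the new ref is registered in xapi_globs_spec, the limit should be tunable from xapi's configuration file without recompiling; a hedged example, assuming the usual key = value format (the /etc/xapi.conf path is an assumption):

# /etc/xapi.conf (assumed path and format): allow five start attempts
ha_best_effort_max_retries = 5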
25 changes: 12 additions & 13 deletions ocaml/xapi/xapi_ha.ml
@@ -508,24 +508,26 @@ module Monitor = struct
let liveset_uuids =
List.sort compare (uuids_of_liveset liveset)
in
let to_refs uuids =

Contributor:
Refs are just strings; should we not use a Set for efficiency?

Member (Author):
This is actually a sorted list, but yes, a Set.mem would be more efficient than List.mem.
I would like to get this fix done first and raise a separate change for this.
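A possible shape for that follow-up (not part of this PR), mirroring the VMMap introduced in xapi_ha_vm_failover.ml below:

(* Hypothetical follow-up: a set keyed on host refs gives O(log n)
   membership tests instead of the O(n) List.mem used here. *)
module HostRefSet = Set.Make (struct
  type t = [`host] Ref.t

  let compare = Ref.compare
end)

let is_live live_set host = HostRefSet.mem host live_set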

List.map
(fun uuid ->
Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid)
)
uuids
in
let last_live_set = to_refs !last_liveset_uuids in
if !last_liveset_uuids <> liveset_uuids then (
warn
"Liveset looks different; assuming we need to rerun the \
planner" ;
plan_out_of_date := true ;
last_liveset_uuids := liveset_uuids
) ;
let liveset_refs =
List.map
(fun uuid ->
Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid)
)
liveset_uuids
in
let live_set = to_refs liveset_uuids in
if local_failover_decisions_are_ok () then (
try
Xapi_ha_vm_failover.restart_auto_run_vms ~__context
liveset_refs to_tolerate
~last_live_set ~live_set to_tolerate
with e ->
log_backtrace e ;
error
@@ -539,9 +541,7 @@ module Monitor = struct
(* Next update the Host_metrics.live value to spot hosts coming back *)
let all_hosts = Db.Host.get_all ~__context in
let livemap =
List.map
(fun host -> (host, List.mem host liveset_refs))
all_hosts
List.map (fun host -> (host, List.mem host live_set)) all_hosts
in
List.iter
(fun (host, live) ->
@@ -704,8 +704,7 @@ module Monitor = struct
in
if plan_too_old || !plan_out_of_date then (
let changed =
Xapi_ha_vm_failover.update_pool_status ~__context
~live_set:liveset_refs ()
Xapi_ha_vm_failover.update_pool_status ~__context ~live_set ()
in
(* Extremely bad: something managed to break our careful plan *)
if changed && not !plan_out_of_date then
107 changes: 91 additions & 16 deletions ocaml/xapi/xapi_ha_vm_failover.ml
@@ -1259,9 +1259,26 @@ let restart_failed : (API.ref_VM, unit) Hashtbl.t = Hashtbl.create 10
(* We also limit the rate we attempt to retry starting the VM. *)
let last_start_attempt : (API.ref_VM, float) Hashtbl.t = Hashtbl.create 10

module VMRefOrd = struct
type t = [`VM] Ref.t

let compare = Ref.compare
end

module VMMap = Map.Make (VMRefOrd)

(* When a host comes up it is added to the HA live set, but it may still be
   in the disabled state, so starting best-effort VMs on it would fail.
   Meanwhile we don't want to retry starting them forever. This map
   remembers the best-effort VMs which failed to start for that reason: the
   key is the VM ref and the value is the number of start attempts so far.
   Bounding the count avoids retrying forever; the limit can be adjusted
   according to how long the host takes to become enabled once it is in the
   HA live set. *)
let tried_best_eff_vms = ref VMMap.empty

(* Takes the current live_set and number of hosts we're planning to handle, updates the host records in the database
and restarts any offline protected VMs *)
let restart_auto_run_vms ~__context live_set n =
let restart_auto_run_vms ~__context ~last_live_set ~live_set n =
(* ensure we have live=false on the host_metrics for those hosts not in the live_set; and force state to Halted for
all VMs that are "running" or "paused" with resident_on set to one of the hosts that is now dead
*)
@@ -1566,32 +1583,90 @@ let restart_auto_run_vms ~__context live_set n =
ok since this is 'best-effort'). NOTE we do not use the restart_vm function above as this will mark the
pool as overcommitted if an HA_OPERATION_WOULD_BREAK_FAILOVER_PLAN is received (although this should never
happen it's better safe than sorry) *)
map_parallel
~order_f:(fun vm -> order_f (vm, Db.VM.get_record ~__context ~self:vm))
(fun vm ->
let is_best_effort r =
r.API.vM_ha_restart_priority = Constants.ha_restart_best_effort
&& r.API.vM_power_state = `Halted
in
let resets =
!reset_vms
|> List.map (fun self -> (self, Db.VM.get_record ~__context ~self))
in
let revalidate_tried m =
let valid, invalid =
VMMap.bindings m
|> List.partition_map (fun (self, _) ->
match Db.VM.get_record ~__context ~self with
| r ->
Left (self, r)
| exception _ ->

Contributor:
This could use an explanation. The reason we can see an exception is that the VM was deleted but we still have a reference to it in our VMMap?

Member (Author):
Yes. This is to ensure that we don't attempt to start an invalid VM.
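In other words, a ref whose VM record has been deleted makes Db.VM.get_record raise, so it is classified Right and purged from the map rather than retried. A condensed restatement of the partition above (classify is a hypothetical name):

let classify ~__context self =
  match Db.VM.get_record ~__context ~self with
  | r -> Either.Left (self, r) (* record still exists: may retry *)
  | exception _ -> Either.Right self (* deleted: drop from the map *)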

Right self
)
in
let to_retry, to_remove =
List.partition (fun (_, r) -> is_best_effort r) valid
in
let m' =
List.map fst to_remove
|> List.rev_append invalid
|> List.fold_left (fun acc vm -> VMMap.remove vm acc) m
in
(to_retry, m')
in
let best_effort_vms =
(* Carefully decide which best-effort VMs should attempt to start. *)
let all_prot_is_ok = List.for_all (fun (_, r) -> r = Ok ()) started in
let is_better = List.compare_lengths live_set last_live_set > 0 in
( match (all_prot_is_ok, is_better, last_live_set = live_set) with
| true, true, _ ->
(* Try to start all the halted best-effort VMs when HA is being
enabled or some hosts are transitioning into the HA live set.
The DB has been updated by Xapi_vm_lifecycle.force_state_reset,
so read it again. *)
tried_best_eff_vms := VMMap.empty ;
Db.VM.get_all_records ~__context

Contributor:
Why return all VM records here? Should it apply a filter for best effort VMs?

Member (Author):
The filter will be applied later.
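Specifically, every branch of the match feeds into the is_best_effort predicate at the end of the selection pipeline, so the full-record fetch is narrowed to halted best-effort VMs before any start is attempted:

(* From the patch above: only halted best-effort VMs survive selection. *)
let is_best_effort r =
  r.API.vM_ha_restart_priority = Constants.ha_restart_best_effort
  && r.API.vM_power_state = `Halted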

| true, false, true ->
(* Retry the best-effort VMs which were attempted but failed last time. *)
let to_retry, m = revalidate_tried !tried_best_eff_vms in
tried_best_eff_vms := m ;
List.rev_append to_retry resets
| true, false, false | false, _, _ ->
(* Try to start only the reset VMs. They were observed as residing
on non-live hosts in this run.
Give up starting previously tried VMs, as the HA situation has changed. *)
tried_best_eff_vms := VMMap.empty ;
resets
)
|> List.filter (fun (_, r) -> is_best_effort r)
in
map_parallel ~order_f
(fun (vm, _) ->
( vm
, if
Db.VM.get_power_state ~__context ~self:vm = `Halted
&& Db.VM.get_ha_restart_priority ~__context ~self:vm
= Constants.ha_restart_best_effort
then
TaskChains.task (fun () ->
Client.Client.Async.VM.start ~rpc ~session_id ~vm
~start_paused:false ~force:true
)
else
TaskChains.ok Rpc.Null
, TaskChains.task (fun () ->
Client.Client.Async.VM.start ~rpc ~session_id ~vm
~start_paused:false ~force:true
)
)
)
!reset_vms
best_effort_vms
|> List.iter (fun (vm, result) ->
match result with
| Error e ->
tried_best_eff_vms :=
VMMap.update vm
(Option.fold ~none:(Some 1) ~some:(fun n ->
if n < !Xapi_globs.ha_best_effort_max_retries then
Some (n + 1)
else
None
)
)
!tried_best_eff_vms ;
error "Failed to restart best-effort VM %s (%s): %s"
(Db.VM.get_uuid ~__context ~self:vm)
(Db.VM.get_name_label ~__context ~self:vm)
(ExnHelper.string_of_exn e)
| Ok _ ->
tried_best_eff_vms := VMMap.remove vm !tried_best_eff_vms ;
()
)
)
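To recap the selection logic in this hunk (a restatement of the match on (all_prot_is_ok, is_better, last_live_set = live_set), not new behaviour):

(* all protected OK | live set grew | live set unchanged -> candidates
   true             | true          | _    -> all VM records; reset retry map
   true             | false         | true -> revalidated tried VMs + reset VMs
   otherwise                              -> reset VMs only; reset retry map
   In every case the is_best_effort filter is applied afterwards. *)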
6 changes: 5 additions & 1 deletion ocaml/xapi/xapi_ha_vm_failover.mli
@@ -18,7 +18,11 @@
val all_protected_vms : __context:Context.t -> (API.ref_VM * API.vM_t) list

val restart_auto_run_vms :
__context:Context.t -> API.ref_host list -> int -> unit
__context:Context.t
-> last_live_set:API.ref_host list
-> live_set:API.ref_host list
-> int
-> unit
(** Take the last and current sets of live hosts and attempt to restart all protected VMs which have failed, and best-effort VMs where appropriate *)

val compute_evacuation_plan :