
Commit f5eb08d

CA-412983: HA doesn't keep trying to start best-effort VM (#6619)
The issue occurs in an HA-enabled pool. A VM whose VM.ha_restart_priority is set to best-effort is running on a host, and its disk resides on that host's local storage. When the host goes down, the VM cannot be restarted on any other host because of the local-storage dependency. However, after the host recovers and comes back online, the VM still does not start automatically on the original host. Expected behavior: the VM should start automatically on the original host once it has recovered. More generally, this behavior should apply to all non-agile VMs.
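
For context, best-effort protection is a per-VM setting. A minimal sketch of flagging a VM this way through the XenAPI client bindings that the patch itself uses for VM.start; the helper name and the ~self/~value argument labels are assumptions, not taken from this page:

    (* Hypothetical helper: mark a VM as best-effort, using the same
       constant the patch tests against (Constants.ha_restart_best_effort). *)
    let make_best_effort ~rpc ~session_id ~vm =
      Client.Client.VM.set_ha_restart_priority ~rpc ~session_id ~self:vm
        ~value:Constants.ha_restart_best_effort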
2 parents: 2d17143 + 34bdb57

4 files changed (+113 -30 lines)

ocaml/xapi/xapi_globs.ml

Lines changed: 5 additions & 0 deletions
@@ -436,6 +436,10 @@ let xapi_clusterd_port = ref 8896
    *)
 let local_yum_repo_port = ref 8000
 
+(* The maximum number of start attempts for HA best-effort VMs. Each attempt is
+   spaced 20 seconds apart. *)
+let ha_best_effort_max_retries = ref 2
+
 (* When a host is known to be shutting down or rebooting, we add it's reference in here.
    This can be used to force the Host_metrics.live flag to false. *)
 let hosts_which_are_shutting_down : API.ref_host list ref = ref []

@@ -1238,6 +1242,7 @@ let xapi_globs_spec =
   ; ("max_observer_file_size", Int max_observer_file_size)
   ; ("test-open", Int test_open) (* for consistency with xenopsd *)
   ; ("local_yum_repo_port", Int local_yum_repo_port)
+  ; ("ha_best_effort_max_retries", Int ha_best_effort_max_retries)
   ]
 
 let xapi_globs_spec_with_descriptions =
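
Because the new ref is registered in the xapi_globs_spec table alongside the other integer tunables, it should be overridable the same way those keys are (presumably via a `ha_best_effort_max_retries=<n>` line in xapi's config file; that is an inference from the table, not something this diff shows). In code the cap is read by dereferencing the ref; a minimal sketch, with a hypothetical helper name:

    (* Minimal sketch: the cap is an int ref, read with (!) where the retry
       decision is made. With the default of 2 and attempts spaced ~20 seconds
       apart (per the comment above), a persistently failing best-effort VM is
       given up on after the initial attempt plus two retries. *)
    let may_retry ~attempts = attempts < !Xapi_globs.ha_best_effort_max_retries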

ocaml/xapi/xapi_ha.ml

Lines changed: 12 additions & 13 deletions
@@ -508,24 +508,26 @@ module Monitor = struct
             let liveset_uuids =
               List.sort compare (uuids_of_liveset liveset)
             in
+            let to_refs uuids =
+              List.map
+                (fun uuid ->
+                  Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid)
+                )
+                uuids
+            in
+            let last_live_set = to_refs !last_liveset_uuids in
             if !last_liveset_uuids <> liveset_uuids then (
               warn
                 "Liveset looks different; assuming we need to rerun the \
                  planner" ;
               plan_out_of_date := true ;
               last_liveset_uuids := liveset_uuids
             ) ;
-            let liveset_refs =
-              List.map
-                (fun uuid ->
-                  Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid)
-                )
-                liveset_uuids
-            in
+            let live_set = to_refs liveset_uuids in
             if local_failover_decisions_are_ok () then (
               try
                 Xapi_ha_vm_failover.restart_auto_run_vms ~__context
-                  liveset_refs to_tolerate
+                  ~last_live_set ~live_set to_tolerate
               with e ->
                 log_backtrace e ;
                 error

@@ -539,9 +541,7 @@ module Monitor = struct
           (* Next update the Host_metrics.live value to spot hosts coming back *)
           let all_hosts = Db.Host.get_all ~__context in
           let livemap =
-            List.map
-              (fun host -> (host, List.mem host liveset_refs))
-              all_hosts
+            List.map (fun host -> (host, List.mem host live_set)) all_hosts
           in
           List.iter
             (fun (host, live) ->

@@ -704,8 +704,7 @@ module Monitor = struct
               in
               if plan_too_old || !plan_out_of_date then (
                 let changed =
-                  Xapi_ha_vm_failover.update_pool_status ~__context
-                    ~live_set:liveset_refs ()
+                  Xapi_ha_vm_failover.update_pool_status ~__context ~live_set ()
                 in
                 (* Extremely bad: something managed to break our careful plan *)
                 if changed && not !plan_out_of_date then
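
The subtle point in the first hunk above is ordering: last_live_set is materialised from !last_liveset_uuids before the branch that overwrites that ref, so restart_auto_run_vms receives the previous live set alongside the current one. A standalone sketch of that snapshot-before-update pattern (hypothetical names, not from the diff):

    (* Capture the old value of a mutable ref before overwriting it, so both
       the previous and the current state can be handed downstream. *)
    let last_liveset : string list ref = ref []

    let on_liveset_report (current : string list) =
      let previous = !last_liveset in (* snapshot first... *)
      if previous <> current then
        last_liveset := current ; (* ...then update *)
      (previous, current)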

ocaml/xapi/xapi_ha_vm_failover.ml

Lines changed: 91 additions & 16 deletions
@@ -1259,9 +1259,26 @@ let restart_failed : (API.ref_VM, unit) Hashtbl.t = Hashtbl.create 10
 (* We also limit the rate we attempt to retry starting the VM. *)
 let last_start_attempt : (API.ref_VM, float) Hashtbl.t = Hashtbl.create 10
 
+module VMRefOrd = struct
+  type t = [`VM] Ref.t
+
+  let compare = Ref.compare
+end
+
+module VMMap = Map.Make (VMRefOrd)
+
+(* When a host comes up it is added to the HA live set, but it may still be
+   disabled, so attempts to start best-effort VMs on it would fail.
+   Meanwhile we don't want to keep retrying to start them forever. This map
+   remembers the best-effort VMs which failed to start for that reason: the
+   key is the VM ref and the value is the number of start attempts so far.
+   The cap avoids retrying forever and can be adjusted according to how
+   long the host takes to become enabled after joining the HA live set. *)
+let tried_best_eff_vms = ref VMMap.empty
+
 (* Takes the current live_set and number of hosts we're planning to handle, updates the host records in the database
    and restarts any offline protected VMs *)
-let restart_auto_run_vms ~__context live_set n =
+let restart_auto_run_vms ~__context ~last_live_set ~live_set n =
   (* ensure we have live=false on the host_metrics for those hosts not in the live_set; and force state to Halted for
      all VMs that are "running" or "paused" with resident_on set to one of the hosts that is now dead
   *)
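
To make the bookkeeping concrete, here is a self-contained sketch of the bounded counter that tried_best_eff_vms implements, with string keys standing in for VM refs and a local max_retries standing in for !Xapi_globs.ha_best_effort_max_retries; it mirrors the VMMap.update/Option.fold logic in the next hunk:

    (* Self-contained sketch of the bounded retry counter, with string keys
       standing in for VM refs. *)
    module M = Map.Make (String)

    let max_retries = 2

    (* On a failed start: a first failure records count 1; later failures
       increment the count until the cap is hit, after which the entry is
       dropped so the VM is no longer retried. *)
    let record_failure vm tried =
      M.update vm
        (function
          | None -> Some 1
          | Some n when n < max_retries -> Some (n + 1)
          | Some _ -> None
          )
        tried

    (* On a successful start: forget the VM entirely. *)
    let record_success vm tried = M.remove vm tried

Three consecutive failures with the default cap of 2 leave the map without an entry for the VM, which is exactly the point at which HA stops retrying it.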
@@ -1566,32 +1583,90 @@ let restart_auto_run_vms ~__context live_set n =
               ok since this is 'best-effort'). NOTE we do not use the restart_vm function above as this will mark the
               pool as overcommitted if an HA_OPERATION_WOULD_BREAK_FAILOVER_PLAN is received (although this should never
               happen it's better safe than sorry) *)
-          map_parallel
-            ~order_f:(fun vm -> order_f (vm, Db.VM.get_record ~__context ~self:vm))
-            (fun vm ->
+          let is_best_effort r =
+            r.API.vM_ha_restart_priority = Constants.ha_restart_best_effort
+            && r.API.vM_power_state = `Halted
+          in
+          let resets =
+            !reset_vms
+            |> List.map (fun self -> (self, Db.VM.get_record ~__context ~self))
+          in
+          let revalidate_tried m =
+            let valid, invalid =
+              VMMap.bindings m
+              |> List.partition_map (fun (self, _) ->
+                     match Db.VM.get_record ~__context ~self with
+                     | r ->
+                         Left (self, r)
+                     | exception _ ->
+                         Right self
+                 )
+            in
+            let to_retry, to_remove =
+              List.partition (fun (_, r) -> is_best_effort r) valid
+            in
+            let m' =
+              List.map fst to_remove
+              |> List.rev_append invalid
+              |> List.fold_left (fun acc vm -> VMMap.remove vm acc) m
+            in
+            (to_retry, m')
+          in
+          let best_effort_vms =
+            (* Carefully decide which best-effort VMs we should attempt to start. *)
+            let all_prot_is_ok = List.for_all (fun (_, r) -> r = Ok ()) started in
+            let is_better = List.compare_lengths live_set last_live_set > 0 in
+            ( match (all_prot_is_ok, is_better, last_live_set = live_set) with
+            | true, true, _ ->
+                (* Try to start all the best-effort halted VMs when HA is being
+                   enabled or some hosts are transitioning to HA live.
+                   The DB has been updated by Xapi_vm_lifecycle.force_state_reset.
+                   Read again. *)
+                tried_best_eff_vms := VMMap.empty ;
+                Db.VM.get_all_records ~__context
+            | true, false, true ->
+                (* Retry the best-effort VMs which were attempted but failed last time. *)
+                let to_retry, m = revalidate_tried !tried_best_eff_vms in
+                tried_best_eff_vms := m ;
+                List.rev_append to_retry resets
+            | true, false, false | false, _, _ ->
+                (* Try to start only the reset VMs. They were observed as residing
+                   on the non-live hosts in this run.
+                   Give up starting tried VMs as the HA situation changes. *)
+                tried_best_eff_vms := VMMap.empty ;
+                resets
+            )
+            |> List.filter (fun (_, r) -> is_best_effort r)
+          in
+          map_parallel ~order_f
+            (fun (vm, _) ->
               ( vm
-              , if
-                  Db.VM.get_power_state ~__context ~self:vm = `Halted
-                  && Db.VM.get_ha_restart_priority ~__context ~self:vm
-                     = Constants.ha_restart_best_effort
-                then
-                  TaskChains.task (fun () ->
-                      Client.Client.Async.VM.start ~rpc ~session_id ~vm
-                        ~start_paused:false ~force:true
-                  )
-                else
-                  TaskChains.ok Rpc.Null
+              , TaskChains.task (fun () ->
+                    Client.Client.Async.VM.start ~rpc ~session_id ~vm
+                      ~start_paused:false ~force:true
+                )
               )
             )
-            !reset_vms
+            best_effort_vms
          |> List.iter (fun (vm, result) ->
                 match result with
                 | Error e ->
+                    tried_best_eff_vms :=
+                      VMMap.update vm
+                        (Option.fold ~none:(Some 1) ~some:(fun n ->
+                             if n < !Xapi_globs.ha_best_effort_max_retries then
+                               Some (n + 1)
+                             else
+                               None
+                         )
+                        )
+                        !tried_best_eff_vms ;
                     error "Failed to restart best-effort VM %s (%s): %s"
                       (Db.VM.get_uuid ~__context ~self:vm)
                       (Db.VM.get_name_label ~__context ~self:vm)
                       (ExnHelper.string_of_exn e)
                 | Ok _ ->
+                    tried_best_eff_vms := VMMap.remove vm !tried_best_eff_vms ;
                     ()
             )
        )
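
The three-way match above is the heart of the fix. Restated as a standalone decision function (hypothetical type and names, not part of the patch), it reads as a small decision table:

    (* Which best-effort VMs to (re)start in this monitor pass. *)
    type action =
      | Start_all_halted (* live set grew: start every halted best-effort VM *)
      | Retry_failed_plus_resets (* live set unchanged: retry last round's failures *)
      | Resets_only (* live set shrank or changed, or a protected VM failed *)

    let decide ~all_prot_is_ok ~live_set_grew ~live_set_unchanged =
      match (all_prot_is_ok, live_set_grew, live_set_unchanged) with
      | true, true, _ ->
          Start_all_halted
      | true, false, true ->
          Retry_failed_plus_resets
      | true, false, false | false, _, _ ->
          Resets_only

This is what makes the scenario in the commit message work: when the failed host comes back, the live set grows, so every halted best-effort VM (including a non-agile one pinned to local storage) gets a fresh start attempt on the recovered host.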

ocaml/xapi/xapi_ha_vm_failover.mli

Lines changed: 5 additions & 1 deletion
@@ -18,7 +18,11 @@
 val all_protected_vms : __context:Context.t -> (API.ref_VM * API.vM_t) list
 
 val restart_auto_run_vms :
-  __context:Context.t -> API.ref_host list -> int -> unit
+     __context:Context.t
+  -> last_live_set:API.ref_host list
+  -> live_set:API.ref_host list
+  -> int
+  -> unit
 (** Take a set of live VMs and attempt to restart all protected VMs which have failed *)
 
 val compute_evacuation_plan :
