diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml
index 490045f871..3688478dce 100644
--- a/ocaml/xapi/xapi_globs.ml
+++ b/ocaml/xapi/xapi_globs.ml
@@ -436,6 +436,10 @@ let xapi_clusterd_port = ref 8896
 *)
 let local_yum_repo_port = ref 8000
 
+(* The maximum number of start attempts for HA best-effort VMs. Successive
+   attempts are spaced 20 seconds apart. *)
+let ha_best_effort_max_retries = ref 2
+
 (* When a host is known to be shutting down or rebooting, we add it's reference
    in here. This can be used to force the Host_metrics.live flag to false. *)
 let hosts_which_are_shutting_down : API.ref_host list ref = ref []
@@ -1238,6 +1242,7 @@ let xapi_globs_spec =
   ; ("max_observer_file_size", Int max_observer_file_size)
   ; ("test-open", Int test_open) (* for consistency with xenopsd *)
   ; ("local_yum_repo_port", Int local_yum_repo_port)
+  ; ("ha_best_effort_max_retries", Int ha_best_effort_max_retries)
   ]
 
 let xapi_globs_spec_with_descriptions =
diff --git a/ocaml/xapi/xapi_ha.ml b/ocaml/xapi/xapi_ha.ml
index e88ecf1376..fda471b186 100644
--- a/ocaml/xapi/xapi_ha.ml
+++ b/ocaml/xapi/xapi_ha.ml
@@ -508,6 +508,14 @@ module Monitor = struct
           let liveset_uuids =
             List.sort compare (uuids_of_liveset liveset)
           in
+          let to_refs uuids =
+            List.map
+              (fun uuid ->
+                Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid)
+              )
+              uuids
+          in
+          let last_live_set = to_refs !last_liveset_uuids in
           if !last_liveset_uuids <> liveset_uuids then (
             warn
               "Liveset looks different; assuming we need to rerun the \
                planner" ;
             plan_out_of_date := true ;
             last_liveset_uuids := liveset_uuids
           ) ;
-          let liveset_refs =
-            List.map
-              (fun uuid ->
-                Db.Host.get_by_uuid ~__context ~uuid:(Uuidx.to_string uuid)
-              )
-              liveset_uuids
-          in
+          let live_set = to_refs liveset_uuids in
           if local_failover_decisions_are_ok () then (
             try
               Xapi_ha_vm_failover.restart_auto_run_vms ~__context
-                liveset_refs to_tolerate
+                ~last_live_set ~live_set to_tolerate
             with e ->
               log_backtrace e ;
               error
@@ -539,9 +541,7 @@ module Monitor = struct
           (* Next update the Host_metrics.live value to spot hosts coming back *)
           let all_hosts = Db.Host.get_all ~__context in
           let livemap =
-            List.map
-              (fun host -> (host, List.mem host liveset_refs))
-              all_hosts
+            List.map (fun host -> (host, List.mem host live_set)) all_hosts
           in
           List.iter
             (fun (host, live) ->
@@ -704,8 +704,7 @@ module Monitor = struct
           in
           if plan_too_old || !plan_out_of_date then (
             let changed =
-              Xapi_ha_vm_failover.update_pool_status ~__context
-                ~live_set:liveset_refs ()
+              Xapi_ha_vm_failover.update_pool_status ~__context ~live_set ()
             in
             (* Extremely bad: something managed to break our careful plan *)
             if changed && not !plan_out_of_date then
diff --git a/ocaml/xapi/xapi_ha_vm_failover.ml b/ocaml/xapi/xapi_ha_vm_failover.ml
index 5cbb946b15..5c43984541 100644
--- a/ocaml/xapi/xapi_ha_vm_failover.ml
+++ b/ocaml/xapi/xapi_ha_vm_failover.ml
@@ -1259,9 +1259,26 @@ let restart_failed : (API.ref_VM, unit) Hashtbl.t = Hashtbl.create 10
 (* We also limit the rate we attempt to retry starting the VM. *)
 let last_start_attempt : (API.ref_VM, float) Hashtbl.t = Hashtbl.create 10
 
+module VMRefOrd = struct
+  type t = [`VM] Ref.t
+
+  let compare = Ref.compare
+end
+
+module VMMap = Map.Make (VMRefOrd)
+
+(* When a host comes up it is added to the HA live set, but it may still be
+   disabled, in which case starting best-effort VMs on it will fail. Meanwhile
+   we do not want to retry starting them forever.
+   This map remembers the best-effort VMs which failed to start for that
+   reason: the key is the VM ref and the value is the number of start attempts
+   made so far. The retry limit can be tuned to match how long a host takes to
+   become enabled after joining the HA live set. *)
+let tried_best_eff_vms = ref VMMap.empty
+
 (* Takes the current live_set and number of hosts we're planning to handle, updates the host records in the database
    and restarts any offline protected VMs *)
-let restart_auto_run_vms ~__context live_set n =
+let restart_auto_run_vms ~__context ~last_live_set ~live_set n =
   (* ensure we have live=false on the host_metrics for those hosts not in the
      live_set; and force state to Halted for all VMs that are "running" or
      "paused" with resident_on set to one of the hosts that is now dead *)
@@ -1566,32 +1583,90 @@ let restart_auto_run_vms ~__context live_set n =
         ok since this is 'best-effort'). NOTE we do not use the restart_vm function above as this will mark the pool as
         overcommitted if an HA_OPERATION_WOULD_BREAK_FAILOVER_PLAN is received
         (although this should never happen it's better safe than sorry) *)
-      map_parallel
-        ~order_f:(fun vm -> order_f (vm, Db.VM.get_record ~__context ~self:vm))
-        (fun vm ->
+      let is_best_effort r =
+        r.API.vM_ha_restart_priority = Constants.ha_restart_best_effort
+        && r.API.vM_power_state = `Halted
+      in
+      let resets =
+        !reset_vms
+        |> List.map (fun self -> (self, Db.VM.get_record ~__context ~self))
+      in
+      let revalidate_tried m =
+        let valid, invalid =
+          VMMap.bindings m
+          |> List.partition_map (fun (self, _) ->
+                 match Db.VM.get_record ~__context ~self with
+                 | r ->
+                     Left (self, r)
+                 | exception _ ->
+                     Right self
+             )
+        in
+        let to_retry, to_remove =
+          List.partition (fun (_, r) -> is_best_effort r) valid
+        in
+        let m' =
+          List.map fst to_remove
+          |> List.rev_append invalid
+          |> List.fold_left (fun acc vm -> VMMap.remove vm acc) m
+        in
+        (to_retry, m')
+      in
+      let best_effort_vms =
+        (* Carefully decide which best-effort VMs to attempt to start. *)
+        let all_prot_is_ok = List.for_all (fun (_, r) -> r = Ok ()) started in
+        let is_better = List.compare_lengths live_set last_live_set > 0 in
+        ( match (all_prot_is_ok, is_better, last_live_set = live_set) with
+        | true, true, _ ->
+            (* Try to start all the best-effort halted VMs when HA is being
+               enabled or some hosts are transitioning to HA live.
+               The DB has been updated by Xapi_vm_lifecycle.force_state_reset.
+               Read it again. *)
+            tried_best_eff_vms := VMMap.empty ;
+            Db.VM.get_all_records ~__context
+        | true, false, true ->
+            (* Retry best-effort VMs that were attempted but failed last time. *)
+            let to_retry, m = revalidate_tried !tried_best_eff_vms in
+            tried_best_eff_vms := m ;
+            List.rev_append to_retry resets
+        | true, false, false | false, _, _ ->
+            (* Try to start only the reset VMs. They were observed as residing
+               on the non-live hosts in this run. Give up on the previously
+               tried VMs, as the HA situation has changed. *)
+            tried_best_eff_vms := VMMap.empty ;
+            resets
+        )
+        |> List.filter (fun (_, r) -> is_best_effort r)
+      in
+      map_parallel ~order_f
+        (fun (vm, _) ->
           ( vm
-          , if
-              Db.VM.get_power_state ~__context ~self:vm = `Halted
-              && Db.VM.get_ha_restart_priority ~__context ~self:vm
-                 = Constants.ha_restart_best_effort
-            then
-              TaskChains.task (fun () ->
-                  Client.Client.Async.VM.start ~rpc ~session_id ~vm
-                    ~start_paused:false ~force:true
-              )
-            else
-              TaskChains.ok Rpc.Null
+          , TaskChains.task (fun () ->
+                Client.Client.Async.VM.start ~rpc ~session_id ~vm
+                  ~start_paused:false ~force:true
+            )
           )
         )
-        !reset_vms
+        best_effort_vms
       |> List.iter (fun (vm, result) ->
              match result with
             | Error e ->
+                 tried_best_eff_vms :=
+                   VMMap.update vm
+                     (Option.fold ~none:(Some 1) ~some:(fun n ->
+                          if n < !Xapi_globs.ha_best_effort_max_retries then
+                            Some (n + 1)
+                          else
+                            None
+                      )
+                     )
+                     !tried_best_eff_vms ;
                  error "Failed to restart best-effort VM %s (%s): %s"
                    (Db.VM.get_uuid ~__context ~self:vm)
                    (Db.VM.get_name_label ~__context ~self:vm)
                    (ExnHelper.string_of_exn e)
             | Ok _ ->
+                 tried_best_eff_vms := VMMap.remove vm !tried_best_eff_vms ;
                  ()
         )
      )
diff --git a/ocaml/xapi/xapi_ha_vm_failover.mli b/ocaml/xapi/xapi_ha_vm_failover.mli
index 20eb3b6b84..abf6374822 100644
--- a/ocaml/xapi/xapi_ha_vm_failover.mli
+++ b/ocaml/xapi/xapi_ha_vm_failover.mli
@@ -18,7 +18,11 @@ val all_protected_vms : __context:Context.t -> (API.ref_VM * API.vM_t) list
 
 val restart_auto_run_vms :
-  __context:Context.t -> API.ref_host list -> int -> unit
+     __context:Context.t
+  -> last_live_set:API.ref_host list
+  -> live_set:API.ref_host list
+  -> int
+  -> unit
 (** Take a set of live VMs and attempt to restart all protected VMs which have
     failed *)
 
 val compute_evacuation_plan :
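
For illustration only, and not part of the patch: the retry bookkeeping performed in the Error/Ok branches above can be read in isolation as the sketch below, with string keys standing in for VM refs and a local max_retries constant standing in for !Xapi_globs.ha_best_effort_max_retries. A failed start records or bumps the VM's attempt count while it is below the limit, the failure after the limit has been reached drops the entry so the VM is no longer retried, and a successful start removes the entry immediately.

(* Standalone sketch of the tried_best_eff_vms bookkeeping; not xapi code.
   String keys stand in for VM refs and max_retries stands in for
   !Xapi_globs.ha_best_effort_max_retries. *)
module VMMap = Map.Make (String)

let max_retries = 2

(* Failed Async.VM.start: the first failure records 1; further failures bump
   the count while it is below the limit; the failure after the limit has been
   reached drops the entry, i.e. we stop retrying this VM. *)
let record_failure vm tried =
  VMMap.update vm
    (Option.fold ~none:(Some 1) ~some:(fun n ->
         if n < max_retries then Some (n + 1) else None))
    tried

(* Successful start: the VM no longer needs retrying. *)
let record_success vm tried = VMMap.remove vm tried

let () =
  let tried = VMMap.empty |> record_failure "vm1" |> record_failure "vm1" in
  assert (VMMap.find_opt "vm1" tried = Some 2) ;
  (* The third failure exceeds max_retries, so the VM drops out of the map. *)
  let tried = record_failure "vm1" tried in
  assert (VMMap.find_opt "vm1" tried = None) ;
  assert (VMMap.is_empty (record_success "vm1" tried))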