Skip to content

Commit bc3f4c5

Browse files
authored
XSI-2155/CA-411684: When a host has most of its memory used, restarted VMs are not placed in a single NUMA node (#6929)
Xenopsd takes the minimum between the actual free memory on a NUMA node, and an estimate. The estimate never got implemented, so it just took the last free memory value, which resulted in the free memory only ever decreasing on a NUMA node, and once the host memory is fully used NUMA optimization would stop working for newly (re)booted VMs, even if meanwhile other VMs were stopped. Master has a different fix (it uses claims, and tracks memory usage accurately), but that requires a newer version of Xen. Initially it was decided that this bug wouldn't get fixed on the LCM branch, but that decision has changed now. Instead it should track how much memory the pending VM starts are using. This estimate can become inaccurate fairly quickly on old versions of Xen without per-node claims. To reduce that window we only use the estimates when we have other pending domain builds. When we have 0 pending domain builds, we reset the estimate to the actual free memory on the node (note: reset, *not* take the minimum). Then finally we take the minimum between the actual free memory on a node and the above estimate. In practice this is not as simple as that, because Xen doesn't always listen to us when we give it a CPU affinity hint, and sometimes it places the memory on other nodes. This happens particularly when the last 4GiB (the 32-bit DMA zone) is used up on node0: Xen would only use this as a last resort (some devices can only work if you have memory available here). To fix this xenopsd considers node0 to have less memory available than it actually does (so it'll prefer other nodes instead of using up the last 4GiB). And finally it'll spread the last VMs across all nodes, when we can't accurately estimate how much memory ends up where (we don't know how much of the DMA32 heap is already used or not by devices). This bug also affects master in a different way: there due to the claims Xen does *exactly* what we ask it to do, and will completely deplete the memory on node0. 
But that could cause failures with some devices, so we might need to include the second commit in some form on master too (maybe don't reserve the full 4GiB, but only a few hundred MiB?). Some manual testing shows promising results: previously VMs would get all/all affinity, and after this fix they get the proper affinity again. Getting the automated tests to pass is still an open problem due to the 4GiB handling, which will also require relaxing some overly strict test assertions, so opening this as a draft.
2 parents a691062 + a16a427 commit bc3f4c5

File tree

4 files changed

+186
-9
lines changed

4 files changed

+186
-9
lines changed

ocaml/xenopsd/lib/topology.ml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ module NUMAResource = struct
4646

4747
let min_memory r1 r2 = {r1 with memfree= min r1.memfree r2.memfree}
4848

49+
(** [shrink_memory r mem] is [r] with [mem] bytes subtracted from its free
    memory, clamped so the result never goes below [0L]. *)
let shrink_memory r mem =
  let remaining = Int64.sub r.memfree mem in
  {r with memfree= (if Int64.compare remaining 0L < 0 then 0L else remaining)}
51+
4952
let pp_dump =
5053
Fmt.(
5154
Dump.record

ocaml/xenopsd/lib/topology.mli

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ module NUMAResource : sig
5656
val min_memory : t -> t -> t
5757
(** [min_memory a b] is [a] with the memory set to the minimum of [a] and [b] *)
5858

59+
val shrink_memory : t -> int64 -> t
60+
(** [shrink_memory a mem] is [a] with the memory set to [max 0 (a-mem)] *)
61+
5962
val pp_dump : t Fmt.t
6063
(** [pp_dump ppf v] pretty-prints [v] on [ppf] *)
6164
end
@@ -89,6 +92,8 @@ module NUMA : sig
8992
(** A NUMA node index. Distinct from an int to avoid mixing with CPU numbers *)
9093
type node = private Node of int
9194

95+
module NodeMap : Map.S with type key = node
96+
9297
val make :
9398
distances:int array array
9499
-> cpu_to_node:int array

ocaml/xenopsd/lib/xenopsd.ml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ let numa_placement_compat = ref true
7272

7373
let numa_best_effort_prio_mem_only = ref false
7474

75+
(* Exponent N: the NUMA planner treats node0 as having 2^N fewer bytes free,
   similar to how Xen protects the low 4GiB of memory for the DMA heap.
   Default 31 (= 2GiB reserved). Settable via the
   "numa-reserve-node0-dmaheap-bits" command-line option. *)
let numa_reserve_node0_dmaheap_bits = ref 31
76+
7577
(* O(N^2) operations, until we get a xenstore cache, so use a small number here *)
7678
let vm_guest_agent_xenstore_quota = ref 128
7779

@@ -282,6 +284,12 @@ let options =
282284
filtered NUMA nodes based on available memory. Only use if there are \
283285
issues with the new best effort policy"
284286
)
287+
; ( "numa-reserve-node0-dmaheap-bits"
288+
, Arg.Int (fun x -> numa_reserve_node0_dmaheap_bits := x)
289+
, (fun () -> string_of_int !numa_reserve_node0_dmaheap_bits)
290+
, "Reserve 2^N bytes on node0 in the NUMA planner, similar to how Xen would\n\
291+
\ protect the low 4GiB of the memory for the DMA heap"
292+
)
285293
; ( "pci-quarantine"
286294
, Arg.Bool (fun b -> pci_quarantine := b)
287295
, (fun () -> string_of_bool !pci_quarantine)

ocaml/xenopsd/xc/domain.ml

Lines changed: 170 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -984,9 +984,134 @@ let numa_hierarchy =
984984
NUMA.make ~distances ~cpu_to_node ~node_cores
985985
)
986986

987+
(* Tracks per-domain estimates of NUMA node memory usage for domains that are
   being built (or were recently built). Old Xen versions without per-node
   claims give us no way to query where a pending domain's memory will land,
   so we keep our own estimates and subtract them from the free memory that
   Xen reports per node.

   Fix vs. original: corrected the misspelled debug message in
   [transfer_outstanding] ("transfering" -> "transferring"). *)
module NUMAResources = struct
  module DomidMap = Map.Make (Int)

  (* Estimated memory usage in bytes, keyed by NUMA node. *)
  type node_memusage = int64 Topology.NUMA.NodeMap.t

  type t = {
      host: Topology.NUMAResource.t Topology.NUMA.NodeMap.t
          (* actual per-node resources as last observed on the host *)
    ; estimated: node_memusage DomidMap.t
          (* usage estimates for domains whose build has completed *)
    ; outstanding_builds: node_memusage DomidMap.t
          (* usage estimates for domains whose build is still in progress *)
  }

  (* [node_memusage_of_domidmap node m] sums the estimated usage on [node]
     across all domains in [m]. Domains with no entry for [node] count as 0. *)
  let node_memusage_of_domidmap node t =
    DomidMap.fold
      (fun _ nodemap acc ->
        Topology.NUMA.NodeMap.find_opt node nodemap
        |> Option.value ~default:0L
        |> Int64.add acc
      )
      t 0L

  (* [to_nodes t] is the host's per-node resources with both the estimated
     and the outstanding usage subtracted (clamped at 0 by
     [NUMAResource.shrink_memory]). *)
  let to_nodes t =
    let open Topology in
    t.host
    |> NUMA.NodeMap.mapi @@ fun node r ->
       NUMAResource.shrink_memory r
         (Int64.add
            (node_memusage_of_domidmap node t.estimated)
            (node_memusage_of_domidmap node t.outstanding_builds)
         )

  (* [of_host nodes] starts tracking afresh: actual free memory taken from
     [nodes], with no usage estimates. *)
  let of_host nodes =
    {
      host= nodes |> Topology.NUMA.NodeMap.of_seq
    ; estimated= DomidMap.empty
    ; outstanding_builds= DomidMap.empty
    }

  let empty =
    {
      host= Topology.NUMA.NodeMap.empty
    ; estimated= DomidMap.empty
    ; outstanding_builds= DomidMap.empty
    }

  let pp_dump_nodemap pp_elt =
    let open Topology.NUMA in
    Fmt.(Dump.iter_bindings NodeMap.iter (any "nodemap") pp_dump_node pp_elt)

  let pp_dump_domidmap pp_elt =
    Fmt.(Dump.iter_bindings DomidMap.iter (any "domidmap") int pp_elt)

  let pp_dump_numa_resources =
    Fmt.Dump.(
      record
        [
          field "host"
            (fun t -> t.host)
            (pp_dump_nodemap Topology.NUMAResource.pp_dump)
        ; field "estimated"
            (fun t -> t.estimated)
            (pp_dump_domidmap @@ pp_dump_nodemap Fmt.int64)
        ; field "outstanding_builds"
            (fun t -> t.outstanding_builds)
            (pp_dump_domidmap @@ pp_dump_nodemap Fmt.int64)
        ]
    )

  (* [add_outstanding domid plan t] records [plan] as the in-progress build
     estimate for [domid]. *)
  let add_outstanding domid plan t =
    D.debug "domid %u: adding outstanding NUMA memory usage estimate: %s" domid
      ((Fmt.to_to_string @@ pp_dump_nodemap Fmt.int64) plan) ;
    {t with outstanding_builds= DomidMap.add domid plan t.outstanding_builds}

  (* [remove_outstanding domid t] drops the in-progress estimate for [domid],
     e.g. when its build finishes or fails. *)
  let remove_outstanding domid t =
    D.debug "domid %u: removing outstanding NUMA memory usage estimate" domid ;
    {t with outstanding_builds= DomidMap.remove domid t.outstanding_builds}

  (* [transfer_outstanding domid t] moves [domid]'s estimate from
     [outstanding_builds] to [estimated] once its build completed.
     No-op when [domid] has no outstanding estimate. *)
  let transfer_outstanding domid t =
    D.debug "domid %u: transferring outstanding NUMA memory usage estimate"
      domid ;
    DomidMap.find_opt domid t.outstanding_builds
    |> Option.fold ~none:t ~some:(fun plan ->
           {
             t with
             estimated= DomidMap.add domid plan t.estimated
           ; outstanding_builds= DomidMap.remove domid t.outstanding_builds
           }
       )

  (* [estimate_plan nodes mem plan] distributes [mem] bytes over the nodes in
     [plan]: each node is assigned an equal share of the (initial) remainder,
     capped by that node's free memory; any shortfall carries over to the
     remaining nodes. Returns the per-node allocation map. *)
  let estimate_plan nodes mem plan =
    let open Topology in
    let _, _, nodemap =
      List.fold_left
        (fun (mem, remaining_nodes, acc) node ->
          let per_node_usage =
            Int64.div mem (remaining_nodes |> Int64.of_int)
          in
          let r = NUMA.NodeMap.find node nodes in
          let allocated_from_node =
            Int64.min per_node_usage r.NUMAResource.memfree
          in
          ( Int64.sub mem allocated_from_node
          , remaining_nodes - 1
          , NUMA.NodeMap.add node allocated_from_node acc
          )
        )
        (mem, List.length plan, NUMA.NodeMap.empty)
        plan
    in
    nodemap
end
1096+
9871097
let numa_mutex = Mutex.create ()

(* protected by numa_mutex *)
let numa_resources = ref NUMAResources.empty

(* [numa_reserve_node0 mem] is [mem] minus the configured DMA-heap
   reservation (2^numa_reserve_node0_dmaheap_bits bytes), clamped at 0.

   Prefer nodes other than node0 when node0 is low on memory.
   This is reserved for the DMA heap in Xen, and even if we try to make a VM
   use it up, Xen will move some of its memory to other nodes to avoid
   running out.
   Without the claim API we have no way of forcing or querying this
   (and even with the claim API it'd be desirable to avoid running out on
   node0). *)
let numa_reserve_node0 mem =
  let reserved = Int64.shift_left 1L !Xenopsd.numa_reserve_node0_dmaheap_bits in
  let remaining = Int64.sub mem reserved in
  let mem' = if Int64.compare remaining 0L < 0 then 0L else remaining in
  D.debug "numa_reserve_node0, reducing free memory: %Lu -> %Lu" mem mem' ;
  mem'
9901115

9911116
let numa_init () =
9921117
let xcext = Xenctrlext.get_handle () in
@@ -1007,28 +1132,48 @@ let set_affinity = function
10071132
| Xenops_server.Soft ->
10081133
Xenctrlext.vcpu_setaffinity_soft
10091134

1010-
let numa_placement domid ~vcpus ~cores ~memory affinity =
1135+
let numa_placement domid ~vcpus ~cores ~memory ~required_free affinity =
10111136
let open Xenctrlext in
10121137
let open Topology in
1138+
let open NUMAResources in
10131139
with_lock numa_mutex (fun () ->
10141140
let ( let* ) = Option.bind in
10151141
let xcext = get_handle () in
10161142
let* host = Lazy.force numa_hierarchy in
10171143
let numa_meminfo = (numainfo xcext).memory |> Array.to_seq in
10181144
let nodes =
10191145
Seq.map2
1020-
(fun node m -> NUMA.resource host node ~memory:m.memfree)
1146+
(fun node m ->
1147+
( node
1148+
, let (NUMA.Node nodeid) = node in
1149+
let memory = m.memfree in
1150+
let memory =
1151+
if nodeid = 0 then
1152+
numa_reserve_node0 memory
1153+
else
1154+
memory
1155+
in
1156+
NUMA.resource host node ~memory
1157+
)
1158+
)
10211159
(NUMA.nodes host) numa_meminfo
10221160
in
10231161
let vm = NUMARequest.make ~memory ~vcpus ~cores in
1162+
1163+
let previous = !numa_resources in
1164+
if DomidMap.is_empty previous.outstanding_builds then begin
1165+
(* no outstanding domain builds: synchronize actual free node memory
1166+
with host *)
1167+
numa_resources := of_host nodes ;
1168+
D.debug "no outstanding domain builds, dropping NUMA usage estimates"
1169+
end ;
1170+
D.debug "numa_resources(in): %s"
1171+
@@ Fmt.to_to_string pp_dump_numa_resources !numa_resources ;
1172+
1173+
let estimated_nodes = to_nodes !numa_resources in
10241174
let nodea =
1025-
match !numa_resources with
1026-
| None ->
1027-
Array.of_seq nodes
1028-
| Some a ->
1029-
Array.map2 NUMAResource.min_memory (Array.of_seq nodes) a
1175+
estimated_nodes |> NUMA.NodeMap.to_seq |> Seq.map snd |> Array.of_seq
10301176
in
1031-
numa_resources := Some nodea ;
10321177
let cpu_affinity, memory_plan =
10331178
match Softaffinity.plan ~vm host nodea with
10341179
| None ->
@@ -1037,6 +1182,12 @@ let numa_placement domid ~vcpus ~cores ~memory affinity =
10371182
| Some (cpu_affinity, mem_plan) ->
10381183
(Some cpu_affinity, mem_plan)
10391184
in
1185+
numa_resources :=
1186+
add_outstanding domid
1187+
(estimate_plan estimated_nodes required_free memory_plan)
1188+
!numa_resources ;
1189+
D.debug "numa_resources(out): %s"
1190+
@@ Fmt.to_to_string pp_dump_numa_resources !numa_resources ;
10401191
let set_vcpu_affinity = function
10411192
| None ->
10421193
D.debug "%s: not setting vcpu affinity for domain %d" __FUNCTION__
@@ -1177,6 +1328,7 @@ let build_pre ~xc ~xs ~vcpus ~memory ~hard_affinity domid =
11771328
in
11781329
numa_placement domid ~vcpus ~cores
11791330
~memory:(Int64.mul memory.xen_max_mib 1048576L)
1331+
~required_free:(Int64.mul memory.required_host_free_mib 1048576L)
11801332
affinity
11811333
|> Option.map fst
11821334
)
@@ -1358,6 +1510,11 @@ let build (task : Xenops_task.task_handle) ~xc ~xs ~store_domid ~console_domid
13581510
let target_mib = Memory.mib_of_kib_used target_kib in
13591511
(* Sanity check. *)
13601512
assert (target_mib <= static_max_mib) ;
1513+
let finally () =
1514+
with_lock numa_mutex @@ fun () ->
1515+
numa_resources := NUMAResources.remove_outstanding domid !numa_resources
1516+
in
1517+
Fun.protect ~finally @@ fun () ->
13611518
let store_mfn, store_port, console_mfn, console_port, vm_stuff, domain_type =
13621519
match info.priv with
13631520
| BuildHVM hvminfo ->
@@ -1444,6 +1601,10 @@ let build (task : Xenops_task.task_handle) ~xc ~xs ~store_domid ~console_domid
14441601
)
14451602
in
14461603
let local_stuff = console_keys console_port console_mfn in
1604+
let () =
1605+
with_lock numa_mutex @@ fun () ->
1606+
numa_resources := NUMAResources.transfer_outstanding domid !numa_resources
1607+
in
14471608
build_post ~xc ~xs ~target_mib ~static_max_mib domid domain_type store_mfn
14481609
store_port local_stuff vm_stuff
14491610

0 commit comments

Comments
 (0)