diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs index 7aeeb6409f..e9d1d8c02e 100644 --- a/openhcl/underhill_core/src/dispatch/mod.rs +++ b/openhcl/underhill_core/src/dispatch/mod.rs @@ -34,6 +34,7 @@ use hyperv_ic_resources::shutdown::ShutdownRpc; use hyperv_ic_resources::shutdown::ShutdownType; use igvm_defs::MemoryMapEntryType; use inspect::Inspect; +use mana_driver::save_restore::ManaSavedState; use mesh::CancelContext; use mesh::MeshPayload; use mesh::error::RemoteError; @@ -114,6 +115,8 @@ pub trait LoadedVmNetworkSettings: Inspect { vmbus_server: &Option, dma_client_spawner: DmaClientSpawner, is_isolated: bool, + save_restore_supported: bool, + mana_state: Option<&ManaSavedState>, ) -> anyhow::Result; /// Callback when network is removed externally. @@ -127,6 +130,9 @@ pub trait LoadedVmNetworkSettings: Inspect { &self, mut params: PacketCaptureParams, ) -> anyhow::Result>; + + /// Save the network state for restoration after servicing. + async fn save(&mut self) -> Vec; } /// A VM that has been loaded and can be run. @@ -187,6 +193,7 @@ pub(crate) struct LoadedVm { pub _periodic_telemetry_task: Task<()>, pub nvme_keep_alive: bool, + pub mana_keep_alive: bool, pub test_configuration: Option, pub dma_manager: OpenhclDmaManager, } @@ -298,7 +305,7 @@ impl LoadedVm { WorkerRpc::Restart(rpc) => { let state = async { let running = self.stop().await; - match self.save(None, false).await { + match self.save(None, false, false).await { Ok(servicing_state) => Some((rpc, servicing_state)), Err(err) => { if running { @@ -363,7 +370,7 @@ impl LoadedVm { UhVmRpc::Save(rpc) => { rpc.handle_failable(async |()| { let running = self.stop().await; - let r = self.save(None, false).await; + let r = self.save(None, false, false).await; if running { self.start(None).await; } @@ -565,6 +572,7 @@ impl LoadedVm { // NOTE: This is set via the corresponding env arg, as this feature is // experimental. let nvme_keepalive = self.nvme_keep_alive && capabilities_flags.enable_nvme_keepalive(); + let mana_keepalive = self.mana_keep_alive && capabilities_flags.enable_mana_keepalive(); // Do everything before the log flush under a span. let r = async { @@ -579,7 +587,7 @@ impl LoadedVm { anyhow::bail!("cannot service underhill while paused"); } - let mut state = self.save(Some(deadline), nvme_keepalive).await?; + let mut state = self.save(Some(deadline), nvme_keepalive, mana_keepalive).await?; state.init_state.correlation_id = Some(correlation_id); // Unload any network devices. @@ -741,19 +749,33 @@ impl LoadedVm { async fn save( &mut self, _deadline: Option, - vf_keepalive_flag: bool, + nvme_keepalive_flag: bool, + mana_keepalive_flag: bool, ) -> anyhow::Result { assert!(!self.state_units.is_running()); let emuplat = (self.emuplat_servicing.save()).context("emuplat save failed")?; + // Only save dma manager state if we are expected to keep VF devices + // alive across save. Otherwise, don't persist the state at all, as + // there should be no live DMA across save. + // + // This has to happen before saving the network state, otherwise its allocations + // are marked as Free and are unable to be restored. + let dma_manager_state = if nvme_keepalive_flag || mana_keepalive_flag { + use vmcore::save_restore::SaveRestore; + Some(self.dma_manager.save().context("dma_manager save failed")?) + } else { + None + }; + // Only save NVMe state when there are NVMe controllers and keep alive // was enabled. 
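        // Save order at a glance (illustrative summary of this function): emuplat
        // state first, then the dma_manager state (only when an NVMe or MANA
        // keepalive flag is set), then NVMe manager state, the state units, MANA
        // network state (only when mana_keepalive_flag is set), and finally VMGS
        // and the vmbus client.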
let nvme_state = if let Some(n) = &self.nvme_manager { // DEVNOTE: A subtlety here is that the act of saving the NVMe state also causes the driver // to enter a state where subsequent teardown operations will noop. There is a STRONG // correlation between save/restore and keepalive. - n.save(vf_keepalive_flag) + n.save(nvme_keepalive_flag) .instrument(tracing::info_span!("nvme_manager_save", CVM_ALLOWED)) .await .map(|s| NvmeSavedState { nvme_state: s }) @@ -762,6 +784,15 @@ impl LoadedVm { }; let units = self.save_units().await.context("state unit save failed")?; + + let mana_state = if let Some(network_settings) = &mut self.network_settings + && mana_keepalive_flag + { + Some(network_settings.save().await) + } else { + None + }; + let vmgs = if let Some((vmgs_thin_client, vmgs_disk_metadata, _)) = self.vmgs.as_ref() { Some(( vmgs_thin_client.save().await.context("vmgs save failed")?, @@ -771,16 +802,6 @@ impl LoadedVm { None }; - // Only save dma manager state if we are expected to keep VF devices - // alive across save. Otherwise, don't persist the state at all, as - // there should be no live DMA across save. - let dma_manager_state = if vf_keepalive_flag { - use vmcore::save_restore::SaveRestore; - Some(self.dma_manager.save().context("dma_manager save failed")?) - } else { - None - }; - let vmbus_client = if let Some(vmbus_client) = &mut self.vmbus_client { vmbus_client.stop().await; Some(vmbus_client.save().await) @@ -800,6 +821,7 @@ impl LoadedVm { nvme_state, dma_manager_state, vmbus_client, + mana_state, }, units, }; @@ -866,6 +888,8 @@ impl LoadedVm { &self.vmbus_server, self.dma_manager.client_spawner(), self.isolation.is_isolated(), + self.mana_keep_alive, + None, // No existing mana state ) .await?; diff --git a/openhcl/underhill_core/src/emuplat/netvsp.rs b/openhcl/underhill_core/src/emuplat/netvsp.rs index 47ec89027d..31c2d831b1 100644 --- a/openhcl/underhill_core/src/emuplat/netvsp.rs +++ b/openhcl/underhill_core/src/emuplat/netvsp.rs @@ -14,6 +14,7 @@ use guid::Guid; use inspect::Inspect; use mana_driver::mana::ManaDevice; use mana_driver::mana::VportState; +use mana_driver::save_restore::ManaSavedState; use mesh::rpc::FailableRpc; use mesh::rpc::Rpc; use mesh::rpc::RpcSend; @@ -58,6 +59,7 @@ enum HclNetworkVfManagerMessage { HideVtl0VF(Rpc), Inspect(inspect::Deferred), PacketCapture(FailableRpc, PacketCaptureParams>), + SaveState(Rpc<(), Option>), } async fn create_mana_device( @@ -66,7 +68,21 @@ async fn create_mana_device( vp_count: u32, max_sub_channels: u16, dma_client: Arc, + mana_state: Option<&ManaSavedState>, ) -> anyhow::Result> { + if let Some(mana_state) = mana_state { + tracing::info!("restoring MANA device from saved state"); + return try_create_mana_device( + driver_source, + pci_id, + vp_count, + max_sub_channels, + dma_client, + Some(mana_state), + ) + .await; + } + // Disable FLR on vfio attach/detach; this allows faster system // startup/shutdown with the caveat that the device needs to be properly // sent through the shutdown path during servicing operations, as that is @@ -90,6 +106,7 @@ async fn create_mana_device( vp_count, max_sub_channels, dma_client.clone(), + None, ) .await { @@ -119,16 +136,28 @@ async fn try_create_mana_device( vp_count: u32, max_sub_channels: u16, dma_client: Arc, + mana_state: Option<&ManaSavedState>, ) -> anyhow::Result> { - let device = VfioDevice::new(driver_source, pci_id, dma_client) - .await - .context("failed to open device")?; + // Restore the device if we have saved state from servicing, otherwise create a new one. 
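    // Descriptive note: on the restore path the VFIO handle is reopened against
    // hardware that was deliberately kept alive across servicing, and the GDMA
    // driver later re-attaches to the DMA buffers preserved in the private pool
    // (see attach_pending_buffers in ManaDevice::new) rather than allocating new
    // ones. The boolean passed to VfioDevice::restore below is assumed to flag
    // that keepalive state.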
+ let device = if mana_state.is_some() { + tracing::info!("Restoring VFIO device from saved state"); + VfioDevice::restore(driver_source, pci_id, true, dma_client) + .instrument(tracing::info_span!("restore_mana_vfio_device")) + .await + .context("failed to restore device")? + } else { + VfioDevice::new(driver_source, pci_id, dma_client) + .instrument(tracing::info_span!("new_mana_vfio_device")) + .await + .context("failed to open device")? + }; ManaDevice::new( &driver_source.simple(), device, vp_count, max_sub_channels + 1, + mana_state.map(|state| &state.mana_device), ) .instrument(tracing::info_span!("new_mana_device")) .await @@ -393,22 +422,7 @@ impl HclNetworkVFManagerWorker { } pub async fn shutdown_vtl2_device(&mut self, keep_vf_alive: bool) { - futures::future::join_all(self.endpoint_controls.iter_mut().map(async |control| { - match control.disconnect().await { - Ok(Some(mut endpoint)) => { - tracing::info!("Network endpoint disconnected"); - endpoint.stop().await; - } - Ok(None) => (), - Err(err) => { - tracing::error!( - err = err.as_ref() as &dyn std::error::Error, - "Failed to disconnect endpoint" - ); - } - } - })) - .await; + self.disconnect_all_endpoints().await; if let Some(device) = self.mana_device.take() { let (result, device) = device.shutdown().await; // Closing the VFIO device handle can take a long time. Leak the handle by @@ -461,6 +475,25 @@ impl HclNetworkVFManagerWorker { } } + async fn disconnect_all_endpoints(&mut self) { + futures::future::join_all(self.endpoint_controls.iter_mut().map(async |control| { + match control.disconnect().await { + Ok(Some(mut endpoint)) => { + tracing::info!("Network endpoint disconnected"); + endpoint.stop().await; + } + Ok(None) => (), + Err(err) => { + tracing::error!( + err = err.as_ref() as &dyn std::error::Error, + "Failed to disconnect endpoint" + ); + } + } + })) + .await; + } + pub async fn run(&mut self) { #[derive(Debug)] enum NextWorkItem { @@ -643,6 +676,41 @@ impl HclNetworkVFManagerWorker { }) .await; } + NextWorkItem::ManagerMessage(HclNetworkVfManagerMessage::SaveState(rpc)) => { + assert!(self.is_shutdown_active); + drop(self.messages.take().unwrap()); + rpc.handle(async |_| { + self.disconnect_all_endpoints().await; + + if let Some(device) = self.mana_device.take() { + let (saved_state, device) = device.save().await; + + // Closing the VFIO device handle can take a long time. + // Leak the handle by stashing it away. + std::mem::forget(device); + + if let Ok(saved_state) = saved_state { + Some(ManaSavedState { + mana_device: saved_state, + pci_id: self.vtl2_pci_id.clone(), + }) + } else { + tracing::error!( + "Failed while saving MANA device state, returning None" + ); + None + } + } else { + tracing::warn!( + "no MANA device present when saving state, returning None" + ); + None + } + }) + .await; + // Exit worker thread. 
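                    // Once the saved state has been handed back over the RPC there
                    // is nothing left for this worker to drive: the endpoints were
                    // disconnected above and the VFIO handle was intentionally
                    // leaked, so end the worker task here.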
+ return; + } NextWorkItem::ManagerMessage(HclNetworkVfManagerMessage::ShutdownBegin( remove_vtl0_vf, )) => { @@ -652,6 +720,7 @@ impl HclNetworkVFManagerWorker { self.is_shutdown_active = true; } NextWorkItem::ManagerMessage(HclNetworkVfManagerMessage::ShutdownComplete(rpc)) => { + tracing::info!("shutting down VTL2 device"); assert!(self.is_shutdown_active); drop(self.messages.take().unwrap()); rpc.handle(async |keep_vf_alive| { @@ -683,6 +752,7 @@ impl HclNetworkVFManagerWorker { self.vp_count, self.max_sub_channels, self.dma_client.clone(), + None, // No saved state on new device arrival ) .await { @@ -856,6 +926,7 @@ impl HclNetworkVFManager { netvsp_state: &Option>, dma_mode: GuestDmaMode, dma_client: Arc, + mana_state: Option<&ManaSavedState>, ) -> anyhow::Result<( Self, Vec, @@ -867,6 +938,7 @@ impl HclNetworkVFManager { vp_count, max_sub_channels, dma_client.clone(), + mana_state, ) .await?; let (mut endpoints, endpoint_controls): (Vec<_>, Vec<_>) = (0..device.num_vports()) @@ -966,6 +1038,29 @@ impl HclNetworkVFManager { )) } + pub async fn save(&self) -> Option { + let save_state = self + .shared_state + .worker_channel + .call(HclNetworkVfManagerMessage::SaveState, ()) + .await; + + match save_state { + Ok(None) => { + tracing::warn!("No MANA device present when saving state, returning None"); + None + } + Ok(Some(state)) => Some(state), + Err(err) => { + tracing::error!( + err = &err as &dyn std::error::Error, + "RPC failure when saving VF Manager state" + ); + None + } + } + } + pub async fn packet_capture( &self, params: PacketCaptureParams, @@ -1063,6 +1158,12 @@ impl HclNetworkVFManagerShutdownInProgress { } self.complete = true; } + + pub async fn save(mut self) -> Option { + let result = self.inner.save().await; + self.complete = true; + result + } } struct HclNetworkVFManagerInstance { diff --git a/openhcl/underhill_core/src/lib.rs b/openhcl/underhill_core/src/lib.rs index e554fa3c55..821041ee2c 100644 --- a/openhcl/underhill_core/src/lib.rs +++ b/openhcl/underhill_core/src/lib.rs @@ -324,6 +324,7 @@ async fn launch_workers( gdbstub: opt.gdbstub, hide_isolation: opt.hide_isolation, nvme_keep_alive: opt.nvme_keep_alive, + mana_keep_alive: opt.mana_keep_alive, nvme_always_flr: opt.nvme_always_flr, test_configuration: opt.test_configuration, disable_uefi_frontpage: opt.disable_uefi_frontpage, diff --git a/openhcl/underhill_core/src/options.rs b/openhcl/underhill_core/src/options.rs index 33d811aa0a..6393e57883 100644 --- a/openhcl/underhill_core/src/options.rs +++ b/openhcl/underhill_core/src/options.rs @@ -183,6 +183,9 @@ pub struct Options { /// (OPENHCL_NVME_KEEP_ALIVE=1) Enable nvme keep alive when servicing. pub nvme_keep_alive: bool, + /// (OPENHCL_MANA_KEEP_ALIVE=1) Enable MANA keep alive when servicing. + pub mana_keep_alive: bool, + /// (OPENHCL_NVME_ALWAYS_FLR=1) /// Always use the FLR (Function Level Reset) path for NVMe devices, /// even if we would otherwise attempt to use VFIO's NoReset support. 
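    // Illustrative usage, mirroring the VMM tests further below: MANA keepalive is
    // opt-in and only takes effect when a VTL2 private pool is available for
    // persistent DMA allocations, e.g. an OpenHCL command line such as
    //   OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_MANA_KEEP_ALIVE=1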
@@ -329,6 +332,7 @@ impl Options { let gdbstub = parse_legacy_env_bool("OPENHCL_GDBSTUB"); let gdbstub_port = parse_legacy_env_number("OPENHCL_GDBSTUB_PORT")?.map(|x| x as u32); let nvme_keep_alive = parse_env_bool("OPENHCL_NVME_KEEP_ALIVE"); + let mana_keep_alive = parse_env_bool("OPENHCL_MANA_KEEP_ALIVE"); let nvme_always_flr = parse_env_bool("OPENHCL_NVME_ALWAYS_FLR"); let test_configuration = read_env("OPENHCL_TEST_CONFIG").and_then(|x| { x.to_string_lossy() @@ -415,6 +419,7 @@ impl Options { halt_on_guest_halt, no_sidecar_hotplug, nvme_keep_alive, + mana_keep_alive, nvme_always_flr, test_configuration, disable_uefi_frontpage, diff --git a/openhcl/underhill_core/src/servicing.rs b/openhcl/underhill_core/src/servicing.rs index 0606f51804..ca73a33ae5 100644 --- a/openhcl/underhill_core/src/servicing.rs +++ b/openhcl/underhill_core/src/servicing.rs @@ -10,6 +10,7 @@ use anyhow::Context as _; use vmcore::save_restore::SavedStateBlob; mod state { + use mana_driver::save_restore::ManaSavedState; use mesh::payload::Protobuf; use openhcl_dma_manager::save_restore::OpenhclDmaManagerState; use state_unit::SavedStateUnit; @@ -84,6 +85,8 @@ mod state { pub dma_manager_state: Option, #[mesh(10002)] pub vmbus_client: Option, + #[mesh(10003)] + pub mana_state: Option>, } #[derive(Protobuf)] @@ -183,6 +186,7 @@ impl From for FirmwareType { #[expect(clippy::option_option)] pub mod transposed { use super::*; + use mana_driver::save_restore::ManaSavedState; use openhcl_dma_manager::save_restore::OpenhclDmaManagerState; use vmcore::save_restore::SaveRestore; @@ -193,6 +197,7 @@ pub mod transposed { pub firmware_type: Option, pub vm_stop_reference_time: Option, pub emuplat: OptionEmuplatSavedState, + pub mana_state: Option>, pub flush_logs_result: Option>, pub vmgs: Option<( vmgs::save_restore::state::SavedVmgsState, @@ -230,6 +235,7 @@ pub mod transposed { vmgs, overlay_shutdown_device, nvme_state, + mana_state, dma_manager_state, vmbus_client, } = state; @@ -246,6 +252,7 @@ pub mod transposed { vmgs, overlay_shutdown_device: Some(overlay_shutdown_device), nvme_state: Some(nvme_state), + mana_state, dma_manager_state: Some(dma_manager_state), vmbus_client: Some(vmbus_client), } diff --git a/openhcl/underhill_core/src/worker.rs b/openhcl/underhill_core/src/worker.rs index 288393d732..8373367ab1 100644 --- a/openhcl/underhill_core/src/worker.rs +++ b/openhcl/underhill_core/src/worker.rs @@ -91,6 +91,7 @@ use input_core::InputData; use input_core::MultiplexedInputHandle; use inspect::Inspect; use loader_defs::shim::MemoryVtlType; +use mana_driver::save_restore::ManaSavedState; use memory_range::MemoryRange; use mesh::CancelContext; use mesh::MeshPayload; @@ -284,6 +285,8 @@ pub struct UnderhillEnvCfg { pub hide_isolation: bool, /// Enable nvme keep alive. pub nvme_keep_alive: bool, + /// Enable mana keep alive. + pub mana_keep_alive: bool, /// Don't skip FLR for NVMe devices. 
pub nvme_always_flr: bool, /// test configuration @@ -760,6 +763,8 @@ impl UhVmNetworkSettings { vmbus_server: &Option, dma_client_spawner: DmaClientSpawner, is_isolated: bool, + save_restore_supported: bool, + saved_mana_state: Option<&ManaSavedState>, ) -> anyhow::Result { let instance_id = nic_config.instance_id; let nic_max_sub_channels = nic_config @@ -775,7 +780,7 @@ impl UhVmNetworkSettings { } else { AllocationVisibility::Private }, - persistent_allocations: false, + persistent_allocations: save_restore_supported, })?; let (vf_manager, endpoints, save_state) = HclNetworkVFManager::new( @@ -790,6 +795,7 @@ impl UhVmNetworkSettings { servicing_netvsp_state, self.dma_mode, dma_client, + saved_mana_state, ) .await?; @@ -916,6 +922,8 @@ impl LoadedVmNetworkSettings for UhVmNetworkSettings { vmbus_server: &Option, dma_client_spawner: DmaClientSpawner, is_isolated: bool, + save_restore_supported: bool, + mana_state: Option<&ManaSavedState>, ) -> anyhow::Result { if self.vf_managers.contains_key(&instance_id) { return Err(NetworkSettingsError::VFManagerExists(instance_id).into()); @@ -949,6 +957,8 @@ impl LoadedVmNetworkSettings for UhVmNetworkSettings { vmbus_server, dma_client_spawner, is_isolated, + save_restore_supported, + mana_state, ) .await?; @@ -1004,6 +1014,82 @@ impl LoadedVmNetworkSettings for UhVmNetworkSettings { } Ok(params) } + + async fn save(&mut self) -> Vec { + let mut vf_managers: Vec<(Guid, Arc)> = + self.vf_managers.drain().collect(); + + // Notify VF managers of shutdown so that the subsequent teardown of + // the NICs does not modify VF state. + let vf_managers = vf_managers + .drain(..) + .map(move |(instance_id, manager)| { + ( + instance_id, + Arc::into_inner(manager).unwrap().shutdown_begin(false), + ) + }) + .collect::>(); + + // Collect the instance_id of every vf_manager being shutdown + let instance_ids: Vec = vf_managers + .iter() + .map(|(instance_id, _)| *instance_id) + .collect(); + + // Only remove the vmbus channels and NICs from the VF Managers + let mut nic_channels = Vec::new(); + let mut i = 0; + while i < self.nics.len() { + if instance_ids.contains(&self.nics[i].0) { + let val = self.nics.remove(i); + nic_channels.push(val); + } else { + i += 1; + } + } + + for instance_id in instance_ids { + if !nic_channels.iter().any(|(id, _)| *id == instance_id) { + tracing::error!( + "No vmbus channel found that matches VF Manager instance_id: {instance_id}" + ); + } + } + + let mut endpoints: Vec<_> = + join_all(nic_channels.drain(..).map(async |(instance_id, channel)| { + async { + let nic = channel.remove().await.revoke().await; + nic.shutdown() + } + .instrument(tracing::info_span!("nic_shutdown", %instance_id)) + .await + })) + .await; + + let run_endpoints = async { + loop { + let _ = endpoints + .iter_mut() + .map(|endpoint| endpoint.wait_for_endpoint_action()) + .collect::>() + .race() + .await; + } + }; + + let save_vf_managers = join_all( + vf_managers + .into_iter() + .map(|(_, vf_manager)| vf_manager.save()), + ); + + let state = (run_endpoints, save_vf_managers).race().await; + + // Discard any vf_managers that failed to return valid save state. + state.into_iter().flatten().collect() + } } /// The final vtl0 memory layout computed from different inputs. 
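// A minimal sketch of the race pattern used in the save() implementation above,
// assuming the tuple .race() combinator there comes from the futures_concurrency
// Race trait. The pump future loops forever, so its output type unifies with the
// save future's output and race always resolves with the save result, while the
// pump keeps servicing work until that moment. All names here are hypothetical.
async fn race_pump_against_save() -> u32 {
    use futures_concurrency::future::Race;

    let pump = async {
        loop {
            // Stand-in for wait_for_endpoint_action(): keep reacting to endpoint
            // events for as long as the save is still in flight.
            futures::future::pending::<()>().await;
        }
    };

    let save = async {
        // Stand-in for join_all over the vf_manager.save() calls.
        42u32
    };

    (pump, save).race().await
}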
@@ -3102,6 +3188,15 @@ async fn new_underhill_vm( if !controllers.mana.is_empty() { let _span = tracing::info_span!("network_settings", CVM_ALLOWED).entered(); for nic_config in controllers.mana.into_iter() { + let nic_servicing_state = if let Some(ref state) = servicing_state.mana_state { + state.iter().find(|s| s.pci_id == nic_config.pci_id) + } else { + None + }; + + let private_pool_available = !runtime_params.private_pool_ranges().is_empty(); + let save_restore_supported = env_cfg.mana_keep_alive && private_pool_available; + let save_state = uh_network_settings .add_network( nic_config.instance_id, @@ -3115,6 +3210,8 @@ async fn new_underhill_vm( &vmbus_server, dma_manager.client_spawner(), isolation.is_isolated(), + save_restore_supported, + nic_servicing_state, ) .await?; @@ -3334,6 +3431,7 @@ async fn new_underhill_vm( _periodic_telemetry_task: periodic_telemetry_task, nvme_keep_alive: env_cfg.nvme_keep_alive, + mana_keep_alive: env_cfg.mana_keep_alive, test_configuration: env_cfg.test_configuration, dma_manager, }; diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index e364c56f66..e2042d854c 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -1953,6 +1953,12 @@ enum InteractiveCommand { /// configured path. #[clap(long, conflicts_with("user_mode_only"))] igvm: Option, + #[clap(long)] + /// Enable NVMe keepalive + nvme_keepalive: bool, + /// Enable MANA keepalive + #[clap(long)] + mana_keepalive: bool, }, /// Read guest memory @@ -2806,6 +2812,8 @@ async fn run_control(driver: &DefaultDriver, mesh: &VmmMesh, opt: Options) -> an InteractiveCommand::ServiceVtl2 { user_mode_only, igvm, + mana_keepalive, + nvme_keepalive, } => { let paravisor_diag = paravisor_diag.clone(); let vm_rpc = vm_rpc.clone(); @@ -2823,7 +2831,10 @@ async fn run_control(driver: &DefaultDriver, mesh: &VmmMesh, opt: Options) -> an hvlite_helpers::underhill::save_underhill( &vm_rpc, ged_rpc.as_ref().context("no GED")?, - GuestServicingFlags::default(), + GuestServicingFlags { + nvme_keepalive, + mana_keepalive, + }, file.into(), ) .await?; diff --git a/petri/src/vm/mod.rs b/petri/src/vm/mod.rs index 10a470b877..e8a2fc11b2 100644 --- a/petri/src/vm/mod.rs +++ b/petri/src/vm/mod.rs @@ -1727,6 +1727,8 @@ pub enum IsolationType { pub struct OpenHclServicingFlags { /// Preserve DMA memory for NVMe devices if supported. pub enable_nvme_keepalive: bool, + /// Preserve DMA memory for MANA devices if supported. + pub enable_mana_keepalive: bool, /// Skip any logic that the vmm may have to ignore servicing updates if the supplied igvm file version is not different than the one currently running. pub override_version_checks: bool, /// Hint to the OpenHCL runtime how much time to wait when stopping / saving the OpenHCL. diff --git a/petri/src/worker.rs b/petri/src/worker.rs index 491d0eab05..18a53525cd 100644 --- a/petri/src/worker.rs +++ b/petri/src/worker.rs @@ -69,6 +69,7 @@ impl Worker { send, GuestServicingFlags { nvme_keepalive: flags.enable_nvme_keepalive, + mana_keepalive: flags.enable_mana_keepalive, }, file, ) diff --git a/vm/devices/get/get_protocol/src/lib.rs b/vm/devices/get/get_protocol/src/lib.rs index e5c54b9262..6c899b414a 100644 --- a/vm/devices/get/get_protocol/src/lib.rs +++ b/vm/devices/get/get_protocol/src/lib.rs @@ -1186,8 +1186,13 @@ pub struct SaveGuestVtl2StateFlags { /// Explicitly allow nvme_keepalive feature when servicing. #[bits(1)] pub enable_nvme_keepalive: bool, + + /// Explicitly allow mana_keepalive feature when servicing. 
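    // Illustrative layout note: with declaration-order bit packing, bit 0 is
    // enable_nvme_keepalive, the new enable_mana_keepalive takes bit 1, and the
    // reserved field below shrinks from 63 to 62 bits so the structure stays 64
    // bits wide.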
+ #[bits(1)] + pub enable_mana_keepalive: bool, + /// Reserved, must be zero. - #[bits(63)] + #[bits(62)] _rsvd1: u64, } diff --git a/vm/devices/get/get_resources/src/lib.rs b/vm/devices/get/get_resources/src/lib.rs index 9032fe9348..27554ca337 100644 --- a/vm/devices/get/get_resources/src/lib.rs +++ b/vm/devices/get/get_resources/src/lib.rs @@ -160,8 +160,10 @@ pub mod ged { /// Define servicing behavior. #[derive(MeshPayload, Default)] pub struct GuestServicingFlags { - /// Retain memory for DMA-attached devices. + /// Retain memory for nvme devices. pub nvme_keepalive: bool, + /// Retain memory for MANA devices. + pub mana_keepalive: bool, } /// Actions a client can request that the Guest Emulation diff --git a/vm/devices/get/guest_emulation_device/src/lib.rs b/vm/devices/get/guest_emulation_device/src/lib.rs index ec1e3c98a3..945a2da976 100644 --- a/vm/devices/get/guest_emulation_device/src/lib.rs +++ b/vm/devices/get/guest_emulation_device/src/lib.rs @@ -618,7 +618,8 @@ impl GedChannel { ), correlation_id: Guid::ZERO, capabilities_flags: SaveGuestVtl2StateFlags::new() - .with_enable_nvme_keepalive(rpc.input().nvme_keepalive), + .with_enable_nvme_keepalive(rpc.input().nvme_keepalive) + .with_enable_mana_keepalive(rpc.input().mana_keepalive), timeout_hint_secs: 60, }; diff --git a/vm/devices/net/mana_driver/src/gdma_driver.rs b/vm/devices/net/mana_driver/src/gdma_driver.rs index b5af4d04de..f72a29f92f 100644 --- a/vm/devices/net/mana_driver/src/gdma_driver.rs +++ b/vm/devices/net/mana_driver/src/gdma_driver.rs @@ -514,8 +514,7 @@ impl GdmaDriver { Ok(this) } - #[allow(dead_code)] - pub async fn save(mut self) -> anyhow::Result { + pub async fn save(&mut self) -> anyhow::Result { if self.hwc_failure { anyhow::bail!("cannot save/restore after HWC failure"); } @@ -594,7 +593,6 @@ impl GdmaDriver { Ok((bar0_mapping, map)) } - #[allow(dead_code)] pub async fn restore( saved_state: GdmaDriverSavedState, mut device: T, diff --git a/vm/devices/net/mana_driver/src/mana.rs b/vm/devices/net/mana_driver/src/mana.rs index e913de0d93..ea837c070f 100644 --- a/vm/devices/net/mana_driver/src/mana.rs +++ b/vm/devices/net/mana_driver/src/mana.rs @@ -5,6 +5,7 @@ pub use crate::bnic_driver::RxConfig; pub use crate::resources::ResourceArena; +pub use crate::save_restore::ManaDeviceSavedState; use crate::bnic_driver::BnicDriver; use crate::bnic_driver::WqConfig; @@ -76,10 +77,25 @@ impl ManaDevice { device: T, num_vps: u32, max_queues_per_vport: u16, + mana_state: Option<&ManaDeviceSavedState>, ) -> anyhow::Result { - let mut gdma = GdmaDriver::new(driver, device, num_vps, None) - .instrument(tracing::info_span!("new_gdma_driver")) - .await?; + let mut gdma = if let Some(mana_state) = mana_state { + let memory = device.dma_client().attach_pending_buffers()?; + let gdma_memory = memory + .iter() + .find(|m| m.pfns()[0] == mana_state.gdma.mem.base_pfn) + .expect("gdma restored memory not found") + .clone(); + + GdmaDriver::restore(mana_state.gdma.clone(), device, gdma_memory) + .instrument(tracing::info_span!("restore_gdma_driver")) + .await? + } else { + GdmaDriver::new(driver, device, num_vps, None) + .instrument(tracing::info_span!("new_gdma_driver")) + .await? 
+ }; + gdma.test_eq().await?; gdma.verify_vf_driver_version().await?; @@ -92,7 +108,15 @@ impl ManaDevice { .find(|dev_id| dev_id.ty == GdmaDevType::GDMA_DEVICE_MANA) .context("no mana device found")?; - let dev_data = gdma.register_device(dev_id).await?; + let dev_data = if let Some(mana_state) = mana_state { + GdmaRegisterDeviceResp { + pdid: mana_state.gdma.pdid, + gpa_mkey: mana_state.gdma.gpa_mkey, + db_id: mana_state.gdma.db_id as u32, + } + } else { + gdma.register_device(dev_id).await? + }; let mut bnic = BnicDriver::new(&mut gdma, dev_id); let dev_config = bnic.query_dev_config().await?; @@ -143,6 +167,29 @@ impl ManaDevice { Ok(device) } + /// Saves the device's state for servicing + pub async fn save(self) -> (anyhow::Result, T) { + self.inspect_task.cancel().await; + if let Some(hwc_task) = self.hwc_task { + hwc_task.cancel().await; + } + let inner = Arc::into_inner(self.inner).unwrap(); + let mut driver = inner.gdma.into_inner(); + + if let Ok(saved_state) = driver.save().await { + tracing::info!("Saved MANA device state"); + let mana_saved_state = ManaDeviceSavedState { gdma: saved_state }; + + (Ok(mana_saved_state), driver.into_device()) + } else { + tracing::error!("Failed to save MANA device state"); + ( + Err(anyhow::anyhow!("Failed to save MANA device state")), + driver.into_device(), + ) + } + } + /// Returns the number of vports the device supports. pub fn num_vports(&self) -> u32 { self.inner.dev_config.max_num_vports.into() diff --git a/vm/devices/net/mana_driver/src/save_restore.rs b/vm/devices/net/mana_driver/src/save_restore.rs index 23ee9ad73d..f414f9926a 100644 --- a/vm/devices/net/mana_driver/src/save_restore.rs +++ b/vm/devices/net/mana_driver/src/save_restore.rs @@ -5,6 +5,28 @@ use mesh::payload::Protobuf; +/// Mana saved state +#[derive(Debug, Protobuf, Clone)] +#[mesh(package = "mana_driver")] +pub struct ManaSavedState { + /// The saved state of the MANA device driver + #[mesh(1)] + pub mana_device: ManaDeviceSavedState, + + /// Id of the device + #[mesh(2)] + pub pci_id: String, +} + +/// Mana device saved state +#[derive(Debug, Protobuf, Clone)] +#[mesh(package = "mana_driver")] +pub struct ManaDeviceSavedState { + /// Saved state for restoration of the GDMA driver + #[mesh(1)] + pub gdma: GdmaDriverSavedState, +} + /// Top level saved state for the GDMA driver's saved state #[derive(Protobuf, Clone, Debug)] #[mesh(package = "mana_driver")] diff --git a/vm/devices/net/net_mana/src/lib.rs b/vm/devices/net/net_mana/src/lib.rs index 3ffe9c67f1..e86774631c 100644 --- a/vm/devices/net/net_mana/src/lib.rs +++ b/vm/devices/net/net_mana/src/lib.rs @@ -1689,7 +1689,7 @@ mod tests { reserved: 0, max_num_eqs: 64, }; - let thing = ManaDevice::new(&driver, device, 1, 1).await.unwrap(); + let thing = ManaDevice::new(&driver, device, 1, 1, None).await.unwrap(); let vport = thing.new_vport(0, None, &dev_config).await.unwrap(); let mut endpoint = ManaEndpoint::new(driver.clone(), vport, dma_mode).await; let mut queues = Vec::new(); @@ -1783,7 +1783,7 @@ mod tests { reserved: 0, max_num_eqs: 64, }; - let thing = ManaDevice::new(&driver, device, 1, 1).await.unwrap(); + let thing = ManaDevice::new(&driver, device, 1, 1, None).await.unwrap(); let _ = thing.new_vport(0, None, &dev_config).await.unwrap(); } diff --git a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs index ba06fc56fc..0680282aea 100644 --- a/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs +++ 
b/vmm_tests/vmm_tests/tests/tests/multiarch/openhcl_servicing.rs @@ -497,3 +497,136 @@ async fn create_keepalive_test_config( .run() .await } + +/// Today this only tests that the nic can get an IP address via consomme's DHCP +/// implementation. +/// +/// FUTURE: Test traffic on the nic. +async fn validate_mana_nic(agent: &PipetteClient) -> Result<(), anyhow::Error> { + let sh = agent.unix_shell(); + cmd!(sh, "ifconfig eth0 up").run().await?; + cmd!(sh, "udhcpc eth0").run().await?; + let output = cmd!(sh, "ifconfig eth0").read().await?; + // Validate that we see a mana nic with the expected MAC address and IPs. + assert!(output.contains("HWaddr 00:15:5D:12:12:12")); + assert!(output.contains("inet addr:10.0.0.2")); + assert!(output.contains("inet6 addr: fe80::215:5dff:fe12:1212/64")); + + Ok(()) +} + +/// Test an OpenHCL Linux direct VM with a MANA nic assigned to VTL2 (backed by +/// the MANA emulator), and vmbus relay. Perform servicing and validate that the +/// nic is still functional. +#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])] +async fn mana_nic_servicing( + config: PetriVmBuilder, + (igvm_file,): (ResolvedArtifact,), +) -> Result<(), anyhow::Error> { + let (mut vm, agent) = config + .with_vmbus_redirect(true) + .modify_backend(|b| b.with_nic()) + .run() + .await?; + + validate_mana_nic(&agent).await?; + + vm.restart_openhcl(igvm_file, OpenHclServicingFlags::default()) + .await?; + + validate_mana_nic(&agent).await?; + + agent.power_off().await?; + vm.wait_for_clean_teardown().await?; + + Ok(()) +} +/// Test an OpenHCL Linux direct VM with a MANA nic assigned to VTL2 (backed by +/// the MANA emulator), and vmbus relay. Perform servicing and validate that the +/// nic is still functional. +#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])] +async fn mana_nic_servicing_keepalive( + config: PetriVmBuilder, + (igvm_file,): (ResolvedArtifact,), +) -> Result<(), anyhow::Error> { + let (mut vm, agent) = config + .with_vmbus_redirect(true) + .modify_backend(|b| b.with_nic()) + .with_openhcl_command_line( + "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_SIDECAR=off OPENHCL_MANA_KEEP_ALIVE=1", + ) // disable sidecar until #1345 is fixed + .run() + .await?; + + validate_mana_nic(&agent).await?; + + vm.restart_openhcl( + igvm_file, + OpenHclServicingFlags { + enable_mana_keepalive: true, + ..Default::default() + }, + ) + .await?; + + validate_mana_nic(&agent).await?; + + agent.power_off().await?; + vm.wait_for_clean_teardown().await?; + + Ok(()) +} + +// Test upgrading from 25_05 release to latest, then service again to go through keepalive path +#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64, RELEASE_25_05_LINUX_DIRECT_X64])] +async fn mana_nic_servicing_keepalive_upgrade( + config: PetriVmBuilder, + (to_igvm_file, from_igvm_file): ( + ResolvedArtifact, + ResolvedArtifact, + ), +) -> Result<(), anyhow::Error> { + // Start a VM using 25_05 release IGVM + let (mut vm, agent) = config + // TODO: remove .with_guest_state_lifetime(PetriGuestStateLifetime::Disk). The default (ephemeral) does not exist in the 2505 release. 
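        // Flow note: the first restart below services from the 25_05 image, which
        // presumably predates MANA saved state, so only the second, latest-to-latest
        // restart is expected to exercise the keepalive save/restore path.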
+ .with_guest_state_lifetime(PetriGuestStateLifetime::Disk) + .with_custom_openhcl(from_igvm_file) + .with_vmbus_redirect(true) + .modify_backend(|b| b.with_nic()) + .with_openhcl_command_line( + "OPENHCL_ENABLE_VTL2_GPA_POOL=512 OPENHCL_SIDECAR=off OPENHCL_MANA_KEEP_ALIVE=1", + ) // disable sidecar until #1345 is fixed + .run() + .await?; + + validate_mana_nic(&agent).await?; + + // Service to latest IGVM and make sure MANA nic still works + vm.restart_openhcl( + to_igvm_file.clone(), + OpenHclServicingFlags { + enable_mana_keepalive: true, + ..Default::default() + }, + ) + .await?; + + validate_mana_nic(&agent).await?; + + // Service again to latest IGVM to test keepalive path + vm.restart_openhcl( + to_igvm_file, + OpenHclServicingFlags { + enable_mana_keepalive: true, + ..Default::default() + }, + ) + .await?; + + validate_mana_nic(&agent).await?; + + agent.power_off().await?; + vm.wait_for_clean_teardown().await?; + + Ok(()) +} diff --git a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs index 4edd661270..5e4327ea4e 100644 --- a/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs +++ b/vmm_tests/vmm_tests/tests/tests/x86_64/openhcl_linux_direct.rs @@ -76,33 +76,6 @@ async fn mana_nic_shared_pool( Ok(()) } -/// Test an OpenHCL Linux direct VM with a MANA nic assigned to VTL2 (backed by -/// the MANA emulator), and vmbus relay. Perform servicing and validate that the -/// nic is still functional. -#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])] -async fn mana_nic_servicing( - config: PetriVmBuilder, - (igvm_file,): (ResolvedArtifact,), -) -> Result<(), anyhow::Error> { - let (mut vm, agent) = config - .with_vmbus_redirect(true) - .modify_backend(|b| b.with_nic()) - .run() - .await?; - - validate_mana_nic(&agent).await?; - - vm.restart_openhcl(igvm_file, OpenHclServicingFlags::default()) - .await?; - - validate_mana_nic(&agent).await?; - - agent.power_off().await?; - vm.wait_for_clean_teardown().await?; - - Ok(()) -} - /// Test an OpenHCL Linux direct VM with many NVMe devices assigned to VTL2 and vmbus relay. #[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])] async fn many_nvme_devices_servicing(