-
Notifications
You must be signed in to change notification settings - Fork 158
mana: save and restore mana devices when keepalive is enabled #2123
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 26 commits
eb06840
e9e293f
1d5479d
93dfba9
eea14e1
d6fc130
fe7e8bd
a48267e
1950a50
06ac5b2
87ec2a3
980f5b0
054c767
1baec11
038c2da
e6fcd99
d799d6b
0b4c25b
4dde9aa
3177ddc
c403fc5
8e30a76
6646ca2
b3a95cc
60008ad
6b4351e
c53567f
46ac820
cdad413
b5a4c16
c7925a8
de661a1
ba67c7b
d9b0b21
0058602
3c2648c
64d766e
1150c18
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -14,6 +14,7 @@ use guid::Guid; | |||
| use inspect::Inspect; | ||||
| use mana_driver::mana::ManaDevice; | ||||
| use mana_driver::mana::VportState; | ||||
| use mana_driver::save_restore::ManaSavedState; | ||||
| use mesh::rpc::FailableRpc; | ||||
| use mesh::rpc::Rpc; | ||||
| use mesh::rpc::RpcSend; | ||||
|
|
@@ -58,6 +59,7 @@ enum HclNetworkVfManagerMessage { | |||
| HideVtl0VF(Rpc<bool, ()>), | ||||
| Inspect(inspect::Deferred), | ||||
| PacketCapture(FailableRpc<PacketCaptureParams<Socket>, PacketCaptureParams<Socket>>), | ||||
| SaveState(Rpc<(), Option<ManaSavedState>>), | ||||
| } | ||||
|
|
||||
| async fn create_mana_device( | ||||
|
|
@@ -66,7 +68,21 @@ async fn create_mana_device( | |||
| vp_count: u32, | ||||
| max_sub_channels: u16, | ||||
| dma_client: Arc<dyn DmaClient>, | ||||
| mana_state: Option<&ManaSavedState>, | ||||
| ) -> anyhow::Result<ManaDevice<VfioDevice>> { | ||||
| if let Some(mana_state) = mana_state { | ||||
| tracing::info!("restoring MANA device from saved state"); | ||||
| return try_create_mana_device( | ||||
| driver_source, | ||||
| pci_id, | ||||
| vp_count, | ||||
| max_sub_channels, | ||||
| dma_client, | ||||
| Some(mana_state), | ||||
| ) | ||||
| .await; | ||||
| } | ||||
|
|
||||
| // Disable FLR on vfio attach/detach; this allows faster system | ||||
| // startup/shutdown with the caveat that the device needs to be properly | ||||
| // sent through the shutdown path during servicing operations, as that is | ||||
|
|
@@ -90,6 +106,7 @@ async fn create_mana_device( | |||
| vp_count, | ||||
| max_sub_channels, | ||||
| dma_client.clone(), | ||||
| None, | ||||
| ) | ||||
| .await | ||||
| { | ||||
|
|
@@ -119,16 +136,28 @@ async fn try_create_mana_device( | |||
| vp_count: u32, | ||||
| max_sub_channels: u16, | ||||
| dma_client: Arc<dyn DmaClient>, | ||||
| mana_state: Option<&ManaSavedState>, | ||||
| ) -> anyhow::Result<ManaDevice<VfioDevice>> { | ||||
| let device = VfioDevice::new(driver_source, pci_id, dma_client) | ||||
| .await | ||||
| .context("failed to open device")?; | ||||
| // Restore the device if we have saved state from servicing, otherwise create a new one. | ||||
| let device = if mana_state.is_some() { | ||||
| tracing::info!("Restoring VFIO device from saved state"); | ||||
| VfioDevice::restore(driver_source, pci_id, true, dma_client) | ||||
| .instrument(tracing::info_span!("restore_mana_vfio_device")) | ||||
| .await | ||||
| .context("failed to restore device")? | ||||
| } else { | ||||
| VfioDevice::new(driver_source, pci_id, dma_client) | ||||
| .instrument(tracing::info_span!("new_mana_vfio_device")) | ||||
| .await | ||||
| .context("failed to open device")? | ||||
| }; | ||||
|
|
||||
| ManaDevice::new( | ||||
| &driver_source.simple(), | ||||
| device, | ||||
| vp_count, | ||||
| max_sub_channels + 1, | ||||
| mana_state.map(|state| &state.mana_device), | ||||
| ) | ||||
| .instrument(tracing::info_span!("new_mana_device")) | ||||
| .await | ||||
|
|
@@ -643,6 +672,53 @@ impl HclNetworkVFManagerWorker { | |||
| }) | ||||
| .await; | ||||
| } | ||||
| NextWorkItem::ManagerMessage(HclNetworkVfManagerMessage::SaveState(rpc)) => { | ||||
| assert!(self.is_shutdown_active); | ||||
| drop(self.messages.take().unwrap()); | ||||
| rpc.handle(async |_| { | ||||
| futures::future::join_all(self.endpoint_controls.iter_mut().map( | ||||
| async |control| match control.disconnect().await { | ||||
| Ok(Some(mut endpoint)) => { | ||||
| tracing::info!("Network endpoint disconnected"); | ||||
| endpoint.stop().await; | ||||
| } | ||||
| Ok(None) => (), | ||||
| Err(err) => { | ||||
| tracing::error!( | ||||
| err = err.as_ref() as &dyn std::error::Error, | ||||
| "Failed to disconnect endpoint" | ||||
| ); | ||||
| } | ||||
| }, | ||||
| )) | ||||
| .await; | ||||
|
|
||||
| if let Some(device) = self.mana_device.take() { | ||||
| let (saved_state, device) = device.save().await; | ||||
| std::mem::forget(device); | ||||
|
||||
| std::mem::forget(device); |
justus-camp-microsoft marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this statement always true? This can also happen if the save state RPC failed in device.save(), right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, there are two error conditions here: the device being gone and hwc_failure == true, and a subsequent failure to save. I added an error enum here and cleaned up the error paths.
Uh oh!
There was an error while loading. Please reload this page.