Skip to content
Draft
Original file line number Diff line number Diff line change
Expand Up @@ -993,6 +993,7 @@ async fn make_disk_type_from_physical_device(
// We can't validate yet that this namespace actually exists. That will
// be checked later.
return Ok(Resource::new(NvmeDiskConfig {
debug_id: controller_instance_id.to_string(),
pci_id,
nsid: sub_device_path,
}));
Expand Down
54 changes: 42 additions & 12 deletions openhcl/underhill_core/src/nvme_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ impl NvmeManager {
enum Request {
Inspect(inspect::Deferred),
ForceLoadDriver(inspect::DeferredUpdate),
GetNamespace(Rpc<(String, u32), Result<nvme_driver::Namespace, NamespaceError>>),
GetNamespace(Rpc<(String, String, u32), Result<nvme_driver::Namespace, NamespaceError>>),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is prone for errors. Make each String strongly typed, so that the caller can't confuse the debug_id and the pci_id.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Created strongly typed NvmeDeviceName and PciId wrapper types with GetNamespaceParams struct to prevent parameter confusion in RPC calls. This eliminates the risk of mixing up the name and pci_id parameters. (commit 02fbdbc)

Save(Rpc<(), Result<NvmeManagerSavedState, anyhow::Error>>),
Shutdown {
span: tracing::Span,
Expand All @@ -191,13 +191,22 @@ pub struct NvmeManagerClient {
impl NvmeManagerClient {
pub async fn get_namespace(
&self,
debug_id: String,
pci_id: String,
nsid: u32,
) -> anyhow::Result<nvme_driver::Namespace> {
Ok(self
.sender
.call(Request::GetNamespace, (pci_id.clone(), nsid))
.instrument(tracing::info_span!("nvme_get_namespace", pci_id, nsid))
.call(
Request::GetNamespace,
(debug_id.clone(), pci_id.clone(), nsid),
)
.instrument(tracing::info_span!(
"nvme_get_namespace",
debug_id,
pci_id,
nsid
))
.await
.context("nvme manager is shut down")??)
}
Expand Down Expand Up @@ -235,7 +244,10 @@ impl NvmeManagerWorker {
match req {
Request::Inspect(deferred) => deferred.inspect(&self),
Request::ForceLoadDriver(update) => {
match self.get_driver(update.new_value().to_owned()).await {
match self
.get_driver("force-load".to_string(), update.new_value().to_owned())
.await
{
Ok(_) => {
let pci_id = update.new_value().to_string();
update.succeed(pci_id);
Expand All @@ -246,8 +258,8 @@ impl NvmeManagerWorker {
}
}
Request::GetNamespace(rpc) => {
rpc.handle(async |(pci_id, nsid)| {
self.get_namespace(pci_id.clone(), nsid)
rpc.handle(async |(debug_id, pci_id, nsid)| {
self.get_namespace(debug_id, pci_id.clone(), nsid)
.map_err(|source| NamespaceError { pci_id, source })
.await
})
Expand Down Expand Up @@ -283,9 +295,12 @@ impl NvmeManagerWorker {
if !nvme_keepalive || !self.save_restore_supported {
async {
join_all(self.devices.drain().map(|(pci_id, driver)| {
driver
.shutdown()
.instrument(tracing::info_span!("shutdown_nvme_driver", pci_id))
let debug_id = driver.debug_id();
driver.shutdown().instrument(tracing::info_span!(
"shutdown_nvme_driver",
pci_id,
debug_id
))
}))
.await
}
Expand All @@ -296,6 +311,7 @@ impl NvmeManagerWorker {

async fn get_driver(
&mut self,
debug_id: String,
pci_id: String,
) -> Result<&mut nvme_driver::NvmeDriver<VfioDevice>, InnerError> {
let driver = match self.devices.entry(pci_id.to_owned()) {
Expand Down Expand Up @@ -328,6 +344,7 @@ impl NvmeManagerWorker {
self.vp_count,
device,
self.is_isolated,
debug_id,
)
.instrument(tracing::info_span!(
"nvme_driver_init",
Expand All @@ -344,10 +361,11 @@ impl NvmeManagerWorker {

async fn get_namespace(
&mut self,
debug_id: String,
pci_id: String,
nsid: u32,
) -> Result<nvme_driver::Namespace, InnerError> {
let driver = self.get_driver(pci_id.to_owned()).await?;
let driver = self.get_driver(debug_id, pci_id.to_owned()).await?;
driver
.namespace(nsid)
.await
Expand Down Expand Up @@ -407,6 +425,7 @@ impl NvmeManagerWorker {
vfio_device,
&disk.driver_state,
self.is_isolated,
None, // controller_instance_id is not persisted in saved state
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is unfortunate. Is there a way to re-hook-up the name (formerly known as controller_instance_id and debug_id) after a restore, without putting it in the NvmeManagerSavedState?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For restored drivers, I now generate the name as "restored-{pci_id}" using the available PCI ID. This avoids storing the name in NvmeManagerSavedState while still providing a meaningful identifier for tracing. (commit 02fbdbc)

)
.instrument(tracing::info_span!("nvme_driver_restore"))
.await?;
Expand Down Expand Up @@ -440,20 +459,31 @@ impl AsyncResolveResource<DiskHandleKind, NvmeDiskConfig> for NvmeDiskResolver {
) -> Result<Self::Output, Self::Error> {
let namespace = self
.manager
.get_namespace(rsrc.pci_id, rsrc.nsid)
.get_namespace(rsrc.debug_id, rsrc.pci_id, rsrc.nsid)
.await
.context("could not open nvme namespace")?;

Ok(ResolvedDisk::new(disk_nvme::NvmeDisk::new(namespace)).context("invalid disk")?)
}
}

#[derive(MeshPayload, Default)]
#[derive(MeshPayload)]
pub struct NvmeDiskConfig {
pub debug_id: String,
pub pci_id: String,
pub nsid: u32,
}

impl Default for NvmeDiskConfig {
fn default() -> Self {
Self {
debug_id: "force-load".to_string(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More bikeshedding: let's just call this name instead of debug_id.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Renamed debug_id to name throughout the codebase including struct fields, method names, and all call sites. (commit 02fbdbc)

pci_id: String::new(),
nsid: 0,
}
}
}

impl ResourceId<DiskHandleKind> for NvmeDiskConfig {
const ID: &'static str = "nvme";
}
Expand Down
18 changes: 15 additions & 3 deletions vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ pub struct NvmeDriver<T: DeviceBacking> {
#[inspect(flatten)]
task: Option<TaskControl<DriverWorkerTask<T>, WorkerState>>,
device_id: String,
debug_id: String,
identify: Option<Arc<spec::IdentifyController>>,
#[inspect(skip)]
driver: VmTaskDriver,
Expand Down Expand Up @@ -179,11 +180,13 @@ impl<T: DeviceBacking> NvmeDriver<T> {
cpu_count: u32,
device: T,
bounce_buffer: bool,
debug_id: String,
) -> anyhow::Result<Self> {
let pci_id = device.id().to_owned();
let mut this = Self::new_disabled(driver_source, cpu_count, device, bounce_buffer)
.instrument(tracing::info_span!("nvme_new_disabled", pci_id))
.await?;
let mut this =
Self::new_disabled(driver_source, cpu_count, device, bounce_buffer, debug_id)
.instrument(tracing::info_span!("nvme_new_disabled", pci_id))
.await?;
match this
.enable(cpu_count as u16)
.instrument(tracing::info_span!("nvme_enable", pci_id))
Expand All @@ -208,6 +211,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
cpu_count: u32,
mut device: T,
bounce_buffer: bool,
debug_id: String,
) -> anyhow::Result<Self> {
let driver = driver_source.simple();
let bar0 = Bar0(
Expand Down Expand Up @@ -248,6 +252,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {

Ok(Self {
device_id: device.id().to_owned(),
debug_id,
task: Some(TaskControl::new(DriverWorkerTask {
device,
driver: driver.clone(),
Expand Down Expand Up @@ -568,6 +573,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
mut device: T,
saved_state: &NvmeDriverSavedState,
bounce_buffer: bool,
debug_id: String,
) -> anyhow::Result<Self> {
let driver = driver_source.simple();
let bar0_mapping = device
Expand All @@ -594,6 +600,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {

let mut this = Self {
device_id: device.id().to_owned(),
debug_id,
task: Some(TaskControl::new(DriverWorkerTask {
device,
driver: driver.clone(),
Expand Down Expand Up @@ -739,6 +746,11 @@ impl<T: DeviceBacking> NvmeDriver<T> {
pub fn update_servicing_flags(&mut self, nvme_keepalive: bool) {
self.nvme_keepalive = nvme_keepalive;
}

/// Get the debug ID.
pub fn debug_id(&self) -> &str {
&self.debug_id
}
}

async fn handle_asynchronous_events(
Expand Down
44 changes: 35 additions & 9 deletions vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,14 @@ async fn test_nvme_ioqueue_max_mqes(driver: DefaultDriver) {
let cap: Cap = Cap::new().with_mqes_z(max_u16);
device.set_mock_response_u64(Some((0, cap.into())));

let driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false).await;
let driver = NvmeDriver::new(
&driver_source,
CPU_COUNT,
device,
false,
"test-nvme".to_string(),
)
.await;
assert!(driver.is_ok());
}

Expand Down Expand Up @@ -113,7 +120,14 @@ async fn test_nvme_ioqueue_invalid_mqes(driver: DefaultDriver) {
// Setup mock response at offset 0
let cap: Cap = Cap::new().with_mqes_z(0);
device.set_mock_response_u64(Some((0, cap.into())));
let driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false).await;
let driver = NvmeDriver::new(
&driver_source,
CPU_COUNT,
device,
false,
"test-nvme".to_string(),
)
.await;

assert!(driver.is_err());
}
Expand Down Expand Up @@ -150,9 +164,15 @@ async fn test_nvme_driver(driver: DefaultDriver, allow_dma: bool) {
.await
.unwrap();
let device = NvmeTestEmulatedDevice::new(nvme, msi_set, dma_client.clone());
let driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false)
.await
.unwrap();
let driver = NvmeDriver::new(
&driver_source,
CPU_COUNT,
device,
false,
"test-nvme".to_string(),
)
.await
.unwrap();
let namespace = driver.namespace(1).await.unwrap();

// Act: Write 1024 bytes of data to disk starting at LBA 1.
Expand Down Expand Up @@ -266,9 +286,15 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
.unwrap();

let device = NvmeTestEmulatedDevice::new(nvme_ctrl, msi_x, dma_client.clone());
let mut nvme_driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false)
.await
.unwrap();
let mut nvme_driver = NvmeDriver::new(
&driver_source,
CPU_COUNT,
device,
false,
"test-nvme".to_string(),
)
.await
.unwrap();
let _ns1 = nvme_driver.namespace(1).await.unwrap();
let saved_state = nvme_driver.save().await.unwrap();
// As of today we do not save namespace data to avoid possible conflict
Expand Down Expand Up @@ -304,7 +330,7 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {

let _new_device = NvmeTestEmulatedDevice::new(new_nvme_ctrl, new_msi_x, dma_client.clone());
// TODO: Memory restore is disabled for emulated DMA, uncomment once fixed.
// let _new_nvme_driver = NvmeDriver::restore(&driver_source, CPU_COUNT, new_device, &saved_state)
// let _new_nvme_driver = NvmeDriver::restore(&driver_source, CPU_COUNT, new_device, &saved_state, false, "test-nvme".to_string())
// .await
// .unwrap();
}
Expand Down