diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index 925ac7c59f9..8f0e641677a 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -1119,6 +1119,7 @@ sled 2eb69596-f081-4e2d-9425-9994926e0832 (role = Gimlet, serial serial1) found at: from fake sled agent address: [fd00:1122:3344:102::1]:12345 usable hw threads: 10 + CPU family: amd_milan usable memory (GiB): 0 reservoir (GiB): 0 physical disks: @@ -1233,6 +1234,7 @@ sled 32d8d836-4d8a-4e54-8fa9-f31d79c42646 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 usable hw threads: 10 + CPU family: amd_milan usable memory (GiB): 0 reservoir (GiB): 0 physical disks: @@ -1347,6 +1349,7 @@ sled 89d02b1b-478c-401a-8e28-7a26f74fa41b (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 usable hw threads: 10 + CPU family: amd_milan usable memory (GiB): 0 reservoir (GiB): 0 physical disks: diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout index 8d19d5d30d5..deba15ab117 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout @@ -124,6 +124,7 @@ sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c (role = Gimlet, serial serial1) found at: from fake sled agent address: [fd00:1122:3344:102::1]:12345 usable hw threads: 10 + CPU family: amd_milan usable memory (GiB): 0 reservoir (GiB): 0 physical disks: @@ -235,6 +236,7 @@ sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 usable hw threads: 10 + CPU family: amd_milan usable memory (GiB): 0 reservoir 
(GiB): 0 physical disks: @@ -348,6 +350,7 @@ sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 usable hw threads: 10 + CPU family: amd_milan usable memory (GiB): 0 reservoir (GiB): 0 physical disks: diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 3b6954f5e0a..6f7d99259ba 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -40,9 +40,9 @@ use omicron_uuid_kinds::{SledUuid, ZpoolUuid}; use schemars::schema::{Schema, SchemaObject}; use schemars::{JsonSchema, SchemaGenerator}; use serde::{Deserialize, Serialize}; -// Export this type for convenience -- this way, dependents don't have to +// Export these types for convenience -- this way, dependents don't have to // depend on sled-hardware-types. -pub use sled_hardware_types::Baseboard; +pub use sled_hardware_types::{Baseboard, SledCpuFamily}; use strum::EnumIter; use tufaceous_artifact::{ArtifactHash, KnownArtifactKind}; @@ -121,6 +121,7 @@ pub struct Inventory { pub baseboard: Baseboard, pub usable_hardware_threads: u32, pub usable_physical_ram: ByteCount, + pub cpu_family: SledCpuFamily, pub reservoir_size: ByteCount, pub disks: Vec, pub zpools: Vec, diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 02dc3bce2c9..61f904bba0f 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -8,6 +8,7 @@ use crate::ArtifactHash; use crate::Generation; use crate::PhysicalDiskKind; use crate::omicron_zone_config::{self, OmicronZoneNic}; +use crate::sled_cpu_family::SledCpuFamily; use crate::typed_uuid::DbTypedUuid; use crate::{ ByteCount, MacAddr, Name, ServiceKind, SqlU8, SqlU16, SqlU32, @@ -910,6 +911,7 @@ pub struct InvSledAgent { pub sled_role: SledRole, pub usable_hardware_threads: SqlU32, pub usable_physical_ram: ByteCount, + pub cpu_family: SledCpuFamily, pub 
reservoir_size: ByteCount, // Soft foreign key to an `InvOmicronSledConfig` pub ledgered_sled_config: Option>, @@ -1325,6 +1327,7 @@ impl InvSledAgent { usable_physical_ram: ByteCount::from( sled_agent.usable_physical_ram, ), + cpu_family: sled_agent.cpu_family.into(), reservoir_size: ByteCount::from(sled_agent.reservoir_size), ledgered_sled_config: ledgered_sled_config.map(From::from), reconciler_status, diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index d9e80c57897..9e2e7a3d329 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -104,6 +104,7 @@ mod silo_group; mod silo_user; mod silo_user_password_hash; mod sled; +mod sled_cpu_family; mod sled_instance; mod sled_policy; mod sled_resource_vmm; @@ -225,6 +226,7 @@ pub use silo_group::*; pub use silo_user::*; pub use silo_user_password_hash::*; pub use sled::*; +pub use sled_cpu_family::*; pub use sled_instance::*; pub use sled_policy::to_db_sled_policy; // Do not expose DbSledPolicy pub use sled_resource_vmm::*; diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 14e6ffcc8f0..24407614c53 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(179, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(180, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(180, "sled-cpu-family"), KnownVersion::new(179, "add-pending-mgs-updates-host-phase-1"), KnownVersion::new(178, "change-lldp-management-ip-to-inet"), KnownVersion::new(177, "add-host-ereport-part-number"), diff --git a/nexus/db-model/src/sled.rs b/nexus/db-model/src/sled.rs index 6ed06e20021..631cc92de0a 100644 --- a/nexus/db-model/src/sled.rs +++ b/nexus/db-model/src/sled.rs @@ -6,6 +6,7 @@ use super::{ByteCount, Generation, SledState, SqlU16, SqlU32}; use crate::collection::DatastoreCollectionConfig; use crate::ipv6; use crate::sled::shared::Baseboard; +use crate::sled_cpu_family::SledCpuFamily; use crate::sled_policy::DbSledPolicy; use chrono::{DateTime, Utc}; use db_macros::Asset; @@ -40,6 +41,8 @@ pub struct SledSystemHardware { // current VMM reservoir size pub reservoir_size: ByteCount, + + pub cpu_family: SledCpuFamily, } /// Database representation of a Sled. @@ -84,6 +87,16 @@ pub struct Sled { // ServiceAddress (Repo Depot API). Uses `ip`. pub repo_depot_port: SqlU16, + + /// The family of this sled's CPU. + /// + /// This is primarily useful for questions about instance CPU platform + /// compatibility; it is too broad for topology-related sled selection + /// and more precise than a more general report of microarchitecture. We + /// likely should include much more about the sled's CPU alongside this for + /// those broader questions and reporting (see + /// for examples). + pub cpu_family: SledCpuFamily, } impl Sled { @@ -185,6 +198,7 @@ impl From for params::SledAgentInfo { usable_physical_ram: sled.usable_physical_ram.into(), reservoir_size: sled.reservoir_size.into(), generation: sled.sled_agent_gen.into(), + cpu_family: sled.cpu_family.into(), decommissioned, } } @@ -229,6 +243,8 @@ pub struct SledUpdate { // ServiceAddress (Repo Depot API). Uses `ip`. 
pub repo_depot_port: SqlU16, + pub cpu_family: SledCpuFamily, + // Generation number - owned and incremented by sled-agent. pub sled_agent_gen: Generation, } @@ -258,6 +274,7 @@ impl SledUpdate { ip: addr.ip().into(), port: addr.port().into(), repo_depot_port: repo_depot_port.into(), + cpu_family: hardware.cpu_family, sled_agent_gen, } } @@ -296,6 +313,7 @@ impl SledUpdate { repo_depot_port: self.repo_depot_port, last_used_address, sled_agent_gen: self.sled_agent_gen, + cpu_family: self.cpu_family, } } diff --git a/nexus/db-model/src/sled_cpu_family.rs b/nexus/db-model/src/sled_cpu_family.rs new file mode 100644 index 00000000000..703728eca1d --- /dev/null +++ b/nexus/db-model/src/sled_cpu_family.rs @@ -0,0 +1,50 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use serde::{Deserialize, Serialize}; + +impl_enum_type!( + SledCpuFamilyEnum: + + #[derive( + Copy, + Clone, + Debug, + PartialEq, + AsExpression, + FromSqlRow, + Serialize, + Deserialize + )] + pub enum SledCpuFamily; + + Unknown => b"unknown" + AmdMilan => b"amd_milan" + AmdTurin => b"amd_turin" + AmdTurinDense => b"amd_turin_dense" +); + +impl From for SledCpuFamily { + fn from(value: nexus_sled_agent_shared::inventory::SledCpuFamily) -> Self { + use nexus_sled_agent_shared::inventory::SledCpuFamily as InputFamily; + match value { + InputFamily::Unknown => Self::Unknown, + InputFamily::AmdMilan => Self::AmdMilan, + InputFamily::AmdTurin => Self::AmdTurin, + InputFamily::AmdTurinDense => Self::AmdTurinDense, + } + } +} + +impl From for nexus_sled_agent_shared::inventory::SledCpuFamily { + fn from(value: SledCpuFamily) -> Self { + match value { + SledCpuFamily::Unknown => Self::Unknown, + SledCpuFamily::AmdMilan => Self::AmdMilan, + SledCpuFamily::AmdTurin => Self::AmdTurin, + SledCpuFamily::AmdTurinDense => 
Self::AmdTurinDense, + } + } +} diff --git a/nexus/db-queries/src/db/datastore/crucible_dataset.rs b/nexus/db-queries/src/db/datastore/crucible_dataset.rs index 83b6cd6cb6a..fd9eee898bf 100644 --- a/nexus/db-queries/src/db/datastore/crucible_dataset.rs +++ b/nexus/db-queries/src/db/datastore/crucible_dataset.rs @@ -294,6 +294,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use nexus_db_model::Generation; use nexus_db_model::SledBaseboard; + use nexus_db_model::SledCpuFamily; use nexus_db_model::SledSystemHardware; use nexus_db_model::SledUpdate; use omicron_common::api::external::ByteCount; @@ -323,6 +324,7 @@ mod test { usable_hardware_threads: 128, usable_physical_ram: (64 << 30).try_into().unwrap(), reservoir_size: (16 << 30).try_into().unwrap(), + cpu_family: SledCpuFamily::AmdMilan, }, Uuid::new_v4(), Generation::new(), diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index dd5c27c5056..2403529d724 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -1444,6 +1444,8 @@ impl DataStore { sled_agent.usable_physical_ram, ) .into_sql::(), + nexus_db_model::SledCpuFamily::from(sled_agent.cpu_family) + .into_sql::(), nexus_db_model::ByteCount::from( sled_agent.reservoir_size, ) @@ -1498,6 +1500,7 @@ impl DataStore { sa_dsl::sled_role, sa_dsl::usable_hardware_threads, sa_dsl::usable_physical_ram, + sa_dsl::cpu_family, sa_dsl::reservoir_size, sa_dsl::ledgered_sled_config, sa_dsl::reconciler_status_kind, @@ -1529,6 +1532,7 @@ impl DataStore { _sled_role, _usable_hardware_threads, _usable_physical_ram, + _cpu_family, _reservoir_size, _ledgered_sled_config, _reconciler_status_kind, @@ -3958,6 +3962,7 @@ impl DataStore { sled_role: s.sled_role.into(), usable_hardware_threads: u32::from(s.usable_hardware_threads), usable_physical_ram: s.usable_physical_ram.into(), + cpu_family: s.cpu_family.into(), reservoir_size: s.reservoir_size.into(), 
// For disks, zpools, and datasets, the map for a sled ID is // only populated if there is at least one disk/zpool/dataset diff --git a/nexus/db-queries/src/db/datastore/physical_disk.rs b/nexus/db-queries/src/db/datastore/physical_disk.rs index 9409c6c9e1d..0012ee54f64 100644 --- a/nexus/db-queries/src/db/datastore/physical_disk.rs +++ b/nexus/db-queries/src/db/datastore/physical_disk.rs @@ -340,7 +340,7 @@ mod test { use nexus_db_lookup::LookupPath; use nexus_sled_agent_shared::inventory::{ Baseboard, ConfigReconcilerInventoryStatus, Inventory, InventoryDisk, - SledRole, ZoneImageResolverInventory, + SledCpuFamily, SledRole, ZoneImageResolverInventory, }; use nexus_types::identity::Asset; use omicron_common::api::external::ByteCount; @@ -693,6 +693,7 @@ mod test { sled_id: SledUuid::from_untyped_uuid(sled.id()), usable_hardware_threads: 10, usable_physical_ram: ByteCount::from(1024 * 1024), + cpu_family: SledCpuFamily::AmdMilan, disks, zpools: vec![], datasets: vec![], diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 435a2e9d3b6..39ce5e08c36 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -309,6 +309,7 @@ impl DataStore { .eq(sled_update.usable_hardware_threads), dsl::usable_physical_ram.eq(sled_update.usable_physical_ram), dsl::reservoir_size.eq(sled_update.reservoir_size), + dsl::cpu_family.eq(sled_update.cpu_family), dsl::sled_agent_gen.eq(sled_update.sled_agent_gen), )) .filter(dsl::sled_agent_gen.lt(sled_update.sled_agent_gen)) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index b6aaf5b4661..05195def6df 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -515,6 +515,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use nexus_db_model::Generation; use nexus_db_model::SledBaseboard; + use 
nexus_db_model::SledCpuFamily; use nexus_db_model::SledSystemHardware; use nexus_db_model::SledUpdate; use nexus_db_model::Zpool; @@ -617,6 +618,7 @@ mod test { usable_hardware_threads: 128, usable_physical_ram: (64 << 30).try_into().unwrap(), reservoir_size: (16 << 30).try_into().unwrap(), + cpu_family: SledCpuFamily::AmdMilan, }, rack_id, Generation::new(), diff --git a/nexus/db-queries/src/db/pub_test_utils/helpers.rs b/nexus/db-queries/src/db/pub_test_utils/helpers.rs index c81f6440d0a..9369324e72a 100644 --- a/nexus/db-queries/src/db/pub_test_utils/helpers.rs +++ b/nexus/db-queries/src/db/pub_test_utils/helpers.rs @@ -25,6 +25,7 @@ use nexus_db_model::ProjectImage; use nexus_db_model::ProjectImageIdentity; use nexus_db_model::Resources; use nexus_db_model::SledBaseboard; +use nexus_db_model::SledCpuFamily; use nexus_db_model::SledSystemHardware; use nexus_db_model::SledUpdate; use nexus_db_model::Snapshot; @@ -77,6 +78,7 @@ pub struct SledSystemHardwareBuilder { usable_hardware_threads: u32, usable_physical_ram: i64, reservoir_size: i64, + cpu_family: SledCpuFamily, } impl Default for SledSystemHardwareBuilder { @@ -86,6 +88,7 @@ impl Default for SledSystemHardwareBuilder { usable_hardware_threads: 4, usable_physical_ram: 1 << 40, reservoir_size: 1 << 39, + cpu_family: SledCpuFamily::AmdMilan, } } } @@ -121,12 +124,18 @@ impl SledSystemHardwareBuilder { self } + pub fn cpu_family(&mut self, family: SledCpuFamily) -> &mut Self { + self.cpu_family = family; + self + } + pub fn build(&self) -> SledSystemHardware { SledSystemHardware { is_scrimlet: self.is_scrimlet, usable_hardware_threads: self.usable_hardware_threads, usable_physical_ram: self.usable_physical_ram.try_into().unwrap(), reservoir_size: self.reservoir_size.try_into().unwrap(), + cpu_family: self.cpu_family, } } } diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs index 616f7df5f3b..39210a32d6f 100644 --- a/nexus/db-schema/src/enums.rs +++ b/nexus/db-schema/src/enums.rs @@ -76,6 
+76,7 @@ define_enums! { RouterRouteKindEnum => "router_route_kind", SagaStateEnum => "saga_state", ServiceKindEnum => "service_kind", + SledCpuFamilyEnum => "sled_cpu_family", SledPolicyEnum => "sled_policy", SledRoleEnum => "sled_role", SledStateEnum => "sled_state", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index f4a0592475f..0cea5de1cd8 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -971,6 +971,7 @@ table! { sled_state -> crate::enums::SledStateEnum, sled_agent_gen -> Int8, repo_depot_port -> Int4, + cpu_family -> crate::enums::SledCpuFamilyEnum, } } @@ -1620,6 +1621,7 @@ table! { sled_role -> crate::enums::SledRoleEnum, usable_hardware_threads -> Int8, usable_physical_ram -> Int8, + cpu_family -> crate::enums::SledCpuFamilyEnum, reservoir_size -> Int8, ledgered_sled_config -> Nullable, diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index 528edd98996..6a30c02584c 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -656,6 +656,7 @@ impl CollectionBuilder { baseboard_id, usable_hardware_threads: inventory.usable_hardware_threads, usable_physical_ram: inventory.usable_physical_ram, + cpu_family: inventory.cpu_family, reservoir_size: inventory.reservoir_size, time_collected, sled_id, diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index 09d2857ca12..d5c31bed05b 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -726,6 +726,7 @@ mod test { use nexus_sled_agent_shared::inventory::OmicronZoneConfig; use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; use nexus_sled_agent_shared::inventory::OmicronZoneType; + use nexus_sled_agent_shared::inventory::SledCpuFamily; use nexus_types::inventory::Collection; use omicron_cockroach_metrics::CockroachClusterAdminClient; use omicron_common::api::external::Generation; @@ -967,6 +968,7 @@ mod test { None, None, 
sim::ZpoolConfig::None, + SledCpuFamily::AmdMilan, ); let agent = diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index ea413883360..d0fca08aa50 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -29,6 +29,7 @@ use nexus_sled_agent_shared::inventory::InventoryZpool; use nexus_sled_agent_shared::inventory::OmicronSledConfig; use nexus_sled_agent_shared::inventory::OmicronZonesConfig; use nexus_sled_agent_shared::inventory::OrphanedDataset; +use nexus_sled_agent_shared::inventory::SledCpuFamily; use nexus_sled_agent_shared::inventory::SledRole; use nexus_sled_agent_shared::inventory::ZoneImageResolverInventory; use nexus_types::inventory::BaseboardId; @@ -966,6 +967,7 @@ pub fn sled_agent( sled_id, usable_hardware_threads: 10, usable_physical_ram: ByteCount::from(1024 * 1024), + cpu_family: SledCpuFamily::AmdMilan, disks, zpools, datasets, diff --git a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs index e6b1334d391..4d97d109cf5 100644 --- a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs +++ b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs @@ -202,6 +202,7 @@ mod api_impl { use nexus_sled_agent_shared::inventory::Inventory; use nexus_sled_agent_shared::inventory::MupdateOverrideInventory; use nexus_sled_agent_shared::inventory::OmicronSledConfig; + use nexus_sled_agent_shared::inventory::SledCpuFamily; use nexus_sled_agent_shared::inventory::SledRole; use nexus_sled_agent_shared::inventory::ZoneImageResolverInventory; use nexus_sled_agent_shared::inventory::ZoneManifestInventory; @@ -313,6 +314,7 @@ mod api_impl { usable_hardware_threads: 64, usable_physical_ram: (1 << 30).into(), reservoir_size: (1 << 29).into(), + cpu_family: SledCpuFamily::AmdMilan, disks: Vec::new(), zpools: Vec::new(), datasets: Vec::new(), diff --git a/nexus/reconfigurator/planning/src/system.rs 
b/nexus/reconfigurator/planning/src/system.rs index e980b5c5049..3a8560438b4 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -24,6 +24,7 @@ use nexus_sled_agent_shared::inventory::InventoryDisk; use nexus_sled_agent_shared::inventory::InventoryZpool; use nexus_sled_agent_shared::inventory::MupdateOverrideBootInventory; use nexus_sled_agent_shared::inventory::OmicronSledConfig; +use nexus_sled_agent_shared::inventory::SledCpuFamily; use nexus_sled_agent_shared::inventory::SledRole; use nexus_sled_agent_shared::inventory::ZoneImageResolverInventory; use nexus_sled_agent_shared::inventory::ZoneKind; @@ -1277,6 +1278,7 @@ impl Sled { sled_id, usable_hardware_threads: 10, usable_physical_ram: ByteCount::from(1024 * 1024), + cpu_family: SledCpuFamily::AmdMilan, // Populate disks, appearing like a real device. disks: zpools .values() @@ -1484,6 +1486,7 @@ impl Sled { sled_id, usable_hardware_threads: inv_sled_agent.usable_hardware_threads, usable_physical_ram: inv_sled_agent.usable_physical_ram, + cpu_family: inv_sled_agent.cpu_family, disks: vec![], zpools: vec![], datasets: vec![], diff --git a/nexus/reconfigurator/rendezvous/src/crucible_dataset.rs b/nexus/reconfigurator/rendezvous/src/crucible_dataset.rs index 0d4fd8a8382..22d68157bf2 100644 --- a/nexus/reconfigurator/rendezvous/src/crucible_dataset.rs +++ b/nexus/reconfigurator/rendezvous/src/crucible_dataset.rs @@ -130,6 +130,7 @@ mod tests { use async_bb8_diesel::AsyncSimpleConnection; use nexus_db_model::Generation; use nexus_db_model::SledBaseboard; + use nexus_db_model::SledCpuFamily; use nexus_db_model::SledSystemHardware; use nexus_db_model::SledUpdate; use nexus_db_model::Zpool; @@ -201,6 +202,7 @@ mod tests { usable_hardware_threads: 128, usable_physical_ram: (64 << 30).try_into().unwrap(), reservoir_size: (16 << 30).try_into().unwrap(), + cpu_family: SledCpuFamily::Unknown, }, Uuid::new_v4(), Generation::new(), diff --git 
a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index 88fea70e7a1..3a2c6ff404d 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -180,7 +180,8 @@ mod test { use id_map::IdMap; use itertools::Itertools as _; use nexus_db_model::{ - ByteCount, SledBaseboard, SledSystemHardware, SledUpdate, Zpool, + ByteCount, SledBaseboard, SledCpuFamily, SledSystemHardware, + SledUpdate, Zpool, }; use nexus_db_queries::authn; use nexus_db_queries::context::OpContext; @@ -359,6 +360,7 @@ mod test { usable_hardware_threads: 4, usable_physical_ram: ByteCount(1000.into()), reservoir_size: ByteCount(999.into()), + cpu_family: SledCpuFamily::AmdMilan, }, rack_id, nexus_db_model::Generation::new(), diff --git a/nexus/src/app/background/tasks/inventory_collection.rs b/nexus/src/app/background/tasks/inventory_collection.rs index 87c13422bcc..a55d60124d2 100644 --- a/nexus/src/app/background/tasks/inventory_collection.rs +++ b/nexus/src/app/background/tasks/inventory_collection.rs @@ -267,6 +267,7 @@ mod test { use crate::app::background::BackgroundTask; use nexus_db_model::Generation; use nexus_db_model::SledBaseboard; + use nexus_db_model::SledCpuFamily; use nexus_db_model::SledSystemHardware; use nexus_db_model::SledUpdate; use nexus_db_queries::context::OpContext; @@ -443,6 +444,7 @@ mod test { usable_physical_ram: ByteCount::from_gibibytes_u32(16) .into(), reservoir_size: ByteCount::from_gibibytes_u32(8).into(), + cpu_family: SledCpuFamily::AmdMilan, }, rack_id, Generation::new(), diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 715a1504081..799cb4136f7 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -77,6 +77,7 @@ impl super::Nexus { usable_hardware_threads: info.usable_hardware_threads, usable_physical_ram: info.usable_physical_ram.into(), reservoir_size: info.reservoir_size.into(), + cpu_family: 
info.cpu_family.into(), }, self.rack_id, info.generation.into(), diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 9a76249fb12..f68c5a96573 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -38,6 +38,7 @@ use nexus_db_queries::db::pub_test_utils::crdb; use nexus_sled_agent_shared::inventory::HostPhase2DesiredSlots; use nexus_sled_agent_shared::inventory::OmicronSledConfig; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; +use nexus_sled_agent_shared::inventory::SledCpuFamily; use nexus_sled_agent_shared::recovery_silo::RecoverySiloConfig; use nexus_test_interface::NexusServer; use nexus_types::deployment::Blueprint; @@ -1902,7 +1903,18 @@ pub async fn start_sled_agent( Some(nexus_address), Some(update_directory), sim::ZpoolConfig::None, + SledCpuFamily::AmdMilan, ); + start_sled_agent_with_config(log, &config, sled_index, simulated_upstairs) + .await +} + +pub async fn start_sled_agent_with_config( + log: Logger, + config: &sim::Config, + sled_index: u16, + simulated_upstairs: &Arc, +) -> Result { let server = sim::Server::start(&config, &log, true, simulated_upstairs, sled_index) .await diff --git a/nexus/tests/integration_tests/rack.rs b/nexus/tests/integration_tests/rack.rs index 9eebe3d2130..5b2fb969433 100644 --- a/nexus/tests/integration_tests/rack.rs +++ b/nexus/tests/integration_tests/rack.rs @@ -7,8 +7,10 @@ use http::Method; use http::StatusCode; use nexus_client::types::SledId; use nexus_db_model::SledBaseboard; +use nexus_db_model::SledCpuFamily as DbSledCpuFamily; use nexus_db_model::SledSystemHardware; use nexus_db_model::SledUpdate; +use nexus_sled_agent_shared::inventory::SledCpuFamily; use nexus_sled_agent_shared::inventory::SledRole; use nexus_test_utils::TEST_SUITE_PASSWORD; use nexus_test_utils::http_testing::AuthnMode; @@ -135,6 +137,7 @@ async fn test_sled_list_uninitialized(cptestctx: &ControlPlaneTestContext) { usable_hardware_threads: 32, usable_physical_ram: 
ByteCount::from_gibibytes_u32(100), reservoir_size: ByteCount::from_mebibytes_u32(100), + cpu_family: SledCpuFamily::Unknown, generation: Generation::new(), decommissioned: false, }; @@ -240,6 +243,7 @@ async fn test_sled_add(cptestctx: &ControlPlaneTestContext) { usable_hardware_threads: 8, usable_physical_ram: (1 << 30).try_into().unwrap(), reservoir_size: (1 << 20).try_into().unwrap(), + cpu_family: DbSledCpuFamily::Unknown, }, nexus.rack_id(), Generation::new().into(), diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs index a1a707d12a9..fdb12c42a26 100644 --- a/nexus/types/src/internal_api/params.rs +++ b/nexus/types/src/internal_api/params.rs @@ -8,7 +8,7 @@ use crate::deployment::Blueprint; use crate::external_api::params::PhysicalDiskKind; use crate::external_api::shared::Baseboard; use crate::external_api::shared::IpRange; -use nexus_sled_agent_shared::inventory::SledRole; +use nexus_sled_agent_shared::inventory::{SledCpuFamily, SledRole}; use nexus_sled_agent_shared::recovery_silo::RecoverySiloConfig; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Generation; @@ -56,6 +56,9 @@ pub struct SledAgentInfo { /// Must be smaller than "usable_physical_ram" pub reservoir_size: ByteCount, + /// The family of the sled's CPU. 
+ pub cpu_family: SledCpuFamily, + /// The generation number of this request from sled-agent pub generation: Generation, diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index 28854edbd6c..0bac20f86e8 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -30,6 +30,7 @@ use nexus_sled_agent_shared::inventory::InventoryDisk; use nexus_sled_agent_shared::inventory::InventoryZpool; use nexus_sled_agent_shared::inventory::OmicronSledConfig; use nexus_sled_agent_shared::inventory::OmicronZoneConfig; +use nexus_sled_agent_shared::inventory::SledCpuFamily; use nexus_sled_agent_shared::inventory::SledRole; use nexus_sled_agent_shared::inventory::ZoneImageResolverInventory; use omicron_common::api::external::ByteCount; @@ -683,6 +684,7 @@ pub struct SledAgent { pub sled_role: SledRole, pub usable_hardware_threads: u32, pub usable_physical_ram: ByteCount, + pub cpu_family: SledCpuFamily, pub reservoir_size: ByteCount, pub disks: Vec, pub zpools: Vec, diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index f64970a7bab..b01e69d70b1 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -611,6 +611,7 @@ fn display_sleds( sled_role, usable_hardware_threads, usable_physical_ram, + cpu_family, reservoir_size, disks, zpools, @@ -643,6 +644,7 @@ fn display_sleds( )?; writeln!(indented, "address: {}", sled_agent_address)?; writeln!(indented, "usable hw threads: {}", usable_hardware_threads)?; + writeln!(indented, "CPU family: {}", cpu_family)?; writeln!( indented, "usable memory (GiB): {}", diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 2729d155ec9..8cbfe5eb09f 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -7532,6 +7532,14 @@ } ] }, + "cpu_family": { + "description": "The family of the sled's CPU.", + "allOf": [ + { + "$ref": "#/components/schemas/SledCpuFamily" + } + ] + }, "decommissioned": { 
"description": "Whether the sled-agent has been decommissioned by nexus\n\nThis flag is only set to true by nexus. Setting it on an upsert from sled-agent has no effect.", "type": "boolean" @@ -7587,6 +7595,7 @@ }, "required": [ "baseboard", + "cpu_family", "decommissioned", "generation", "repo_depot_port", @@ -7597,6 +7606,39 @@ "usable_physical_ram" ] }, + "SledCpuFamily": { + "description": "Identifies the kind of CPU present on a sled, determined by reading CPUID.\n\nThis is intended to broadly support the control plane answering the question \"can I run this instance on that sled?\" given an instance with either no or some CPU platform requirement. It is not enough information for more precise placement questions - for example, is a CPU a high-frequency part or many-core part? We don't include Genoa here, but in that CPU family there are high frequency parts, many-core parts, and large-cache parts. To support those questions (or satisfactorily answer #8730) we would need to collect additional information and send it along.", + "oneOf": [ + { + "description": "The CPU vendor or its family number don't correspond to any of the known family variants.", + "type": "string", + "enum": [ + "unknown" + ] + }, + { + "description": "AMD Milan processors (or very close). Could be an actual Milan in a Gimlet, a close-to-Milan client Zen 3 part, or Zen 4 (for which Milan is the greatest common denominator).", + "type": "string", + "enum": [ + "amd_milan" + ] + }, + { + "description": "AMD Turin processors (or very close). Could be an actual Turin in a Cosmo, or a close-to-Turin client Zen 5 part.", + "type": "string", + "enum": [ + "amd_turin" + ] + }, + { + "description": "AMD Turin Dense processors. 
There are no \"Turin Dense-like\" CPUs unlike other cases, so this means a bona fide Zen 5c Turin Dense part.", + "type": "string", + "enum": [ + "amd_turin_dense" + ] + } + ] + }, "SledId": { "type": "object", "properties": { diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index b0c35ef72fa..22bd3c8b072 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -5000,6 +5000,9 @@ "baseboard": { "$ref": "#/components/schemas/Baseboard" }, + "cpu_family": { + "$ref": "#/components/schemas/SledCpuFamily" + }, "datasets": { "type": "array", "items": { @@ -5063,6 +5066,7 @@ }, "required": [ "baseboard", + "cpu_family", "datasets", "disks", "reconciler_status", @@ -7007,6 +7011,39 @@ "com4" ] }, + "SledCpuFamily": { + "description": "Identifies the kind of CPU present on a sled, determined by reading CPUID.\n\nThis is intended to broadly support the control plane answering the question \"can I run this instance on that sled?\" given an instance with either no or some CPU platform requirement. It is not enough information for more precise placement questions - for example, is a CPU a high-frequency part or many-core part? We don't include Genoa here, but in that CPU family there are high frequency parts, many-core parts, and large-cache parts. To support those questions (or satisfactorily answer #8730) we would need to collect additional information and send it along.", + "oneOf": [ + { + "description": "The CPU vendor or its family number don't correspond to any of the known family variants.", + "type": "string", + "enum": [ + "unknown" + ] + }, + { + "description": "AMD Milan processors (or very close). Could be an actual Milan in a Gimlet, a close-to-Milan client Zen 3 part, or Zen 4 (for which Milan is the greatest common denominator).", + "type": "string", + "enum": [ + "amd_milan" + ] + }, + { + "description": "AMD Turin processors (or very close). 
Could be an actual Turin in a Cosmo, or a close-to-Turin client Zen 5 part.", + "type": "string", + "enum": [ + "amd_turin" + ] + }, + { + "description": "AMD Turin Dense processors. There are no \"Turin Dense-like\" CPUs unlike other cases, so this means a bona fide Zen 5c Turin Dense part.", + "type": "string", + "enum": [ + "amd_turin_dense" + ] + } + ] + }, "SledDiagnosticsQueryOutput": { "oneOf": [ { diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 3a361c73dbc..8463bdba2b2 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -187,6 +187,24 @@ CREATE TYPE IF NOT EXISTS omicron.public.sled_state AS ENUM ( 'decommissioned' ); +-- The model of CPU installed in a particular sled, discovered by sled-agent +-- and reported to Nexus. This determines what VMs can run on a sled: instances +-- that require a specific minimum CPU platform can only run on sleds whose +-- CPUs support all the features of that platform. +CREATE TYPE IF NOT EXISTS omicron.public.sled_cpu_family AS ENUM ( + -- Sled-agent didn't recognize the sled's CPU. + 'unknown', + + -- AMD Milan, or lab CPU close enough that sled-agent reported it as one. + 'amd_milan', + + -- AMD Turin, or lab CPU close enough that sled-agent reported it as one. + 'amd_turin', + + -- AMD Turin Dense. There are no "Turin Dense-likes", so this is precise. + 'amd_turin_dense' +); + CREATE TABLE IF NOT EXISTS omicron.public.sled ( /* Identity metadata (asset) */ id UUID PRIMARY KEY, @@ -229,7 +247,10 @@ CREATE TABLE IF NOT EXISTS omicron.public.sled ( /* The bound port of the Repo Depot API server, running on the same IP as the sled agent server. */ - repo_depot_port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL + repo_depot_port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL, + + /* The sled's detected CPU family. 
*/ + cpu_family omicron.public.sled_cpu_family NOT NULL ); -- Add an index that ensures a given physical sled (identified by serial and @@ -3706,6 +3727,10 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_agent ( -- present. mupdate_override_boot_disk_error TEXT, + -- The sled's CPU family. This is also duplicated with the `sled` table, + -- similar to `usable_hardware_threads` and friends above. + cpu_family omicron.public.sled_cpu_family NOT NULL, + CONSTRAINT reconciler_status_sled_config_present_if_running CHECK ( (reconciler_status_kind = 'running' AND reconciler_status_sled_config IS NOT NULL) @@ -6524,7 +6549,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '179.0.0', NULL) + (TRUE, NOW(), NOW(), '180.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/sled-cpu-family/up01.sql b/schema/crdb/sled-cpu-family/up01.sql new file mode 100644 index 00000000000..f1bb76f3389 --- /dev/null +++ b/schema/crdb/sled-cpu-family/up01.sql @@ -0,0 +1,6 @@ +CREATE TYPE IF NOT EXISTS omicron.public.sled_cpu_family AS ENUM ( + 'unknown', + 'amd_milan', + 'amd_turin', + 'amd_turin_dense' +); diff --git a/schema/crdb/sled-cpu-family/up02.sql b/schema/crdb/sled-cpu-family/up02.sql new file mode 100644 index 00000000000..1409e918dae --- /dev/null +++ b/schema/crdb/sled-cpu-family/up02.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.sled ADD COLUMN IF NOT EXISTS + cpu_family omicron.public.sled_cpu_family NOT NULL DEFAULT 'unknown'; diff --git a/schema/crdb/sled-cpu-family/up03.sql b/schema/crdb/sled-cpu-family/up03.sql new file mode 100644 index 00000000000..612de867e4f --- /dev/null +++ b/schema/crdb/sled-cpu-family/up03.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.sled ALTER COLUMN cpu_family DROP DEFAULT; diff --git a/schema/crdb/sled-cpu-family/up04.sql b/schema/crdb/sled-cpu-family/up04.sql new file mode 100644 index 00000000000..b2fd0b97156 --- /dev/null +++ b/schema/crdb/sled-cpu-family/up04.sql @@ 
-0,0 +1,2 @@ +ALTER TABLE omicron.public.inv_sled_agent ADD COLUMN IF NOT EXISTS + cpu_family omicron.public.sled_cpu_family NOT NULL DEFAULT 'unknown'; diff --git a/schema/crdb/sled-cpu-family/up05.sql b/schema/crdb/sled-cpu-family/up05.sql new file mode 100644 index 00000000000..61db961a1b5 --- /dev/null +++ b/schema/crdb/sled-cpu-family/up05.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.inv_sled_agent ALTER COLUMN cpu_family DROP DEFAULT; diff --git a/sled-agent/src/bin/sled-agent-sim.rs b/sled-agent/src/bin/sled-agent-sim.rs index ca96b2513e1..88ca421c555 100644 --- a/sled-agent/src/bin/sled-agent-sim.rs +++ b/sled-agent/src/bin/sled-agent-sim.rs @@ -21,7 +21,7 @@ use omicron_sled_agent::sim::{ run_standalone_server, }; use omicron_uuid_kinds::SledUuid; -use sled_hardware_types::Baseboard; +use sled_hardware_types::{Baseboard, SledCpuFamily}; use std::net::SocketAddr; use std::net::SocketAddrV6; @@ -110,6 +110,7 @@ async fn do_run() -> Result<(), CmdError> { hardware_threads: 32, physical_ram: 64 * (1 << 30), reservoir_ram: 32 * (1 << 30), + cpu_family: SledCpuFamily::AmdMilan, baseboard: Baseboard::Gimlet { identifier: format!("sim-{}", args.uuid), model: String::from("sim-gimlet"), @@ -122,6 +123,7 @@ async fn do_run() -> Result<(), CmdError> { Some(args.nexus_addr), Some(tmp.path()), ZpoolConfig::TenVirtualU2s, + SledCpuFamily::AmdMilan, ) }; diff --git a/sled-agent/src/nexus.rs b/sled-agent/src/nexus.rs index 3faeed749bb..e9e28b5c606 100644 --- a/sled-agent/src/nexus.rs +++ b/sled-agent/src/nexus.rs @@ -77,6 +77,28 @@ impl ConvertInto } } +impl ConvertInto + for sled_hardware_types::SledCpuFamily +{ + fn convert(self) -> nexus_client::types::SledCpuFamily { + use sled_hardware_types::SledCpuFamily as SharedSledCpuFamily; + match self { + SharedSledCpuFamily::Unknown => { + nexus_client::types::SledCpuFamily::Unknown + } + SharedSledCpuFamily::AmdMilan => { + nexus_client::types::SledCpuFamily::AmdMilan + } + SharedSledCpuFamily::AmdTurin => { + 
nexus_client::types::SledCpuFamily::AmdTurin + } + SharedSledCpuFamily::AmdTurinDense => { + nexus_client::types::SledCpuFamily::AmdTurinDense + } + } + } +} + // Somewhat arbitrary bound size, large enough that we should never hit it. const QUEUE_SIZE: usize = 256; @@ -275,6 +297,7 @@ impl NexusNotifierTask { .usable_physical_ram_bytes() .into(), reservoir_size: vmm_reservoir_manager.reservoir_size().into(), + cpu_family: hardware.cpu_family().convert(), generation, decommissioned: false, } @@ -654,6 +677,7 @@ mod test { usable_physical_ram: ByteCount::from(1024 * 1024 * 1024u32) .into(), reservoir_size: ByteCount::from(0u32).into(), + cpu_family: nexus_client::types::SledCpuFamily::Unknown, generation: Generation::new(), decommissioned: false, })); diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 3732bca059a..37c74805c3c 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -1155,6 +1155,7 @@ impl ServicePortBuilder { mod tests { use super::*; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus; + use nexus_sled_agent_shared::inventory::SledCpuFamily; use nexus_sled_agent_shared::inventory::ZoneImageResolverInventory; use omicron_common::address::IpRange; use omicron_common::api::external::ByteCount; @@ -1372,6 +1373,7 @@ mod tests { baseboard: Baseboard::Unknown, usable_hardware_threads: 32, usable_physical_ram: ByteCount::try_from(1_u64 << 40).unwrap(), + cpu_family: SledCpuFamily::AmdMilan, reservoir_size: ByteCount::try_from(1_u64 << 40).unwrap(), disks, zpools: vec![], diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 508733d4f2e..5cf06ba0c32 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -1749,7 +1749,7 @@ mod test { use nexus_reconfigurator_blippy::{Blippy, BlippyReportSortKey}; use nexus_sled_agent_shared::inventory::{ Baseboard, 
ConfigReconcilerInventoryStatus, Inventory, InventoryDisk, - OmicronZoneType, SledRole, ZoneImageResolverInventory, + OmicronZoneType, SledCpuFamily, SledRole, ZoneImageResolverInventory, }; use omicron_common::{ address::{Ipv6Subnet, SLED_PREFIX, get_sled_address}, @@ -1775,6 +1775,7 @@ mod test { baseboard: Baseboard::Unknown, usable_hardware_threads: 32, usable_physical_ram: ByteCount::from_gibibytes_u32(16), + cpu_family: SledCpuFamily::AmdMilan, reservoir_size: ByteCount::from_gibibytes_u32(0), disks: (0..u2_count) .map(|i| InventoryDisk { diff --git a/sled-agent/src/sim/config.rs b/sled-agent/src/sim/config.rs index dbd9f00c22e..58454d2a507 100644 --- a/sled-agent/src/sim/config.rs +++ b/sled-agent/src/sim/config.rs @@ -10,7 +10,7 @@ use dropshot::ConfigDropshot; use omicron_uuid_kinds::SledUuid; use serde::Deserialize; use serde::Serialize; -pub use sled_hardware_types::Baseboard; +pub use sled_hardware_types::{Baseboard, SledCpuFamily}; use std::net::Ipv6Addr; use std::net::{IpAddr, SocketAddr}; @@ -56,6 +56,12 @@ pub struct ConfigHardware { pub hardware_threads: u32, pub physical_ram: u64, pub reservoir_ram: u64, + /// The kind of CPU to report the simulated sled as. In reality this is + /// constrained by `baseboard`; a `Baseboard::Gimlet` will only have an + /// `SledCpuFamily::AmdMilan`. A future `Baseboard::Cosmo` will *never* have + /// a `SledCpuFamily::AmdMilan`. Because the baseboard does not imply a + /// specific individual CPU family, though, it's simpler to record here. + pub cpu_family: SledCpuFamily, pub baseboard: Baseboard, } @@ -93,6 +99,7 @@ impl Config { nexus_address: Option, update_directory: Option<&Utf8Path>, zpool_config: ZpoolConfig, + cpu_family: SledCpuFamily, ) -> Config { // This IP range is guaranteed by RFC 6666 to discard traffic. 
// For tests that don't use a Nexus, we use this address to simulate a @@ -133,6 +140,7 @@ impl Config { hardware_threads: TEST_HARDWARE_THREADS, physical_ram: TEST_PHYSICAL_RAM, reservoir_ram: TEST_RESERVOIR_RAM, + cpu_family, baseboard: Baseboard::Gimlet { identifier: format!("sim-{}", id), model: String::from("sim-gimlet"), diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index 05c75e18c0e..f252e327834 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -8,7 +8,7 @@ use super::config::Config; use super::http_entrypoints::api as http_api; use super::sled_agent::SledAgent; use super::storage::PantryServer; -use crate::nexus::NexusClient; +use crate::nexus::{ConvertInto, NexusClient}; use crate::rack_setup::SledConfig; use crate::rack_setup::service::build_initial_blueprint_from_sled_configs; use crate::rack_setup::{ @@ -166,6 +166,7 @@ impl Server { config.hardware.reservoir_ram, ) .unwrap(), + cpu_family: config.hardware.cpu_family.convert(), generation: Generation::new(), decommissioned: false, }, diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index ac3b6b45882..c75d6944b8b 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -756,6 +756,7 @@ impl SledAgent { self.config.hardware.physical_ram, ) .context("usable_physical_ram")?, + cpu_family: self.config.hardware.cpu_family, reservoir_size: ByteCount::try_from( self.config.hardware.reservoir_ram, ) diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 5d23a5794d5..b9927bfed56 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -1097,6 +1097,7 @@ impl SledAgent { self.inner.hardware.online_processor_count(); let usable_physical_ram = self.inner.hardware.usable_physical_ram_bytes(); + let cpu_family = self.inner.hardware.cpu_family(); let reservoir_size = self.inner.instances.reservoir_size(); let sled_role = if is_scrimlet { 
SledRole::Scrimlet } else { SledRole::Gimlet }; @@ -1119,6 +1120,7 @@ impl SledAgent { baseboard, usable_hardware_threads, usable_physical_ram: ByteCount::try_from(usable_physical_ram)?, + cpu_family, reservoir_size, disks, zpools, diff --git a/sled-hardware/src/illumos/mod.rs b/sled-hardware/src/illumos/mod.rs index 057db6012b6..3f673e0b4ca 100644 --- a/sled-hardware/src/illumos/mod.rs +++ b/sled-hardware/src/illumos/mod.rs @@ -9,7 +9,7 @@ use gethostname::gethostname; use illumos_devinfo::{DevInfo, DevLinkType, DevLinks, Node, Property}; use libnvme::{Nvme, controller::Controller}; use omicron_common::disk::{DiskIdentity, DiskVariant}; -use sled_hardware_types::Baseboard; +use sled_hardware_types::{Baseboard, SledCpuFamily}; use slog::Logger; use slog::debug; use slog::error; @@ -797,6 +797,11 @@ impl HardwareManager { .unwrap_or_else(|| Baseboard::unknown()) } + pub fn cpu_family(&self) -> SledCpuFamily { + let log = self.log.new(slog::o!("component" => "detect_cpu_family")); + crate::detect_cpu_family(&log) + } + pub fn online_processor_count(&self) -> u32 { self.inner.lock().unwrap().online_processor_count } diff --git a/sled-hardware/src/lib.rs b/sled-hardware/src/lib.rs index 18c6b4ba3a2..582c13f4053 100644 --- a/sled-hardware/src/lib.rs +++ b/sled-hardware/src/lib.rs @@ -149,3 +149,151 @@ impl MemoryReservations { vmm_eligible } } + +/// Detects the current sled's CPU family using the CPUID instruction. +/// +/// TODO: Ideally we would call into libtopo and pass along the information +/// identified there. See +/// . +/// +/// Everything here is duplicative with CPU identification done by the kernel. +/// You'll even find a very similar (but much more comprehensive) AMD family +/// mapping at `amd_revmap` in `usr/src/uts/intel/os/cpuid_subr.c`. But +/// sled-agent does not yet know about libtopo, getting topo snapshots, walking +/// them, or any of that, so the parsing is performed again here. 
+#[cfg(target_arch = "x86_64")]
+pub fn detect_cpu_family(log: &Logger) -> sled_hardware_types::SledCpuFamily {
+    use core::arch::x86_64::__cpuid_count;
+    use sled_hardware_types::SledCpuFamily;
+
+    // Read leaf 0 to figure out the processor's vendor and whether leaf 1
+    // (which contains family, model, and stepping information) is available.
+    //
+    // SAFETY: this function is only compiled for x86_64, where the CPUID
+    // instruction is always available; executing it has no side effects
+    // beyond writing the four output registers.
+    let leaf_0 = unsafe { __cpuid_count(0, 0) };
+
+    info!(log, "read CPUID leaf 0 to detect CPU vendor"; "values" => ?leaf_0);
+
+    // If leaf 1 is unavailable, there's no way to figure out what family this
+    // processor belongs to.
+    if leaf_0.eax < 1 {
+        return SledCpuFamily::Unknown;
+    }
+
+    // Check the vendor ID string in ebx/ecx/edx.
+    match (leaf_0.ebx, leaf_0.ecx, leaf_0.edx) {
+        // "AuthenticAMD"; see AMD APM volume 3 (March 2024) section E.3.1.
+        (0x68747541, 0x444D4163, 0x69746E65) => {}
+        _ => return SledCpuFamily::Unknown,
+    }
+
+    // Feature detection after this point is AMD-specific - if we find ourselves
+    // supporting other CPU vendors we'll want to split this out accordingly.
+
+    // Per AMD APM volume 3 (March 2024) section E.3.2, the processor family
+    // number is computed as follows:
+    //
+    // - Read bits 11:8 of leaf 1 eax to get the "base" family value. If this
+    //   value is less than 0xF, the family value is equal to the base family
+    //   value.
+    // - If the base family value is 0xF, eax[27:20] contains the "extended"
+    //   family value, and the actual family value is the sum of the base and
+    //   the extended values.
+    //
+    // SAFETY: CPUID is always available on x86_64 (see above), and leaf 0
+    // reported a maximum standard leaf of at least 1, so leaf 1 is valid.
+    let leaf_1 = unsafe { __cpuid_count(1, 0) };
+    let mut family = (leaf_1.eax & 0x00000F00) >> 8;
+    if family == 0xF {
+        family += (leaf_1.eax & 0x0FF00000) >> 20;
+    }
+
+    // Also from the APM volume 3 section E.3.2, the processor model number is
+    // computed as follows:
+    //
+    // - Read bits 7:4 of leaf 1 eax to get the "base" model value.
+    // - If the "base" family value is less than 0xF, the "base" model stands.
+    //   Otherwise, four additional bits of the model come from eax[19:16].
+    //
+    // If the computed family number is 0xF or greater, that implies the "base"
+    // family was 0xF or greater as well.
+    let mut model = (leaf_1.eax & 0x000000F0) >> 4;
+    if family >= 0xF {
+        // Shifting by 12 (not 16) drops eax[19:16] directly into bits 7:4 of
+        // the model, above the four "base" model bits.
+        model |= (leaf_1.eax & 0x000F0000) >> 12;
+    }
+
+    // Note on the register formatting below: the width in `{:#010x}` counts
+    // the leading "0x", so 10 is what yields all eight hex digits of a 32-bit
+    // register.
+    info!(
+        log,
+        "read CPUID leaf 1 to detect CPU family";
+        "leaf1.eax" => format_args!("{:#010x}", leaf_1.eax),
+        "leaf1.ebx" => format_args!("{:#010x}", leaf_1.ebx),
+        "leaf1.ecx" => format_args!("{:#010x}", leaf_1.ecx),
+        "leaf1.edx" => format_args!("{:#010x}", leaf_1.edx),
+        "parsed_family" => format_args!("{family:#x}"),
+        "parsed_model" => format_args!("{model:#x}"),
+    );
+
+    // Match on the family/model ranges we've detected. Notably client parts are
+    // reported as if they were their server counterparts; the feature parity is
+    // close enough that guests probably won't run into issues. This lowers
+    // friction for testing migrations where the control plane would need to
+    // tell what hosts could be compatible with a VMM's CPU platform.
+    //
+    // TODO(?): Exhaustively check that client parts support all CPU features of
+    // the corresponding Oxide CPU platform before doing this "as-if" reporting.
+    // Lab systems built out of client parts may have hardware which support all
+    // features in the corresponding instance CPU platform, but have individual
+    // features disabled in the BIOS or by client part microcode. This can
+    // result in funky situations, like an Oxide CPU platform advertising CPU
+    // features that lab systems don't support. This is unlikely, but take
+    // AVX512 as an example: users can often disable AVX512 entirely on Zen 5
+    // BIOSes. In this case a VM on a 9000-series Ryzen will be told those
+    // instructions are available only for the guest to get #UD at runtime.
+    match (family, model) {
+        (0x19, 0x00..=0x0F) => {
+            // This covers both Milan and Zen 3-based Threadrippers. I don't
+            // have a 5000-series Threadripper on hand to test but I believe
+            // they are feature-compatible.
+            SledCpuFamily::AmdMilan
+        }
+        (0x19, 0x10..=0x1F) => {
+            // This covers both Genoa and Zen 4-based Threadrippers. Again,
+            // don't have a comparable Threadripper to test here.
+            //
+            // We intend to expose Turin and Milan as families a guest can
+            // choose, skipping the Zen 4 EPYC parts. So, round this down to
+            // Milan; if we're here it's a lab system and the alternative is
+            // "unknown".
+            SledCpuFamily::AmdMilan
+        }
+        (0x19, 0x20..=0x2F) => {
+            // These are client Zen 3 parts aka Vermeer. Feature-wise, they are
+            // missing INVLPGB from Milan, but are otherwise close, and we don't
+            // expose INVLPGB to guests currently anyway.
+            SledCpuFamily::AmdMilan
+        }
+        (0x19, 0x60..=0x6F) => {
+            // These are client Zen 4 parts aka Raphael. Similar to the above
+            // with Genoa and Vermeer, round these down to Milan in support of
+            // lab clusters instead of calling them unknown.
+            SledCpuFamily::AmdMilan
+        }
+        (0x1A, 0x00..=0x0F) => SledCpuFamily::AmdTurin,
+        (0x1A, 0x10..=0x1F) => {
+            // These are Turin Dense. From a CPU feature perspective they're
+            // equivalently capable to Turin, but they are physically distinct
+            // and sled operators should be able to see that.
+            SledCpuFamily::AmdTurinDense
+        }
+        (0x1A, 0x40..=0x4F) => {
+            // These are client Zen 5 parts aka Granite Ridge. Won't be in a
+            // rack, but plausibly in a lab cluster. Like other non-server
+            // parts, these don't have INVLPGB, which we don't expose to guests.
+            // They should otherwise be a sufficient stand-in for Turin.
+            SledCpuFamily::AmdTurin
+        }
+        // Remaining family/model ranges in known families are likely mobile
+        // parts and intentionally rolled up into "Unknown." There, it's harder
+        // to predict what features out of the corresponding CPU platform would
+        // actually be present. It's also less likely that someone has a laptop
+        // or APU as part of a development cluster!
+        //
+        // Other families are, of course, unknown.
+        _ => SledCpuFamily::Unknown,
+    }
+}
diff --git a/sled-hardware/src/non_illumos/mod.rs b/sled-hardware/src/non_illumos/mod.rs
index c54afe87301..fa660ad0caa 100644
--- a/sled-hardware/src/non_illumos/mod.rs
+++ b/sled-hardware/src/non_illumos/mod.rs
@@ -6,7 +6,7 @@ use crate::SledMode;
 use crate::disk::{DiskPaths, Partition, PooledDiskError, UnparsedDisk};
 use omicron_common::disk::{DiskIdentity, DiskVariant};
 use omicron_uuid_kinds::ZpoolUuid;
-use sled_hardware_types::Baseboard;
+use sled_hardware_types::{Baseboard, SledCpuFamily};
 use slog::Logger;
 use std::collections::HashMap;
 use tokio::sync::broadcast;
@@ -41,6 +41,10 @@ impl HardwareManager {
         unimplemented!("Accessing hardware unsupported on non-illumos");
     }
 
+    pub fn cpu_family(&self) -> SledCpuFamily {
+        unimplemented!("Accessing hardware unsupported on non-illumos");
+    }
+
     pub fn online_processor_count(&self) -> u32 {
         unimplemented!("Accessing hardware unsupported on non-illumos");
     }
diff --git a/sled-hardware/types/src/lib.rs b/sled-hardware/types/src/lib.rs
index b34b5b1f422..ce4a29da4c0 100644
--- a/sled-hardware/types/src/lib.rs
+++ b/sled-hardware/types/src/lib.rs
@@ -95,3 +95,53 @@ impl std::fmt::Display for Baseboard {
         }
     }
 }
+
+/// Identifies the kind of CPU present on a sled, determined by reading CPUID.
+///
+/// This is intended to broadly support the control plane answering the question
+/// "can I run this instance on that sled?" given an instance with either no or
+/// some CPU platform requirement. It is not enough information for more precise
+/// placement questions - for example, is a CPU a high-frequency part or
+/// many-core part? We don't include Genoa here, but in that CPU family there
+/// are high frequency parts, many-core parts, and large-cache parts. To support
+/// those questions (or satisfactorily answer #8730) we would need to collect
+/// additional information and send it along.
+#[derive(
+    Serialize, Deserialize, Copy, Clone, Debug, PartialEq, Eq, JsonSchema,
+)]
+#[serde(rename_all = "snake_case")]
+pub enum SledCpuFamily {
+    /// The CPU vendor or its family number don't correspond to any of the
+    /// known family variants.
+    Unknown,
+
+    /// AMD Milan processors (or very close). Could be an actual Milan in a
+    /// Gimlet, a close-to-Milan client Zen 3 part, or Zen 4 (for which Milan is
+    /// the greatest common denominator).
+    AmdMilan,
+
+    /// AMD Turin processors (or very close). Could be an actual Turin in a
+    /// Cosmo, or a close-to-Turin client Zen 5 part.
+    AmdTurin,
+
+    /// AMD Turin Dense processors. There are no "Turin Dense-like" CPUs unlike
+    /// other cases, so this means a bona fide Zen 5c Turin Dense part.
+    AmdTurinDense,
+}
+
+impl SledCpuFamily {
+    /// Returns the lowercase snake_case name of this CPU family.
+    ///
+    /// The strings are spelled the same way as the `snake_case` names serde
+    /// derives for the variants, so the `Display` output and the serialized
+    /// form read identically.
+    fn as_str(&self) -> &'static str {
+        match self {
+            SledCpuFamily::Unknown => "unknown",
+            SledCpuFamily::AmdMilan => "amd_milan",
+            SledCpuFamily::AmdTurin => "amd_turin",
+            SledCpuFamily::AmdTurinDense => "amd_turin_dense",
+        }
+    }
+}
+
+impl std::fmt::Display for SledCpuFamily {
+    /// Writes the snake_case family name, honoring any width, fill, and
+    /// alignment flags given by the caller (e.g. `{:<16}` in tabular output).
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // `pad` rather than `write!(f, "{}", ..)`: the latter starts a fresh
+        // format spec and silently drops the caller's padding options. With a
+        // plain `{}` the output is identical.
+        f.pad(self.as_str())
+    }
+}