From 56500a4f71ea95b37209c9d6f9ac0d00b46fe592 Mon Sep 17 00:00:00 2001 From: Fabien Dupont Date: Thu, 11 Sep 2025 13:35:19 +0200 Subject: [PATCH 1/5] test: Add comprehensive tests for existing DGX A100 mock Establish baseline tests for the original A100 implementation before refactoring to shared architecture. This enables proper TDD approach where we can detect regressions during refactoring. Tests cover: - Server creation and basic properties - Device handling and indexing (8 devices) - Device properties (name, architecture, memory, etc.) - Device access by UUID and PCI bus ID - MIG mode operations - MIG profile configurations and access - GPU instance lifecycle and placements - Compute instance lifecycle - Init/shutdown behavior - Multiple device uniqueness - A100-specific characteristics All tests pass with the existing implementation at fd3e42f. Signed-off-by: Fabien Dupont --- pkg/nvml/mock/dgxa100/dgxa100_test.go | 527 ++++++++++++++++++++++++++ 1 file changed, 527 insertions(+) create mode 100644 pkg/nvml/mock/dgxa100/dgxa100_test.go diff --git a/pkg/nvml/mock/dgxa100/dgxa100_test.go b/pkg/nvml/mock/dgxa100/dgxa100_test.go new file mode 100644 index 0000000..4324f7c --- /dev/null +++ b/pkg/nvml/mock/dgxa100/dgxa100_test.go @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package dgxa100 + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// TestServerCreation verifies server creation and basic properties +func TestServerCreation(t *testing.T) { + server := New() + require.NotNil(t, server) + + // Test interface compliance + require.Implements(t, (*nvml.Interface)(nil), server) + require.Implements(t, (*nvml.ExtendedInterface)(nil), server) + + // Test device count + count, ret := server.DeviceGetCount() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 8, count) + + // Test system information + driver, ret := server.SystemGetDriverVersion() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "550.54.15", driver) + + nvmlVer, ret := server.SystemGetNVMLVersion() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "12.550.54.15", nvmlVer) + + cudaVer, ret := server.SystemGetCudaDriverVersion() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 12040, cudaVer) +} + +// TestDeviceHandling verifies device access and indexing +func TestDeviceHandling(t *testing.T) { + server := New() + + // Test valid device indices + for i := 0; i < 8; i++ { + device, ret := server.DeviceGetHandleByIndex(i) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, device) + + // Test device index + index, ret := device.GetIndex() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, i, index) + + // Test minor number + minor, ret := device.GetMinorNumber() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, i, minor) + } + + // Test invalid device indices + _, ret := server.DeviceGetHandleByIndex(-1) + require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) + + _, ret = server.DeviceGetHandleByIndex(8) + require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) +} + +// TestDeviceProperties verifies all device properties +func TestDeviceProperties(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, 
nvml.SUCCESS, ret) + require.NotNil(t, device) + + // Test device name + name, ret := device.GetName() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "Mock NVIDIA A100-SXM4-40GB", name) + + // Test architecture + arch, ret := device.GetArchitecture() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.DeviceArchitecture(nvml.DEVICE_ARCH_AMPERE), arch) + + // Test brand + brand, ret := device.GetBrand() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.BRAND_NVIDIA, brand) + + // Test CUDA compute capability + major, minor, ret := device.GetCudaComputeCapability() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 8, major) + require.Equal(t, 0, minor) + + // Test memory info (40GB) + memory, ret := device.GetMemoryInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint64(42949672960), memory.Total) + + // Test PCI device ID + pciInfo, ret := device.GetPciInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0x20B010DE), pciInfo.PciDeviceId) + + // Test UUID is set + uuid, ret := device.GetUUID() + require.Equal(t, nvml.SUCCESS, ret) + require.NotEmpty(t, uuid) + require.Contains(t, uuid, "GPU-") +} + +// TestDeviceAccessByUUID verifies UUID-based device access +func TestDeviceAccessByUUID(t *testing.T) { + server := New() + + // Get device by index and its UUID + originalDevice, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + uuid, ret := originalDevice.GetUUID() + require.Equal(t, nvml.SUCCESS, ret) + require.NotEmpty(t, uuid) + + // Get device by UUID + deviceByUUID, ret := server.DeviceGetHandleByUUID(uuid) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, originalDevice, deviceByUUID) + + // Test invalid UUID + _, ret = server.DeviceGetHandleByUUID("invalid-uuid") + require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) +} + +// TestDeviceAccessByPciBusId verifies PCI bus ID-based device access +func TestDeviceAccessByPciBusId(t *testing.T) { + server := 
New() + + // Test each device's PCI bus ID + for i := 0; i < 8; i++ { + originalDevice, ret := server.DeviceGetHandleByIndex(i) + require.Equal(t, nvml.SUCCESS, ret) + + expectedPciBusID := fmt.Sprintf("0000:%02x:00.0", i) + + // Get device by PCI bus ID + deviceByPci, ret := server.DeviceGetHandleByPciBusId(expectedPciBusID) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, originalDevice, deviceByPci) + } + + // Test invalid PCI bus ID + _, ret := server.DeviceGetHandleByPciBusId("invalid-pci-id") + require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) +} + +// TestMIGModeOperations verifies MIG mode handling +func TestMIGModeOperations(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Initially MIG should be disabled + current, pending, ret := device.GetMigMode() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 0, current) + require.Equal(t, 0, pending) + + // Enable MIG mode + currentRet, pendingRet := device.SetMigMode(1) + require.Equal(t, nvml.SUCCESS, currentRet) + require.Equal(t, nvml.SUCCESS, pendingRet) + + // Verify MIG is enabled + current, pending, ret = device.GetMigMode() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 1, current) + require.Equal(t, 1, pending) + + // Disable MIG mode + currentRet, pendingRet = device.SetMigMode(0) + require.Equal(t, nvml.SUCCESS, currentRet) + require.Equal(t, nvml.SUCCESS, pendingRet) + + // Verify MIG is disabled + current, pending, ret = device.GetMigMode() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 0, current) + require.Equal(t, 0, pending) +} + +// TestMIGProfilesExist verifies MIG profile configuration exists +func TestMIGProfilesExist(t *testing.T) { + // Test that MIGProfiles variable is accessible + require.NotNil(t, MIGProfiles) + require.NotNil(t, MIGProfiles.GpuInstanceProfiles) + require.NotNil(t, MIGProfiles.ComputeInstanceProfiles) + + // Test that MIGPlacements variable is accessible + 
require.NotNil(t, MIGPlacements) + require.NotNil(t, MIGPlacements.GpuInstancePossiblePlacements) + require.NotNil(t, MIGPlacements.ComputeInstancePossiblePlacements) + + // Test expected profile types exist + expectedProfiles := []int{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + nvml.GPU_INSTANCE_PROFILE_2_SLICE, + nvml.GPU_INSTANCE_PROFILE_3_SLICE, + nvml.GPU_INSTANCE_PROFILE_4_SLICE, + nvml.GPU_INSTANCE_PROFILE_7_SLICE, + } + + for _, profileId := range expectedProfiles { + profile, exists := MIGProfiles.GpuInstanceProfiles[profileId] + require.True(t, exists, "Profile %d should exist", profileId) + require.Equal(t, uint32(profileId), profile.Id) + require.Greater(t, profile.MemorySizeMB, uint64(0)) + } +} + +// TestGpuInstanceProfileInfo verifies GPU instance profile access +func TestGpuInstanceProfileInfo(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Test valid profile access + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(nvml.GPU_INSTANCE_PROFILE_1_SLICE), profileInfo.Id) + require.Equal(t, uint32(1), profileInfo.SliceCount) + require.Equal(t, uint64(4864), profileInfo.MemorySizeMB) // 1g.5gb + + // Test 7-slice profile + profileInfo7, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_7_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(nvml.GPU_INSTANCE_PROFILE_7_SLICE), profileInfo7.Id) + require.Equal(t, uint32(7), profileInfo7.SliceCount) + require.Equal(t, uint64(40192), profileInfo7.MemorySizeMB) // 7g.40gb + + // Test invalid profile + _, ret = device.GetGpuInstanceProfileInfo(-1) + require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) + + // Test unsupported profile (use a valid range but unsupported profile) + _, ret = device.GetGpuInstanceProfileInfo(5) 
// Valid range but not in MIGProfiles + require.Equal(t, nvml.ERROR_NOT_SUPPORTED, ret) +} + +// TestGpuInstancePlacements verifies GPU instance placement access +func TestGpuInstancePlacements(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Test 1-slice placements + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + placements, ret := device.GetGpuInstancePossiblePlacements(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.Len(t, placements, 7) // Should have 7 possible placements for 1-slice + + // Test 7-slice placements + profileInfo7, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_7_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + placements7, ret := device.GetGpuInstancePossiblePlacements(&profileInfo7) + require.Equal(t, nvml.SUCCESS, ret) + require.Len(t, placements7, 1) // Should have 1 placement for 7-slice (full GPU) + require.Equal(t, uint32(0), placements7[0].Start) + require.Equal(t, uint32(8), placements7[0].Size) +} + +// TestGpuInstanceLifecycle verifies complete GPU instance lifecycle +func TestGpuInstanceLifecycle(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Get 1-slice profile + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + // Create GPU instance + gi, ret := device.CreateGpuInstance(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gi) + + // Test GPU instance info + giInfo, ret := gi.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, device, giInfo.Device) + require.Equal(t, profileInfo.Id, giInfo.ProfileId) + require.Equal(t, uint32(0), giInfo.Id) // First instance should have ID 0 + + // Get GPU instances for this profile + instances, ret := 
device.GetGpuInstances(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.Len(t, instances, 1) + require.Equal(t, gi, instances[0]) + + // Destroy GPU instance + ret = gi.Destroy() + require.Equal(t, nvml.SUCCESS, ret) + + // Verify instance is removed + instances, ret = device.GetGpuInstances(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.Len(t, instances, 0) +} + +// TestGpuInstanceWithPlacement verifies GPU instance creation with placement +func TestGpuInstanceWithPlacement(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Get profile and placement + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + placements, ret := device.GetGpuInstancePossiblePlacements(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotEmpty(t, placements) + + // Create GPU instance with specific placement + gi, ret := device.CreateGpuInstanceWithPlacement(&profileInfo, &placements[0]) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gi) + + // Verify placement in instance info + giInfo, ret := gi.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, placements[0], giInfo.Placement) + + // Clean up + ret = gi.Destroy() + require.Equal(t, nvml.SUCCESS, ret) +} + +// TestComputeInstanceLifecycle verifies complete compute instance lifecycle +func TestComputeInstanceLifecycle(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Create GPU instance first + giProfileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + gi, ret := device.CreateGpuInstance(&giProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gi) + + // Get compute instance profile + ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo( + 
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED, + ) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE), ciProfileInfo.Id) + + // Test invalid engine profile + _, ret = gi.GetComputeInstanceProfileInfo( + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + 999, // Invalid engine profile + ) + require.Equal(t, nvml.ERROR_NOT_SUPPORTED, ret) + + // Get compute instance placements + _, ret = gi.GetComputeInstancePossiblePlacements(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + // Note: Original implementation has empty placements (TODO comment) + + // Create compute instance + ci, ret := gi.CreateComputeInstance(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, ci) + + // Test compute instance info + ciInfo, ret := ci.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, device, ciInfo.Device) + require.Equal(t, gi, ciInfo.GpuInstance) + require.Equal(t, ciProfileInfo.Id, ciInfo.ProfileId) + require.Equal(t, uint32(0), ciInfo.Id) // First instance should have ID 0 + + // Get compute instances for this profile + instances, ret := gi.GetComputeInstances(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.Len(t, instances, 1) + require.Equal(t, ci, instances[0]) + + // Destroy compute instance + ret = ci.Destroy() + require.Equal(t, nvml.SUCCESS, ret) + + // Verify compute instance is removed + instances, ret = gi.GetComputeInstances(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.Len(t, instances, 0) + + // Destroy GPU instance + ret = gi.Destroy() + require.Equal(t, nvml.SUCCESS, ret) +} + +// TestInitShutdownLifecycle verifies init/shutdown behavior +func TestInitShutdownLifecycle(t *testing.T) { + server := New() + + // Test init + ret := server.Init() + require.Equal(t, nvml.SUCCESS, ret) + + // Test lookup symbol + err := server.LookupSymbol("nvmlInit") + require.NoError(t, err) + + // Test extensions + ext := 
server.Extensions() + require.NotNil(t, ext) + require.Equal(t, server, ext) + + // Test shutdown + ret = server.Shutdown() + require.Equal(t, nvml.SUCCESS, ret) +} + +// TestMultipleDevices verifies all devices are unique and correctly indexed +func TestMultipleDevices(t *testing.T) { + server := New() + + devices := make([]nvml.Device, 8) + uuids := make(map[string]bool) + + // Get all devices and verify uniqueness + for i := 0; i < 8; i++ { + device, ret := server.DeviceGetHandleByIndex(i) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, device) + + devices[i] = device + + // Verify UUID is unique + uuid, ret := device.GetUUID() + require.Equal(t, nvml.SUCCESS, ret) + require.NotEmpty(t, uuid) + require.False(t, uuids[uuid], "UUID %s should be unique", uuid) + uuids[uuid] = true + + // Verify device properties are consistent + index, ret := device.GetIndex() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, i, index) + + minor, ret := device.GetMinorNumber() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, i, minor) + + name, ret := device.GetName() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "Mock NVIDIA A100-SXM4-40GB", name) + } + + // Verify all devices are distinct objects + for i := 0; i < 8; i++ { + for j := i + 1; j < 8; j++ { + require.NotEqual(t, devices[i], devices[j], "Devices %d and %d should be different objects", i, j) + } + } +} + +// TestA100SpecificCharacteristics tests A100-specific values +func TestA100SpecificCharacteristics(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Test A100 doesn't support P2P in MIG (IsP2pSupported should be 0) + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0), profileInfo.IsP2pSupported) + + // Test A100 memory values are correct + profile1 := 
MIGProfiles.GpuInstanceProfiles[nvml.GPU_INSTANCE_PROFILE_1_SLICE] + require.Equal(t, uint64(4864), profile1.MemorySizeMB) // 1g.5gb + + profile7 := MIGProfiles.GpuInstanceProfiles[nvml.GPU_INSTANCE_PROFILE_7_SLICE] + require.Equal(t, uint64(40192), profile7.MemorySizeMB) // 7g.40gb + + // Test A100 architecture + arch, ret := device.GetArchitecture() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.DeviceArchitecture(nvml.DEVICE_ARCH_AMPERE), arch) + + // Test A100 CUDA compute capability + major, minor, ret := device.GetCudaComputeCapability() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 8, major) // Ampere + require.Equal(t, 0, minor) + + // Test A100 PCI device ID + pciInfo, ret := device.GetPciInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0x20B010DE), pciInfo.PciDeviceId) // A100-SXM4-40GB +} \ No newline at end of file From d4b327a545487c71fc01f80cf8b23b630f3af14f Mon Sep 17 00:00:00 2001 From: Fabien Dupont Date: Mon, 22 Sep 2025 16:48:58 +0200 Subject: [PATCH 2/5] feat: Implement shared factory system and add A30 GPU configurations - Implement shared factory system in pkg/nvml/mock/shared/ to eliminate code duplication - Add comprehensive A30 GPU configurations with MIG profiles (56 SMs, 1/2/4-slice support) - Refactor dgxa100 to use shared factory while maintaining backward compatibility - Create modular GPU configurations in shared/gpus/ for A100 and A30 families - Add comprehensive documentation covering architecture and usage examples - Maintain thread safety and proper NVML return codes - Support all A100 variants (SXM4 40GB/80GB, PCIe 40GB/80GB) and A30 PCIe 24GB Signed-off-by: Fabien Dupont --- pkg/nvml/mock/README.md | 270 +++++++++++++++ pkg/nvml/mock/dgxa100/dgxa100.go | 384 +++------------------ pkg/nvml/mock/dgxa100/dgxa100_test.go | 166 ++++----- pkg/nvml/mock/dgxa100/mig-profile.go | 471 -------------------------- pkg/nvml/mock/shared/gpus/a100.go | 456 +++++++++++++++++++++++++ 
pkg/nvml/mock/shared/gpus/a30.go | 250 ++++++++++++++ pkg/nvml/mock/shared/shared.go | 426 +++++++++++++++++++++++ 7 files changed, 1526 insertions(+), 897 deletions(-) create mode 100644 pkg/nvml/mock/README.md delete mode 100644 pkg/nvml/mock/dgxa100/mig-profile.go create mode 100644 pkg/nvml/mock/shared/gpus/a100.go create mode 100644 pkg/nvml/mock/shared/gpus/a30.go create mode 100644 pkg/nvml/mock/shared/shared.go diff --git a/pkg/nvml/mock/README.md b/pkg/nvml/mock/README.md new file mode 100644 index 0000000..46ad4f7 --- /dev/null +++ b/pkg/nvml/mock/README.md @@ -0,0 +1,270 @@ +# NVML Mock Framework + +This package provides mock implementations of NVIDIA's NVML (NVIDIA Management Library) for testing and development purposes. The framework uses a shared factory system to define GPU configurations that can be easily extended and customized. + +## Architecture + +``` +pkg/nvml/mock/ +├── shared/ +│ ├── shared.go # Core shared factory and types +│ └── gpus/ # GPU configuration definitions +│ ├── a100.go # A100 GPU variants +│ └── a30.go # A30 GPU variants +└── dgxa100/ # DGX A100 implementation + ├── dgxa100.go # Server and device implementation + ├── gpus.go # Legacy A100 configurations and MIG profiles + └── dgxa100_test.go # Comprehensive tests +``` + +## Core Concepts + +### Shared Factory (`shared.Config`) +Define the characteristics of individual GPU models including: +- Device properties (name, architecture, brand, PCI device ID) +- Compute capabilities (CUDA version, compute capability) +- Memory configuration +- MIG (Multi-Instance GPU) profiles and placements + +### Server Configuration (`shared.ServerConfig`) +Define complete system configurations including: +- GPU configuration and count +- Driver, NVML, and CUDA versions + +### MIG Profile Configuration (`shared.MIGProfileConfig`) +Define Multi-Instance GPU capabilities including: +- GPU instance profiles (slice configurations) +- Compute instance profiles +- Placement constraints and 
possibilities + +## Usage Examples + +### Basic Usage + +```go +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus" +) + +// Create default A100 system +serverA100 := dgxa100.New() // A100-SXM4-40GB (8 GPUs) + +// Create specific A100 variants +serverA100_80GB := dgxa100.NewServerWithGPU(gpus.A100_SXM4_80GB) +serverA100_PCIE := dgxa100.NewServerWithGPU(gpus.A100_PCIE_40GB) +``` + +### Device Creation + +```go +// Create device with default configuration +device := dgxa100.NewDevice(0) + +// Create device with specific GPU variant +deviceA100_80GB := dgxa100.NewDeviceWithGPU(gpus.A100_SXM4_80GB, 0) +deviceA100_PCIE := dgxa100.NewDeviceWithGPU(gpus.A100_PCIE_40GB, 1) +``` + +### Accessing GPU Configurations + +```go +// Available GPU configurations +// A100 Family +gpus.A100_SXM4_40GB // A100 SXM4 40GB +gpus.A100_SXM4_80GB // A100 SXM4 80GB +gpus.A100_PCIE_40GB // A100 PCIe 40GB +gpus.A100_PCIE_80GB // A100 PCIe 80GB + +// A30 Family +gpus.A30_PCIE_24GB // A30 PCIe 24GB + +// Inspect configurations +fmt.Printf("GPU: %s\n", gpus.A100_SXM4_80GB.Name) +fmt.Printf("Memory: %d MB\n", gpus.A100_SXM4_80GB.MemoryMB) +fmt.Printf("Architecture: %v\n", gpus.A100_SXM4_80GB.Architecture) +fmt.Printf("PCI Device ID: 0x%X\n", gpus.A100_SXM4_80GB.PciDeviceId) +``` + +## Available GPU Models + +### A100 Family (Ampere Architecture, 108 SMs) + +- **A100 SXM4 40GB** (`gpus.A100_SXM4_40GB`) + - Form factor: SXM4 + - Memory: 40GB HBM2 + - PCI Device ID: 0x20B010DE + - CUDA Capability: 8.0 + - SMs per slice: 14 (1-slice), 28 (2-slice), 42 (3-slice), 56 (4-slice), 98 (7-slice) + - MIG P2P: Not supported (`IsP2pSupported: 0`) + +- **A100 SXM4 80GB** (`gpus.A100_SXM4_80GB`) + - Form factor: SXM4 + - Memory: 80GB HBM2e + - PCI Device ID: 0x20B210DE + - CUDA Capability: 8.0 + +- **A100 PCIe 40GB** (`gpus.A100_PCIE_40GB`) + - Form factor: PCIe + - Memory: 40GB HBM2 + - PCI Device ID: 0x20F110DE + - CUDA Capability: 8.0 + +- **A100 
PCIe 80GB** (`gpus.A100_PCIE_80GB`) + - Form factor: PCIe + - Memory: 80GB HBM2e + - PCI Device ID: 0x20B510DE + - CUDA Capability: 8.0 + +### A30 Family (Ampere Architecture, 56 SMs) + +- **A30 PCIe 24GB** (`gpus.A30_PCIE_24GB`) + - Form factor: PCIe + - Memory: 24GB HBM2 + - PCI Device ID: 0x20B710DE + - CUDA Capability: 8.0 + - SMs per slice: 14 (1-slice), 28 (2-slice), 56 (4-slice) + - MIG P2P: Not supported (`IsP2pSupported: 0`) + - MIG slices: 1, 2, 4 (no 3-slice or 7-slice support) + +## Available Server Models + +### DGX A100 Family + +- **DGX A100 40GB** (default) + - 8x A100 SXM4 40GB GPUs + - Driver: 550.54.15 + - NVML: 12.550.54.15 + - CUDA: 12040 + +## MIG (Multi-Instance GPU) Support + +All GPU configurations include comprehensive MIG profile definitions: + +- **A100**: No P2P support in MIG (`IsP2pSupported: 0`) + - Memory profiles differ between 40GB and 80GB variants + - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices) +- **A30**: No P2P support in MIG (`IsP2pSupported: 0`) + - Supports limited MIG slice configurations (1, 2, 4 slices only) + - 56 SMs total with 14 SMs per slice + +### MIG Operations + +```go +// Create server with MIG support +server := dgxa100.New() +device, _ := server.DeviceGetHandleByIndex(0) + +// Enable MIG mode +device.SetMigMode(1) + +// Get available GPU instance profiles +profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + +// Create GPU instance +gi, ret := device.CreateGpuInstance(&profileInfo) + +// Create compute instance within GPU instance +ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo( + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED, +) +ci, ret := gi.CreateComputeInstance(&ciProfileInfo) +``` + +## Testing + +The framework includes comprehensive tests covering: +- Server creation and device enumeration +- Device properties and capabilities +- MIG mode operations and lifecycle +- GPU and compute instance 
management +- Memory and PCI information +- Multi-device scenarios + +```bash +# Run all mock tests +go test ./pkg/nvml/mock/... + +# Run A100 specific tests +go test -v ./pkg/nvml/mock/dgxa100/ + +# Run specific test +go test -v ./pkg/nvml/mock/dgxa100/ -run TestMIGProfilesExist +``` + +## Extending the Framework + +### Adding GPU Variants + +Add new configurations to the appropriate file in `shared/gpus/`: +```go +var A100_PCIE_24GB = shared.Config{ + Name: "NVIDIA A100-PCIE-24GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 24576, // 24GB + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20F010DE, + MIGProfiles: a100_24gb_MIGProfiles, +} +``` + +### Adding GPU Generations + +1. **Create new package** (e.g., `dgxa100/`) +2. **Define GPU configurations** in `shared/gpus/a100.go` +3. **Define MIG profiles** with appropriate memory and SM allocations +4. **Implement server and device factory functions** +5. **Add comprehensive tests** + +Example structure for A100 generation: +```go +// In shared/gpus/a100.go +var A100_SXM4_80GB = shared.Config{ + Name: "NVIDIA A100-SXM4-80GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 81920, + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20B210DE, + MIGProfiles: a100MIGProfiles, +} + +// In dgxa100/dgxa100.go +func New() *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpus.A100_SXM4_80GB, + GPUCount: 4, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, + }) +} +``` + +## Backward Compatibility + +The framework maintains full backward compatibility: +- All existing `dgxa100.New()` calls continue to work unchanged +- Legacy global variables (`MIGProfiles`, `MIGPlacements`) are preserved +- Device names maintain "Mock" prefix for test compatibility +- All existing tests pass without modification +- A100 configurations now reference `shared/gpus` package + +## Performance Considerations + +- 
Configurations are defined as static variables (no runtime overhead) +- Device creation uses shared factory (fast) +- MIG profiles are shared between devices of the same type +- Mock functions use direct field access (minimal latency) + +## Implementation Notes + +- **Thread Safety**: Device implementations include proper mutex usage +- **Memory Management**: No memory leaks in device/instance lifecycle +- **Error Handling**: Proper NVML return codes for all operations +- **Standards Compliance**: Follows official NVML API patterns and behaviors +- **Separation of Concerns**: GPU configs in `shared/gpus`, server logic in package-specific files diff --git a/pkg/nvml/mock/dgxa100/dgxa100.go b/pkg/nvml/mock/dgxa100/dgxa100.go index af65037..fc2384d 100644 --- a/pkg/nvml/mock/dgxa100/dgxa100.go +++ b/pkg/nvml/mock/dgxa100/dgxa100.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,365 +17,63 @@ package dgxa100 import ( - "fmt" - "sync" - - "github.com/google/uuid" - "github.com/NVIDIA/go-nvml/pkg/nvml" - "github.com/NVIDIA/go-nvml/pkg/nvml/mock" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus" ) -type Server struct { - mock.Interface - mock.ExtendedInterface - Devices [8]nvml.Device - DriverVersion string - NvmlVersion string - CudaDriverVersion int -} -type Device struct { - mock.Device - sync.RWMutex - UUID string - Name string - Brand nvml.BrandType - Architecture nvml.DeviceArchitecture - PciBusID string - Minor int - Index int - CudaComputeCapability CudaComputeCapability - MigMode int - GpuInstances map[*GpuInstance]struct{} - GpuInstanceCounter uint32 - MemoryInfo nvml.Memory -} - -type GpuInstance struct { - mock.GpuInstance - sync.RWMutex - Info nvml.GpuInstanceInfo - ComputeInstances map[*ComputeInstance]struct{} - ComputeInstanceCounter uint32 -} - -type ComputeInstance struct { - mock.ComputeInstance - Info nvml.ComputeInstanceInfo -} - -type CudaComputeCapability struct { - Major int - Minor int -} - -var _ nvml.Interface = (*Server)(nil) -var _ nvml.Device = (*Device)(nil) -var _ nvml.GpuInstance = (*GpuInstance)(nil) -var _ nvml.ComputeInstance = (*ComputeInstance)(nil) +// Backwards compatible type aliases +type Server = shared.Server +type Device = shared.Device +type GpuInstance = shared.GpuInstance +type ComputeInstance = shared.ComputeInstance +type CudaComputeCapability = shared.CudaComputeCapability func New() *Server { - server := &Server{ - Devices: [8]nvml.Device{ - NewDevice(0), - NewDevice(1), - NewDevice(2), - NewDevice(3), - NewDevice(4), - NewDevice(5), - NewDevice(6), - NewDevice(7), - }, + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpus.A100_SXM4_40GB, + GPUCount: 8, DriverVersion: "550.54.15", NvmlVersion: "12.550.54.15", CudaDriverVersion: 12040, - } - server.setMockFuncs() - return server + }) } func NewDevice(index int) 
*Device { - device := &Device{ - UUID: "GPU-" + uuid.New().String(), - Name: "Mock NVIDIA A100-SXM4-40GB", - Brand: nvml.BRAND_NVIDIA, - Architecture: nvml.DEVICE_ARCH_AMPERE, - PciBusID: fmt.Sprintf("0000:%02x:00.0", index), - Minor: index, - Index: index, - CudaComputeCapability: CudaComputeCapability{ - Major: 8, - Minor: 0, - }, - GpuInstances: make(map[*GpuInstance]struct{}), - GpuInstanceCounter: 0, - MemoryInfo: nvml.Memory{Total: 42949672960, Free: 0, Used: 0}, - } - device.setMockFuncs() - return device -} - -func NewGpuInstance(info nvml.GpuInstanceInfo) *GpuInstance { - gi := &GpuInstance{ - Info: info, - ComputeInstances: make(map[*ComputeInstance]struct{}), - ComputeInstanceCounter: 0, - } - gi.setMockFuncs() - return gi -} - -func NewComputeInstance(info nvml.ComputeInstanceInfo) *ComputeInstance { - ci := &ComputeInstance{ - Info: info, - } - ci.setMockFuncs() - return ci -} - -func (s *Server) setMockFuncs() { - s.ExtensionsFunc = func() nvml.ExtendedInterface { - return s - } - - s.LookupSymbolFunc = func(symbol string) error { - return nil - } - - s.InitFunc = func() nvml.Return { - return nvml.SUCCESS - } - - s.ShutdownFunc = func() nvml.Return { - return nvml.SUCCESS - } - - s.SystemGetDriverVersionFunc = func() (string, nvml.Return) { - return s.DriverVersion, nvml.SUCCESS - } - - s.SystemGetNVMLVersionFunc = func() (string, nvml.Return) { - return s.NvmlVersion, nvml.SUCCESS - } - - s.SystemGetCudaDriverVersionFunc = func() (int, nvml.Return) { - return s.CudaDriverVersion, nvml.SUCCESS - } - - s.DeviceGetCountFunc = func() (int, nvml.Return) { - return len(s.Devices), nvml.SUCCESS - } - - s.DeviceGetHandleByIndexFunc = func(index int) (nvml.Device, nvml.Return) { - if index < 0 || index >= len(s.Devices) { - return nil, nvml.ERROR_INVALID_ARGUMENT - } - return s.Devices[index], nvml.SUCCESS - } - - s.DeviceGetHandleByUUIDFunc = func(uuid string) (nvml.Device, nvml.Return) { - for _, d := range s.Devices { - if uuid == d.(*Device).UUID { - 
return d, nvml.SUCCESS - } - } - return nil, nvml.ERROR_INVALID_ARGUMENT - } - - s.DeviceGetHandleByPciBusIdFunc = func(busID string) (nvml.Device, nvml.Return) { - for _, d := range s.Devices { - if busID == d.(*Device).PciBusID { - return d, nvml.SUCCESS - } - } - return nil, nvml.ERROR_INVALID_ARGUMENT - } + return shared.NewDeviceFromConfig(gpus.A100_SXM4_40GB, index) } -func (d *Device) setMockFuncs() { - d.GetMinorNumberFunc = func() (int, nvml.Return) { - return d.Minor, nvml.SUCCESS - } - - d.GetIndexFunc = func() (int, nvml.Return) { - return d.Index, nvml.SUCCESS - } - - d.GetCudaComputeCapabilityFunc = func() (int, int, nvml.Return) { - return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS - } - - d.GetUUIDFunc = func() (string, nvml.Return) { - return d.UUID, nvml.SUCCESS - } - - d.GetNameFunc = func() (string, nvml.Return) { - return d.Name, nvml.SUCCESS - } - - d.GetBrandFunc = func() (nvml.BrandType, nvml.Return) { - return d.Brand, nvml.SUCCESS - } - - d.GetArchitectureFunc = func() (nvml.DeviceArchitecture, nvml.Return) { - return d.Architecture, nvml.SUCCESS - } - - d.GetMemoryInfoFunc = func() (nvml.Memory, nvml.Return) { - return d.MemoryInfo, nvml.SUCCESS - } - - d.GetPciInfoFunc = func() (nvml.PciInfo, nvml.Return) { - p := nvml.PciInfo{ - PciDeviceId: 0x20B010DE, - } - return p, nvml.SUCCESS - } - - d.SetMigModeFunc = func(mode int) (nvml.Return, nvml.Return) { - d.MigMode = mode - return nvml.SUCCESS, nvml.SUCCESS - } - - d.GetMigModeFunc = func() (int, int, nvml.Return) { - return d.MigMode, d.MigMode, nvml.SUCCESS - } - - d.GetGpuInstanceProfileInfoFunc = func(giProfileId int) (nvml.GpuInstanceProfileInfo, nvml.Return) { - if giProfileId < 0 || giProfileId >= nvml.GPU_INSTANCE_PROFILE_COUNT { - return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT - } - - if _, exists := MIGProfiles.GpuInstanceProfiles[giProfileId]; !exists { - return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } - - 
return MIGProfiles.GpuInstanceProfiles[giProfileId], nvml.SUCCESS - } - - d.GetGpuInstancePossiblePlacementsFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) { - return MIGPlacements.GpuInstancePossiblePlacements[int(info.Id)], nvml.SUCCESS - } - - d.CreateGpuInstanceFunc = func(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) { - d.Lock() - defer d.Unlock() - giInfo := nvml.GpuInstanceInfo{ - Device: d, - Id: d.GpuInstanceCounter, - ProfileId: info.Id, - } - d.GpuInstanceCounter++ - gi := NewGpuInstance(giInfo) - d.GpuInstances[gi] = struct{}{} - return gi, nvml.SUCCESS - } - - d.CreateGpuInstanceWithPlacementFunc = func(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) { - d.Lock() - defer d.Unlock() - giInfo := nvml.GpuInstanceInfo{ - Device: d, - Id: d.GpuInstanceCounter, - ProfileId: info.Id, - Placement: *placement, - } - d.GpuInstanceCounter++ - gi := NewGpuInstance(giInfo) - d.GpuInstances[gi] = struct{}{} - return gi, nvml.SUCCESS - } - - d.GetGpuInstancesFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) { - d.RLock() - defer d.RUnlock() - var gis []nvml.GpuInstance - for gi := range d.GpuInstances { - if gi.Info.ProfileId == info.Id { - gis = append(gis, gi) - } - } - return gis, nvml.SUCCESS - } +// NewServerWithGPU creates a new server with a specific A100 GPU variant +func NewServerWithGPU(gpuConfig shared.Config) *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpuConfig, + GPUCount: 8, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, + }) } -func (gi *GpuInstance) setMockFuncs() { - gi.GetInfoFunc = func() (nvml.GpuInstanceInfo, nvml.Return) { - return gi.Info, nvml.SUCCESS - } - - gi.GetComputeInstanceProfileInfoFunc = func(ciProfileId int, ciEngProfileId int) (nvml.ComputeInstanceProfileInfo, nvml.Return) { - if ciProfileId < 0 || 
ciProfileId >= nvml.COMPUTE_INSTANCE_PROFILE_COUNT { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT - } - - if ciEngProfileId != nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } - - giProfileId := int(gi.Info.ProfileId) - - if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId]; !exists { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } - - if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]; !exists { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } - - return MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], nvml.SUCCESS - } - - gi.GetComputeInstancePossiblePlacementsFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) { - return MIGPlacements.ComputeInstancePossiblePlacements[int(gi.Info.Id)][int(info.Id)], nvml.SUCCESS - } - - gi.CreateComputeInstanceFunc = func(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) { - gi.Lock() - defer gi.Unlock() - ciInfo := nvml.ComputeInstanceInfo{ - Device: gi.Info.Device, - GpuInstance: gi, - Id: gi.ComputeInstanceCounter, - ProfileId: info.Id, - } - gi.ComputeInstanceCounter++ - ci := NewComputeInstance(ciInfo) - gi.ComputeInstances[ci] = struct{}{} - return ci, nvml.SUCCESS - } - - gi.GetComputeInstancesFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) { - gi.RLock() - defer gi.RUnlock() - var cis []nvml.ComputeInstance - for ci := range gi.ComputeInstances { - if ci.Info.ProfileId == info.Id { - cis = append(cis, ci) - } - } - return cis, nvml.SUCCESS - } - - gi.DestroyFunc = func() nvml.Return { - d := gi.Info.Device.(*Device) - d.Lock() - defer d.Unlock() - delete(d.GpuInstances, gi) - return nvml.SUCCESS - } +// NewDeviceWithGPU creates a new device with a specific A100 GPU variant +func NewDeviceWithGPU(gpuConfig 
shared.Config, index int) *Device { + return shared.NewDeviceFromConfig(gpuConfig, index) } -func (ci *ComputeInstance) setMockFuncs() { - ci.GetInfoFunc = func() (nvml.ComputeInstanceInfo, nvml.Return) { - return ci.Info, nvml.SUCCESS +// Legacy globals for backward compatibility - expose the internal data +var ( + MIGProfiles = struct { + GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo + ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo + }{ + GpuInstanceProfiles: gpus.A100_SXM4_40GB.MIGProfiles.GpuInstanceProfiles, + ComputeInstanceProfiles: gpus.A100_SXM4_40GB.MIGProfiles.ComputeInstanceProfiles, } - ci.DestroyFunc = func() nvml.Return { - gi := ci.Info.GpuInstance.(*GpuInstance) - gi.Lock() - defer gi.Unlock() - delete(gi.ComputeInstances, ci) - return nvml.SUCCESS + MIGPlacements = struct { + GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement + }{ + GpuInstancePossiblePlacements: gpus.A100_SXM4_40GB.MIGProfiles.GpuInstancePlacements, + ComputeInstancePossiblePlacements: gpus.A100_SXM4_40GB.MIGProfiles.ComputeInstancePlacements, } -} +) diff --git a/pkg/nvml/mock/dgxa100/dgxa100_test.go b/pkg/nvml/mock/dgxa100/dgxa100_test.go index 4324f7c..3f852df 100644 --- a/pkg/nvml/mock/dgxa100/dgxa100_test.go +++ b/pkg/nvml/mock/dgxa100/dgxa100_test.go @@ -29,25 +29,25 @@ import ( func TestServerCreation(t *testing.T) { server := New() require.NotNil(t, server) - + // Test interface compliance require.Implements(t, (*nvml.Interface)(nil), server) require.Implements(t, (*nvml.ExtendedInterface)(nil), server) - + // Test device count count, ret := server.DeviceGetCount() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, 8, count) - + // Test system information driver, ret := server.SystemGetDriverVersion() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, "550.54.15", driver) - + nvmlVer, ret := server.SystemGetNVMLVersion() 
require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, "12.550.54.15", nvmlVer) - + cudaVer, ret := server.SystemGetCudaDriverVersion() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, 12040, cudaVer) @@ -56,28 +56,28 @@ func TestServerCreation(t *testing.T) { // TestDeviceHandling verifies device access and indexing func TestDeviceHandling(t *testing.T) { server := New() - + // Test valid device indices for i := 0; i < 8; i++ { device, ret := server.DeviceGetHandleByIndex(i) require.Equal(t, nvml.SUCCESS, ret) require.NotNil(t, device) - + // Test device index index, ret := device.GetIndex() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, i, index) - + // Test minor number minor, ret := device.GetMinorNumber() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, i, minor) } - + // Test invalid device indices _, ret := server.DeviceGetHandleByIndex(-1) require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) - + _, ret = server.DeviceGetHandleByIndex(8) require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) } @@ -88,38 +88,38 @@ func TestDeviceProperties(t *testing.T) { device, ret := server.DeviceGetHandleByIndex(0) require.Equal(t, nvml.SUCCESS, ret) require.NotNil(t, device) - + // Test device name name, ret := device.GetName() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, "Mock NVIDIA A100-SXM4-40GB", name) - + // Test architecture arch, ret := device.GetArchitecture() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, nvml.DeviceArchitecture(nvml.DEVICE_ARCH_AMPERE), arch) - + // Test brand brand, ret := device.GetBrand() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, nvml.BRAND_NVIDIA, brand) - + // Test CUDA compute capability major, minor, ret := device.GetCudaComputeCapability() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, 8, major) require.Equal(t, 0, minor) - + // Test memory info (40GB) memory, ret := device.GetMemoryInfo() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, uint64(42949672960), memory.Total) - + // Test PCI device 
ID pciInfo, ret := device.GetPciInfo() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, uint32(0x20B010DE), pciInfo.PciDeviceId) - + // Test UUID is set uuid, ret := device.GetUUID() require.Equal(t, nvml.SUCCESS, ret) @@ -130,20 +130,20 @@ func TestDeviceProperties(t *testing.T) { // TestDeviceAccessByUUID verifies UUID-based device access func TestDeviceAccessByUUID(t *testing.T) { server := New() - + // Get device by index and its UUID originalDevice, ret := server.DeviceGetHandleByIndex(0) require.Equal(t, nvml.SUCCESS, ret) - + uuid, ret := originalDevice.GetUUID() require.Equal(t, nvml.SUCCESS, ret) require.NotEmpty(t, uuid) - + // Get device by UUID deviceByUUID, ret := server.DeviceGetHandleByUUID(uuid) require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, originalDevice, deviceByUUID) - + // Test invalid UUID _, ret = server.DeviceGetHandleByUUID("invalid-uuid") require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) @@ -152,20 +152,20 @@ func TestDeviceAccessByUUID(t *testing.T) { // TestDeviceAccessByPciBusId verifies PCI bus ID-based device access func TestDeviceAccessByPciBusId(t *testing.T) { server := New() - + // Test each device's PCI bus ID for i := 0; i < 8; i++ { originalDevice, ret := server.DeviceGetHandleByIndex(i) require.Equal(t, nvml.SUCCESS, ret) - + expectedPciBusID := fmt.Sprintf("0000:%02x:00.0", i) - + // Get device by PCI bus ID deviceByPci, ret := server.DeviceGetHandleByPciBusId(expectedPciBusID) require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, originalDevice, deviceByPci) } - + // Test invalid PCI bus ID _, ret := server.DeviceGetHandleByPciBusId("invalid-pci-id") require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) @@ -176,29 +176,29 @@ func TestMIGModeOperations(t *testing.T) { server := New() device, ret := server.DeviceGetHandleByIndex(0) require.Equal(t, nvml.SUCCESS, ret) - + // Initially MIG should be disabled current, pending, ret := device.GetMigMode() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, 0, current) 
require.Equal(t, 0, pending) - + // Enable MIG mode currentRet, pendingRet := device.SetMigMode(1) require.Equal(t, nvml.SUCCESS, currentRet) require.Equal(t, nvml.SUCCESS, pendingRet) - + // Verify MIG is enabled current, pending, ret = device.GetMigMode() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, 1, current) require.Equal(t, 1, pending) - + // Disable MIG mode currentRet, pendingRet = device.SetMigMode(0) require.Equal(t, nvml.SUCCESS, currentRet) require.Equal(t, nvml.SUCCESS, pendingRet) - + // Verify MIG is disabled current, pending, ret = device.GetMigMode() require.Equal(t, nvml.SUCCESS, ret) @@ -212,12 +212,12 @@ func TestMIGProfilesExist(t *testing.T) { require.NotNil(t, MIGProfiles) require.NotNil(t, MIGProfiles.GpuInstanceProfiles) require.NotNil(t, MIGProfiles.ComputeInstanceProfiles) - + // Test that MIGPlacements variable is accessible require.NotNil(t, MIGPlacements) require.NotNil(t, MIGPlacements.GpuInstancePossiblePlacements) require.NotNil(t, MIGPlacements.ComputeInstancePossiblePlacements) - + // Test expected profile types exist expectedProfiles := []int{ nvml.GPU_INSTANCE_PROFILE_1_SLICE, @@ -228,7 +228,7 @@ func TestMIGProfilesExist(t *testing.T) { nvml.GPU_INSTANCE_PROFILE_4_SLICE, nvml.GPU_INSTANCE_PROFILE_7_SLICE, } - + for _, profileId := range expectedProfiles { profile, exists := MIGProfiles.GpuInstanceProfiles[profileId] require.True(t, exists, "Profile %d should exist", profileId) @@ -242,25 +242,25 @@ func TestGpuInstanceProfileInfo(t *testing.T) { server := New() device, ret := server.DeviceGetHandleByIndex(0) require.Equal(t, nvml.SUCCESS, ret) - + // Test valid profile access profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, uint32(nvml.GPU_INSTANCE_PROFILE_1_SLICE), profileInfo.Id) require.Equal(t, uint32(1), profileInfo.SliceCount) require.Equal(t, uint64(4864), profileInfo.MemorySizeMB) // 1g.5gb - - // Test 7-slice profile + + 
// Test 7-slice profile profileInfo7, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_7_SLICE) require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, uint32(nvml.GPU_INSTANCE_PROFILE_7_SLICE), profileInfo7.Id) require.Equal(t, uint32(7), profileInfo7.SliceCount) require.Equal(t, uint64(40192), profileInfo7.MemorySizeMB) // 7g.40gb - + // Test invalid profile _, ret = device.GetGpuInstanceProfileInfo(-1) require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) - + // Test unsupported profile (use a valid range but unsupported profile) _, ret = device.GetGpuInstanceProfileInfo(5) // Valid range but not in MIGProfiles require.Equal(t, nvml.ERROR_NOT_SUPPORTED, ret) @@ -271,19 +271,19 @@ func TestGpuInstancePlacements(t *testing.T) { server := New() device, ret := server.DeviceGetHandleByIndex(0) require.Equal(t, nvml.SUCCESS, ret) - + // Test 1-slice placements profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) require.Equal(t, nvml.SUCCESS, ret) - + placements, ret := device.GetGpuInstancePossiblePlacements(&profileInfo) require.Equal(t, nvml.SUCCESS, ret) require.Len(t, placements, 7) // Should have 7 possible placements for 1-slice - + // Test 7-slice placements profileInfo7, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_7_SLICE) require.Equal(t, nvml.SUCCESS, ret) - + placements7, ret := device.GetGpuInstancePossiblePlacements(&profileInfo7) require.Equal(t, nvml.SUCCESS, ret) require.Len(t, placements7, 1) // Should have 1 placement for 7-slice (full GPU) @@ -296,33 +296,33 @@ func TestGpuInstanceLifecycle(t *testing.T) { server := New() device, ret := server.DeviceGetHandleByIndex(0) require.Equal(t, nvml.SUCCESS, ret) - + // Get 1-slice profile profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) require.Equal(t, nvml.SUCCESS, ret) - + // Create GPU instance gi, ret := device.CreateGpuInstance(&profileInfo) require.Equal(t, nvml.SUCCESS, ret) require.NotNil(t, 
gi) - + // Test GPU instance info giInfo, ret := gi.GetInfo() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, device, giInfo.Device) require.Equal(t, profileInfo.Id, giInfo.ProfileId) require.Equal(t, uint32(0), giInfo.Id) // First instance should have ID 0 - + // Get GPU instances for this profile instances, ret := device.GetGpuInstances(&profileInfo) require.Equal(t, nvml.SUCCESS, ret) require.Len(t, instances, 1) require.Equal(t, gi, instances[0]) - + // Destroy GPU instance ret = gi.Destroy() require.Equal(t, nvml.SUCCESS, ret) - + // Verify instance is removed instances, ret = device.GetGpuInstances(&profileInfo) require.Equal(t, nvml.SUCCESS, ret) @@ -334,25 +334,25 @@ func TestGpuInstanceWithPlacement(t *testing.T) { server := New() device, ret := server.DeviceGetHandleByIndex(0) require.Equal(t, nvml.SUCCESS, ret) - + // Get profile and placement profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) require.Equal(t, nvml.SUCCESS, ret) - + placements, ret := device.GetGpuInstancePossiblePlacements(&profileInfo) require.Equal(t, nvml.SUCCESS, ret) require.NotEmpty(t, placements) - + // Create GPU instance with specific placement gi, ret := device.CreateGpuInstanceWithPlacement(&profileInfo, &placements[0]) require.Equal(t, nvml.SUCCESS, ret) require.NotNil(t, gi) - + // Verify placement in instance info giInfo, ret := gi.GetInfo() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, placements[0], giInfo.Placement) - + // Clean up ret = gi.Destroy() require.Equal(t, nvml.SUCCESS, ret) @@ -363,15 +363,15 @@ func TestComputeInstanceLifecycle(t *testing.T) { server := New() device, ret := server.DeviceGetHandleByIndex(0) require.Equal(t, nvml.SUCCESS, ret) - + // Create GPU instance first giProfileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) require.Equal(t, nvml.SUCCESS, ret) - + gi, ret := device.CreateGpuInstance(&giProfileInfo) require.Equal(t, nvml.SUCCESS, ret) require.NotNil(t, gi) 
- + // Get compute instance profile ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo( nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, @@ -379,24 +379,24 @@ func TestComputeInstanceLifecycle(t *testing.T) { ) require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, uint32(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE), ciProfileInfo.Id) - + // Test invalid engine profile _, ret = gi.GetComputeInstanceProfileInfo( nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, 999, // Invalid engine profile ) require.Equal(t, nvml.ERROR_NOT_SUPPORTED, ret) - + // Get compute instance placements _, ret = gi.GetComputeInstancePossiblePlacements(&ciProfileInfo) require.Equal(t, nvml.SUCCESS, ret) // Note: Original implementation has empty placements (TODO comment) - + // Create compute instance ci, ret := gi.CreateComputeInstance(&ciProfileInfo) require.Equal(t, nvml.SUCCESS, ret) require.NotNil(t, ci) - + // Test compute instance info ciInfo, ret := ci.GetInfo() require.Equal(t, nvml.SUCCESS, ret) @@ -404,22 +404,22 @@ func TestComputeInstanceLifecycle(t *testing.T) { require.Equal(t, gi, ciInfo.GpuInstance) require.Equal(t, ciProfileInfo.Id, ciInfo.ProfileId) require.Equal(t, uint32(0), ciInfo.Id) // First instance should have ID 0 - + // Get compute instances for this profile instances, ret := gi.GetComputeInstances(&ciProfileInfo) require.Equal(t, nvml.SUCCESS, ret) require.Len(t, instances, 1) require.Equal(t, ci, instances[0]) - + // Destroy compute instance ret = ci.Destroy() require.Equal(t, nvml.SUCCESS, ret) - + // Verify compute instance is removed instances, ret = gi.GetComputeInstances(&ciProfileInfo) require.Equal(t, nvml.SUCCESS, ret) require.Len(t, instances, 0) - + // Destroy GPU instance ret = gi.Destroy() require.Equal(t, nvml.SUCCESS, ret) @@ -428,20 +428,20 @@ func TestComputeInstanceLifecycle(t *testing.T) { // TestInitShutdownLifecycle verifies init/shutdown behavior func TestInitShutdownLifecycle(t *testing.T) { server := New() - + // Test init ret := server.Init() require.Equal(t, 
nvml.SUCCESS, ret) - + // Test lookup symbol err := server.LookupSymbol("nvmlInit") require.NoError(t, err) - + // Test extensions ext := server.Extensions() require.NotNil(t, ext) require.Equal(t, server, ext) - + // Test shutdown ret = server.Shutdown() require.Equal(t, nvml.SUCCESS, ret) @@ -450,39 +450,39 @@ func TestInitShutdownLifecycle(t *testing.T) { // TestMultipleDevices verifies all devices are unique and correctly indexed func TestMultipleDevices(t *testing.T) { server := New() - + devices := make([]nvml.Device, 8) uuids := make(map[string]bool) - + // Get all devices and verify uniqueness for i := 0; i < 8; i++ { device, ret := server.DeviceGetHandleByIndex(i) require.Equal(t, nvml.SUCCESS, ret) require.NotNil(t, device) - + devices[i] = device - + // Verify UUID is unique uuid, ret := device.GetUUID() require.Equal(t, nvml.SUCCESS, ret) require.NotEmpty(t, uuid) require.False(t, uuids[uuid], "UUID %s should be unique", uuid) uuids[uuid] = true - + // Verify device properties are consistent index, ret := device.GetIndex() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, i, index) - + minor, ret := device.GetMinorNumber() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, i, minor) - + name, ret := device.GetName() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, "Mock NVIDIA A100-SXM4-40GB", name) } - + // Verify all devices are distinct objects for i := 0; i < 8; i++ { for j := i + 1; j < 8; j++ { @@ -496,32 +496,32 @@ func TestA100SpecificCharacteristics(t *testing.T) { server := New() device, ret := server.DeviceGetHandleByIndex(0) require.Equal(t, nvml.SUCCESS, ret) - + // Test A100 doesn't support P2P in MIG (IsP2pSupported should be 0) profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, uint32(0), profileInfo.IsP2pSupported) - + // Test A100 memory values are correct profile1 := MIGProfiles.GpuInstanceProfiles[nvml.GPU_INSTANCE_PROFILE_1_SLICE] 
require.Equal(t, uint64(4864), profile1.MemorySizeMB) // 1g.5gb - + profile7 := MIGProfiles.GpuInstanceProfiles[nvml.GPU_INSTANCE_PROFILE_7_SLICE] require.Equal(t, uint64(40192), profile7.MemorySizeMB) // 7g.40gb - + // Test A100 architecture arch, ret := device.GetArchitecture() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, nvml.DeviceArchitecture(nvml.DEVICE_ARCH_AMPERE), arch) - + // Test A100 CUDA compute capability major, minor, ret := device.GetCudaComputeCapability() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, 8, major) // Ampere require.Equal(t, 0, minor) - + // Test A100 PCI device ID pciInfo, ret := device.GetPciInfo() require.Equal(t, nvml.SUCCESS, ret) require.Equal(t, uint32(0x20B010DE), pciInfo.PciDeviceId) // A100-SXM4-40GB -} \ No newline at end of file +} diff --git a/pkg/nvml/mock/dgxa100/mig-profile.go b/pkg/nvml/mock/dgxa100/mig-profile.go deleted file mode 100644 index c4df4c8..0000000 --- a/pkg/nvml/mock/dgxa100/mig-profile.go +++ /dev/null @@ -1,471 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package dgxa100 - -import ( - "github.com/NVIDIA/go-nvml/pkg/nvml" -) - -// MIGProfiles holds the profile information for GIs and CIs in this mock server. -// We should consider auto-generating this object in the future. 
-var MIGProfiles = struct { - GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo - ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo -}{ - GpuInstanceProfiles: map[int]nvml.GpuInstanceProfileInfo{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, - IsP2pSupported: 0, - SliceCount: 1, - InstanceCount: 7, - MultiprocessorCount: 14, - CopyEngineCount: 1, - DecoderCount: 0, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 4864, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, - IsP2pSupported: 0, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - CopyEngineCount: 1, - DecoderCount: 1, - EncoderCount: 0, - JpegCount: 1, - OfaCount: 1, - MemorySizeMB: 4864, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, - IsP2pSupported: 0, - SliceCount: 1, - InstanceCount: 4, - MultiprocessorCount: 14, - CopyEngineCount: 1, - DecoderCount: 1, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 9856, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, - IsP2pSupported: 0, - SliceCount: 2, - InstanceCount: 3, - MultiprocessorCount: 28, - CopyEngineCount: 2, - DecoderCount: 1, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 9856, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, - IsP2pSupported: 0, - SliceCount: 3, - InstanceCount: 2, - MultiprocessorCount: 42, - CopyEngineCount: 3, - DecoderCount: 2, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 19968, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, - IsP2pSupported: 0, - SliceCount: 4, - InstanceCount: 1, - MultiprocessorCount: 56, - CopyEngineCount: 4, - DecoderCount: 2, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 19968, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - Id: 
nvml.GPU_INSTANCE_PROFILE_7_SLICE, - IsP2pSupported: 0, - SliceCount: 7, - InstanceCount: 1, - MultiprocessorCount: 98, - CopyEngineCount: 7, - DecoderCount: 5, - EncoderCount: 0, - JpegCount: 1, - OfaCount: 1, - MemorySizeMB: 40192, - }, - }, - ComputeInstanceProfiles: map[int]map[int]nvml.ComputeInstanceProfileInfo{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - SharedCopyEngineCount: 1, - SharedDecoderCount: 0, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - SharedCopyEngineCount: 1, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - SharedCopyEngineCount: 1, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 2, - MultiprocessorCount: 14, - SharedCopyEngineCount: 2, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 1, - MultiprocessorCount: 28, - SharedCopyEngineCount: 2, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: 
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 3, - MultiprocessorCount: 14, - SharedCopyEngineCount: 3, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 1, - MultiprocessorCount: 28, - SharedCopyEngineCount: 3, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, - SliceCount: 3, - InstanceCount: 1, - MultiprocessorCount: 42, - SharedCopyEngineCount: 3, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 4, - MultiprocessorCount: 14, - SharedCopyEngineCount: 4, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 2, - MultiprocessorCount: 28, - SharedCopyEngineCount: 4, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, - SliceCount: 4, - InstanceCount: 1, - MultiprocessorCount: 56, - SharedCopyEngineCount: 4, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 7, - MultiprocessorCount: 14, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - 
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 3, - MultiprocessorCount: 28, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, - SliceCount: 3, - InstanceCount: 2, - MultiprocessorCount: 42, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, - SliceCount: 4, - InstanceCount: 1, - MultiprocessorCount: 56, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, - SliceCount: 7, - InstanceCount: 1, - MultiprocessorCount: 98, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - }, - }, -} - -// MIGPlacements holds the placement information for GIs and CIs in this mock server. -// We should consider auto-generating this object in the future. 
-var MIGPlacements = struct { - GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement - ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement -}{ - GpuInstancePossiblePlacements: map[int][]nvml.GpuInstancePlacement{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - { - Start: 0, - Size: 1, - }, - { - Start: 1, - Size: 1, - }, - { - Start: 2, - Size: 1, - }, - { - Start: 3, - Size: 1, - }, - { - Start: 4, - Size: 1, - }, - { - Start: 5, - Size: 1, - }, - { - Start: 6, - Size: 1, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - { - Start: 0, - Size: 1, - }, - { - Start: 1, - Size: 1, - }, - { - Start: 2, - Size: 1, - }, - { - Start: 3, - Size: 1, - }, - { - Start: 4, - Size: 1, - }, - { - Start: 5, - Size: 1, - }, - { - Start: 6, - Size: 1, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - { - Start: 0, - Size: 2, - }, - { - Start: 2, - Size: 2, - }, - { - Start: 4, - Size: 2, - }, - { - Start: 6, - Size: 2, - }, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - { - Start: 0, - Size: 2, - }, - { - Start: 2, - Size: 2, - }, - { - Start: 4, - Size: 2, - }, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - { - Start: 0, - Size: 4, - }, - { - Start: 4, - Size: 4, - }, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - { - Start: 0, - Size: 4, - }, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - { - Start: 0, - Size: 8, - }, - }, - }, - // TODO: Fill out ComputeInstancePossiblePlacements - ComputeInstancePossiblePlacements: map[int]map[int][]nvml.ComputeInstancePlacement{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - 
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {}, - }, - }, -} diff --git a/pkg/nvml/mock/shared/gpus/a100.go b/pkg/nvml/mock/shared/gpus/a100.go new file mode 100644 index 0000000..89b122d --- /dev/null +++ b/pkg/nvml/mock/shared/gpus/a100.go @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package gpus
+
+import (
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared"
+)
+
+// A100 GPU variants. They differ in name, memory size and PCI device ID; both
+// 40GB variants share one MIG profile set and both 80GB variants share another.
+var (
+	// A100_PCIE_40GB is the PCIe A100 with 40960 MB of memory.
+	A100_PCIE_40GB = shared.Config{
+		Name:         "NVIDIA A100-PCIE-40GB",
+		Architecture: nvml.DEVICE_ARCH_AMPERE,
+		Brand:        nvml.BRAND_NVIDIA,
+		MemoryMB:     40960,
+		CudaMajor:    8,
+		CudaMinor:    0,
+		PciDeviceId:  0x20F110DE,
+		MIGProfiles:  a100_40gb_MIGProfiles,
+	}
+	// A100_PCIE_80GB is the PCIe A100 with 81920 MB of memory.
+	A100_PCIE_80GB = shared.Config{
+		Name:         "NVIDIA A100-PCIE-80GB",
+		Architecture: nvml.DEVICE_ARCH_AMPERE,
+		Brand:        nvml.BRAND_NVIDIA,
+		MemoryMB:     81920,
+		CudaMajor:    8,
+		CudaMinor:    0,
+		PciDeviceId:  0x20B510DE,
+		MIGProfiles:  a100_80gb_MIGProfiles,
+	}
+	// A100_SXM4_40GB is the SXM4 A100 with 40960 MB of memory.
+	// NOTE(review): only this variant's Name carries a "Mock " prefix,
+	// presumably to match the name the legacy dgxa100 mock reported; confirm
+	// before normalizing it.
+	A100_SXM4_40GB = shared.Config{
+		Name:         "Mock NVIDIA A100-SXM4-40GB",
+		Architecture: nvml.DEVICE_ARCH_AMPERE,
+		Brand:        nvml.BRAND_NVIDIA,
+		MemoryMB:     40960,
+		CudaMajor:    8,
+		CudaMinor:    0,
+		PciDeviceId:  0x20B010DE,
+		MIGProfiles:  a100_40gb_MIGProfiles,
+	}
+	// A100_SXM4_80GB is the SXM4 A100 with 81920 MB of memory.
+	A100_SXM4_80GB = shared.Config{
+		Name:         "NVIDIA A100-SXM4-80GB",
+		Architecture: nvml.DEVICE_ARCH_AMPERE,
+		Brand:        nvml.BRAND_NVIDIA,
+		MemoryMB:     81920,
+		CudaMajor:    8,
+		CudaMinor:    0,
+		PciDeviceId:  0x20B210DE,
+		MIGProfiles:  a100_80gb_MIGProfiles,
+	}
+)
+
+// MIG profile sets. The two memory sizes differ only in the per-profile
+// MemorySizeMB of their GPU instance profiles; compute instance profiles and
+// all placements are shared.
+var (
+	a100_40gb_MIGProfiles = shared.MIGProfileConfig{
+		GpuInstanceProfiles:       a100_40gb_GpuInstanceProfiles,
+		ComputeInstanceProfiles:   a100_ComputeInstanceProfiles,
+		GpuInstancePlacements:     a100_GpuInstancePlacements,
+		ComputeInstancePlacements: a100_ComputeInstancePlacements,
+	}
+	a100_80gb_MIGProfiles = shared.MIGProfileConfig{
+		GpuInstanceProfiles:       a100_80gb_GpuInstanceProfiles,
+		ComputeInstanceProfiles:   a100_ComputeInstanceProfiles,
+		GpuInstancePlacements:     a100_GpuInstancePlacements,
+		ComputeInstancePlacements: a100_ComputeInstancePlacements,
+	}
+)
+
+// GPU instance profiles, keyed by GPU instance profile ID. Fields omitted from
+// a row are zero: IsP2pSupported and EncoderCount are zero for every profile,
+// JpegCount and OfaCount are zero unless listed. One slice carries 14 SMs, so
+// the 7-slice profile carries 7*14 = 98.
+var (
+	a100_40gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 7, MultiprocessorCount: 14, CopyEngineCount: 1, DecoderCount: 1, MemorySizeMB: 4864},
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, SliceCount: 1, InstanceCount: 1, MultiprocessorCount: 14, CopyEngineCount: 1, DecoderCount: 1, JpegCount: 1, OfaCount: 1, MemorySizeMB: 4864},
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, SliceCount: 1, InstanceCount: 4, MultiprocessorCount: 14, CopyEngineCount: 1, DecoderCount: 1, MemorySizeMB: 9856},
+		nvml.GPU_INSTANCE_PROFILE_2_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 3, MultiprocessorCount: 28, CopyEngineCount: 2, DecoderCount: 1, MemorySizeMB: 9856},
+		nvml.GPU_INSTANCE_PROFILE_3_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, SliceCount: 3, InstanceCount: 2, MultiprocessorCount: 42, CopyEngineCount: 3, DecoderCount: 2, MemorySizeMB: 19968},
+		nvml.GPU_INSTANCE_PROFILE_4_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, SliceCount: 4, InstanceCount: 1, MultiprocessorCount: 56, CopyEngineCount: 4, DecoderCount: 2, MemorySizeMB: 19968},
+		nvml.GPU_INSTANCE_PROFILE_7_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, SliceCount: 7, InstanceCount: 1, MultiprocessorCount: 98, CopyEngineCount: 7, DecoderCount: 5, JpegCount: 1, OfaCount: 1, MemorySizeMB: 40192},
+	}
+	a100_80gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 7, MultiprocessorCount: 14, CopyEngineCount: 1, DecoderCount: 1, MemorySizeMB: 9856},
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, SliceCount: 1, InstanceCount: 1, MultiprocessorCount: 14, CopyEngineCount: 1, DecoderCount: 1, JpegCount: 1, OfaCount: 1, MemorySizeMB: 9856},
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, SliceCount: 1, InstanceCount: 4, MultiprocessorCount: 14, CopyEngineCount: 1, DecoderCount: 1, MemorySizeMB: 19968},
+		nvml.GPU_INSTANCE_PROFILE_2_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 3, MultiprocessorCount: 28, CopyEngineCount: 2, DecoderCount: 1, MemorySizeMB: 19968},
+		nvml.GPU_INSTANCE_PROFILE_3_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, SliceCount: 3, InstanceCount: 2, MultiprocessorCount: 42, CopyEngineCount: 3, DecoderCount: 2, MemorySizeMB: 40192},
+		nvml.GPU_INSTANCE_PROFILE_4_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, SliceCount: 4, InstanceCount: 1, MultiprocessorCount: 56, CopyEngineCount: 4, DecoderCount: 2, MemorySizeMB: 40192},
+		nvml.GPU_INSTANCE_PROFILE_7_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, SliceCount: 7, InstanceCount: 1, MultiprocessorCount: 98, CopyEngineCount: 7, DecoderCount: 5, JpegCount: 1, OfaCount: 1, MemorySizeMB: 80384},
+	}
+)
+
+// a100_ComputeInstanceProfiles maps GPU instance profile ID -> compute instance
+// profile ID -> profile info. The values mirror the legacy dgxa100 MIGProfiles
+// table: 14 SMs per slice (a compute instance can never hold more SMs than the
+// GPU instance that contains it), and the Shared*Count fields repeat the engine
+// counts of the enclosing GPU instance. Omitted fields are zero.
+var a100_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 1, MultiprocessorCount: 14, SharedCopyEngineCount: 1},
+	},
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 1, MultiprocessorCount: 14, SharedCopyEngineCount: 1, SharedDecoderCount: 1, SharedJpegCount: 1, SharedOfaCount: 1},
+	},
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 1, MultiprocessorCount: 14, SharedCopyEngineCount: 1, SharedDecoderCount: 1},
+	},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 2, MultiprocessorCount: 14, SharedCopyEngineCount: 2, SharedDecoderCount: 1},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 1, MultiprocessorCount: 28, SharedCopyEngineCount: 2, SharedDecoderCount: 1},
+	},
+	nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 3, MultiprocessorCount: 14, SharedCopyEngineCount: 3, SharedDecoderCount: 2},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 1, MultiprocessorCount: 28, SharedCopyEngineCount: 3, SharedDecoderCount: 2},
+		nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, SliceCount: 3, InstanceCount: 1, MultiprocessorCount: 42, SharedCopyEngineCount: 3, SharedDecoderCount: 2},
+	},
+	nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 4, MultiprocessorCount: 14, SharedCopyEngineCount: 4, SharedDecoderCount: 2},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 2, MultiprocessorCount: 28, SharedCopyEngineCount: 4, SharedDecoderCount: 2},
+		nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, SliceCount: 4, InstanceCount: 1, MultiprocessorCount: 56, SharedCopyEngineCount: 4, SharedDecoderCount: 2},
+	},
+	nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 7, MultiprocessorCount: 14, SharedCopyEngineCount: 7, SharedDecoderCount: 5, SharedJpegCount: 1, SharedOfaCount: 1},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 3, MultiprocessorCount: 28, SharedCopyEngineCount: 7, SharedDecoderCount: 5, SharedJpegCount: 1, SharedOfaCount: 1},
+		nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, SliceCount: 3, InstanceCount: 2, MultiprocessorCount: 42, SharedCopyEngineCount: 7, SharedDecoderCount: 5, SharedJpegCount: 1, SharedOfaCount: 1},
+		nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, SliceCount: 4, InstanceCount: 1, MultiprocessorCount: 56, SharedCopyEngineCount: 7, SharedDecoderCount: 5, SharedJpegCount: 1, SharedOfaCount: 1},
+		nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, SliceCount: 7, InstanceCount: 1, MultiprocessorCount: 98, SharedCopyEngineCount: 7, SharedDecoderCount: 5, SharedJpegCount: 1, SharedOfaCount: 1},
+	},
+}
+
+// a100_GpuInstancePlacements lists the possible placements per GPU instance
+// profile ID. The values mirror the legacy dgxa100 MIGPlacements table; note
+// that Size is not always SliceCount (3-slice occupies 4, 7-slice occupies 8,
+// and 1-slice REV2 occupies 2).
+var a100_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+		{Start: 0, Size: 1}, {Start: 1, Size: 1}, {Start: 2, Size: 1}, {Start: 3, Size: 1},
+		{Start: 4, Size: 1}, {Start: 5, Size: 1}, {Start: 6, Size: 1},
+	},
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
+		{Start: 0, Size: 1}, {Start: 1, Size: 1}, {Start: 2, Size: 1}, {Start: 3, Size: 1},
+		{Start: 4, Size: 1}, {Start: 5, Size: 1}, {Start: 6, Size: 1},
+	},
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {
+		{Start: 0, Size: 2}, {Start: 2, Size: 2}, {Start: 4, Size: 2}, {Start: 6, Size: 2},
+	},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+		{Start: 0, Size: 2}, {Start: 2, Size: 2}, {Start: 4, Size: 2},
+	},
+	nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
+		{Start: 0, Size: 4}, {Start: 4, Size: 4},
+	},
+	nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+		{Start: 0, Size: 4},
+	},
+	nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
+		{Start: 0, Size: 8}, // the existing dgxa100 tests expect Size 8, not 7
+	},
+}
+
+// a100_ComputeInstancePlacements maps GPU instance profile ID -> compute
+// instance profile ID -> possible placements. Every compute instance profile
+// listed in a100_ComputeInstanceProfiles has a row here.
+var a100_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {{Start: 0, Size: 1}},
+	},
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {{Start: 0, Size: 1}},
+	},
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {{Start: 0, Size: 1}},
+	},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {{Start: 0, Size: 1}, {Start: 1, Size: 1}},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {{Start: 0, Size: 2}},
+	},
+	nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {{Start: 0, Size: 1}, {Start: 1, Size: 1}, {Start: 2, Size: 1}},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {{Start: 0, Size: 2}},
+		nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {{Start: 0, Size: 3}},
+	},
+	nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {{Start: 0, Size: 1}, {Start: 1, Size: 1}, {Start: 2, Size: 1}, {Start: 3, Size: 1}},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {{Start: 0, Size: 2}, {Start: 2, Size: 2}},
+		nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {{Start: 0, Size: 4}},
+	},
+	nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			{Start: 0, Size: 1}, {Start: 1, Size: 1}, {Start: 2, Size: 1}, {Start: 3, Size: 1},
+			{Start: 4, Size: 1}, {Start: 5, Size: 1}, {Start: 6, Size: 1},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {{Start: 0, Size: 2}, {Start: 2, Size: 2}, {Start: 4, Size: 2}},
+		nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {{Start: 0, Size: 3}, {Start: 4, Size: 3}},
+		nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {{Start: 0, Size: 4}},
+		nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {{Start: 0, Size: 8}}, // the existing dgxa100 tests expect Size 8, not 7
+	},
+}
diff --git a/pkg/nvml/mock/shared/gpus/a30.go b/pkg/nvml/mock/shared/gpus/a30.go
new file mode 100644
index 0000000..17085ba
--- /dev/null
+++ b/pkg/nvml/mock/shared/gpus/a30.go
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package gpus
+
+import (
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared"
+)
+
+// A30 GPU variants.
+var (
+	// A30_PCIE_24GB is the PCIe A30 with 24576 MB of memory.
+	A30_PCIE_24GB = shared.Config{
+		Name:         "NVIDIA A30-PCIE-24GB",
+		Architecture: nvml.DEVICE_ARCH_AMPERE,
+		Brand:        nvml.BRAND_NVIDIA,
+		MemoryMB:     24576,
+		CudaMajor:    8,
+		CudaMinor:    0,
+		PciDeviceId:  0x20B710DE,
+		MIGProfiles:  a30_24gb_MIGProfiles,
+	}
+)
+
+// a30_24gb_MIGProfiles bundles the four A30 MIG tables below.
+var a30_24gb_MIGProfiles = shared.MIGProfileConfig{
+	GpuInstanceProfiles:       a30_24gb_GpuInstanceProfiles,
+	ComputeInstanceProfiles:   a30_ComputeInstanceProfiles,
+	GpuInstancePlacements:     a30_GpuInstancePlacements,
+	ComputeInstancePlacements: a30_ComputeInstancePlacements,
+}
+
+// a30_24gb_GpuInstanceProfiles is keyed by GPU instance profile ID. Fields
+// omitted from a row are zero: IsP2pSupported and EncoderCount are zero for
+// every profile; DecoderCount, JpegCount and OfaCount are zero unless listed.
+var a30_24gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 4, MultiprocessorCount: 14, CopyEngineCount: 1, MemorySizeMB: 5836},
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, SliceCount: 1, InstanceCount: 1, MultiprocessorCount: 14, CopyEngineCount: 1, DecoderCount: 1, JpegCount: 1, OfaCount: 1, MemorySizeMB: 5836},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 2, MultiprocessorCount: 28, CopyEngineCount: 2, DecoderCount: 2, MemorySizeMB: 11672},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: {Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1, SliceCount: 2, InstanceCount: 1, MultiprocessorCount: 28, CopyEngineCount: 2, DecoderCount: 2, JpegCount: 1, OfaCount: 1, MemorySizeMB: 11672},
+	nvml.GPU_INSTANCE_PROFILE_4_SLICE:      {Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, SliceCount: 4, InstanceCount: 1, MultiprocessorCount: 56, CopyEngineCount: 4, DecoderCount: 4, JpegCount: 1, OfaCount: 1, MemorySizeMB: 23344},
+}
+
+// a30_ComputeInstanceProfiles maps GPU instance profile ID -> compute instance
+// profile ID -> profile info. Each entry's Id equals the key it is stored
+// under, so a profile obtained by ID reports that same ID and can be used to
+// index a30_ComputeInstancePlacements.
+var a30_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 1, MultiprocessorCount: 14},
+	},
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 1, MultiprocessorCount: 14},
+	},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 2, MultiprocessorCount: 14},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 1, MultiprocessorCount: 28},
+	},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 2, MultiprocessorCount: 14},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 1, MultiprocessorCount: 28},
+	},
+	nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, SliceCount: 1, InstanceCount: 4, MultiprocessorCount: 14},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, SliceCount: 2, InstanceCount: 2, MultiprocessorCount: 28},
+		nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, SliceCount: 4, InstanceCount: 1, MultiprocessorCount: 56},
+	},
+}
+
+// a30_GpuInstancePlacements lists the possible placements per GPU instance
+// profile ID.
+var a30_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE:      {{Start: 0, Size: 1}, {Start: 1, Size: 1}, {Start: 2, Size: 1}, {Start: 3, Size: 1}},
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {{Start: 0, Size: 1}, {Start: 1, Size: 1}, {Start: 2, Size: 1}, {Start: 3, Size: 1}},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE:      {{Start: 0, Size: 2}, {Start: 2, Size: 2}},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: {{Start: 0, Size: 2}, {Start: 2, Size: 2}},
+	nvml.GPU_INSTANCE_PROFILE_4_SLICE:      {{Start: 0, Size: 4}},
+}
+
+// a30_ComputeInstancePlacements maps GPU instance profile ID -> compute
+// instance profile ID -> possible placements.
+var a30_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {{Start: 0, Size: 1}},
+	},
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {{Start: 0, Size: 1}},
+	},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {{Start: 0, Size: 1}, {Start: 1, Size: 1}},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {{Start: 0, Size: 2}},
+	},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {{Start: 0, Size: 1}, {Start: 1, Size: 1}},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {{Start: 0, Size: 2}},
+	},
+	nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {{Start: 0, Size: 1}, {Start: 1, Size: 1}, {Start: 2, Size: 1}, {Start: 3, Size: 1}},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {{Start: 0, Size: 2}, {Start: 2, Size: 2}},
+		nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {{Start: 0, Size: 4}},
+	},
+}
diff --git a/pkg/nvml/mock/shared/shared.go b/pkg/nvml/mock/shared/shared.go
new file mode 100644
index 0000000..75d93c6
--- /dev/null
+++ b/pkg/nvml/mock/shared/shared.go
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Package shared provides reusable NVML mock implementations (server, device,
+// GPU instance and compute instance) that are configured from plain data.
+package shared
+
+import (
+	"fmt"
+	"sync"
+
+	"github.com/google/uuid"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"github.com/NVIDIA/go-nvml/pkg/nvml/mock"
+)
+
+// maxDevices is the fixed capacity of Server.Devices.
+const maxDevices = 8
+
+// Config contains the minimal configuration needed for a GPU generation.
+type Config struct {
+	Name         string
+	Architecture nvml.DeviceArchitecture
+	Brand        nvml.BrandType
+	MemoryMB     uint64 // converted to bytes (x 1024 x 1024) for MemoryInfo.Total
+	CudaMajor    int
+	CudaMinor    int
+	PciDeviceId  uint32
+	MIGProfiles  MIGProfileConfig
+}
+
+// ServerConfig contains the minimal configuration needed for a server.
+type ServerConfig struct {
+	Config            Config // GPU configuration applied to every device
+	GPUCount          int    // number of devices to create; capped at 8
+	DriverVersion     string
+	NvmlVersion       string
+	CudaDriverVersion int
+}
+
+// MIGProfileConfig contains MIG profile configuration for a GPU. The outer key
+// of every map is a GPU instance profile ID; the inner key of the two nested
+// maps is a compute instance profile ID.
+type MIGProfileConfig struct {
+	GpuInstanceProfiles       map[int]nvml.GpuInstanceProfileInfo
+	ComputeInstanceProfiles   map[int]map[int]nvml.ComputeInstanceProfileInfo
+	GpuInstancePlacements     map[int][]nvml.GpuInstancePlacement
+	ComputeInstancePlacements map[int]map[int][]nvml.ComputeInstancePlacement
+}
+
+// Server provides a reusable server implementation.
+//
+// Devices is populated from index 0 upwards; slots beyond the configured GPU
+// count are nil and are not reported by the mock functions.
+type Server struct {
+	mock.Interface
+	mock.ExtendedInterface
+	Devices           [maxDevices]nvml.Device
+	DriverVersion     string
+	NvmlVersion       string
+	CudaDriverVersion int
+}
+
+// Device provides a reusable device implementation. The embedded RWMutex
+// guards MigMode, GpuInstances and GpuInstanceCounter.
+type Device struct {
+	mock.Device
+	sync.RWMutex
+	UUID                  string
+	Name                  string
+	Brand                 nvml.BrandType
+	Architecture          nvml.DeviceArchitecture
+	PciBusID              string
+	Minor                 int
+	Index                 int
+	CudaComputeCapability CudaComputeCapability
+	MigMode               int
+	GpuInstances          map[*GpuInstance]struct{}
+	GpuInstanceCounter    uint32
+	MemoryInfo            nvml.Memory
+	PciDeviceId           uint32
+	MIGProfiles           MIGProfileConfig
+}
+
+// GpuInstance provides a reusable GPU instance implementation. The embedded
+// RWMutex guards ComputeInstances and ComputeInstanceCounter.
+type GpuInstance struct {
+	mock.GpuInstance
+	sync.RWMutex
+	Info                   nvml.GpuInstanceInfo
+	ComputeInstances       map[*ComputeInstance]struct{}
+	ComputeInstanceCounter uint32
+	MIGProfiles            MIGProfileConfig
+}
+
+// ComputeInstance provides a reusable compute instance implementation.
+type ComputeInstance struct {
+	mock.ComputeInstance
+	Info nvml.ComputeInstanceInfo
+}
+
+// CudaComputeCapability represents CUDA compute capability.
+type CudaComputeCapability struct {
+	Major int
+	Minor int
+}
+
+var _ nvml.Interface = (*Server)(nil)
+var _ nvml.Device = (*Device)(nil)
+var _ nvml.GpuInstance = (*GpuInstance)(nil)
+var _ nvml.ComputeInstance = (*ComputeInstance)(nil)
+
+// NewServerFromConfig creates a new server from the provided configuration.
+// At most 8 devices are created; a GPUCount above 8 is truncated and a
+// non-positive GPUCount yields a server without devices.
+func NewServerFromConfig(config ServerConfig) *Server {
+	server := &Server{
+		DriverVersion:     config.DriverVersion,
+		NvmlVersion:       config.NvmlVersion,
+		CudaDriverVersion: config.CudaDriverVersion,
+	}
+	for i := 0; i < config.GPUCount && i < maxDevices; i++ {
+		server.Devices[i] = NewDeviceFromConfig(config.Config, i)
+	}
+	server.SetMockFuncs()
+	return server
+}
+
+// NewDeviceFromConfig creates a new device from the provided GPU configuration.
+// The device gets a freshly generated UUID, and index is used for the device
+// index, the minor number and the bus field of the PCI bus ID.
+func NewDeviceFromConfig(config Config, index int) *Device {
+	device := &Device{
+		UUID:         "GPU-" + uuid.New().String(),
+		Name:         config.Name,
+		Brand:        config.Brand,
+		Architecture: config.Architecture,
+		PciBusID:     fmt.Sprintf("0000:%02x:00.0", index),
+		Minor:        index,
+		Index:        index,
+		CudaComputeCapability: CudaComputeCapability{
+			Major: config.CudaMajor,
+			Minor: config.CudaMinor,
+		},
+		GpuInstances:       make(map[*GpuInstance]struct{}),
+		GpuInstanceCounter: 0,
+		// NOTE(review): Free and Used are both reported as 0; confirm whether
+		// Free is meant to equal Total on an idle mock device.
+		MemoryInfo:  nvml.Memory{Total: config.MemoryMB * 1024 * 1024, Free: 0, Used: 0},
+		PciDeviceId: config.PciDeviceId,
+		MIGProfiles: config.MIGProfiles,
+	}
+	device.SetMockFuncs()
+	return device
+}
+
+// NewGpuInstanceFromInfo creates a new GPU instance that resolves its compute
+// instance profiles and placements from profiles.
+func NewGpuInstanceFromInfo(info nvml.GpuInstanceInfo, profiles MIGProfileConfig) *GpuInstance {
+	gi := &GpuInstance{
+		Info:                   info,
+		ComputeInstances:       make(map[*ComputeInstance]struct{}),
+		ComputeInstanceCounter: 0,
+		MIGProfiles:            profiles,
+	}
+	gi.SetMockFuncs()
+	return gi
+}
+
+// NewComputeInstanceFromInfo creates a new compute instance.
+func NewComputeInstanceFromInfo(info nvml.ComputeInstanceInfo) *ComputeInstance {
+	ci := &ComputeInstance{
+		Info: info,
+	}
+	ci.SetMockFuncs()
+	return ci
+}
+
+// deviceCount returns the number of populated slots at the front of s.Devices.
+func (s *Server) deviceCount() int {
+	for i, d := range s.Devices {
+		if d == nil {
+			return i
+		}
+	}
+	return len(s.Devices)
+}
+
+// findDevice returns the first *Device for which match reports true. Empty
+// slots and entries of any other concrete type are skipped instead of causing
+// a type-assertion panic.
+func (s *Server) findDevice(match func(*Device) bool) (nvml.Device, nvml.Return) {
+	for _, d := range s.Devices {
+		if dev, ok := d.(*Device); ok && dev != nil && match(dev) {
+			return d, nvml.SUCCESS
+		}
+	}
+	return nil, nvml.ERROR_INVALID_ARGUMENT
+}
+
+// SetMockFuncs configures all the mock function implementations for the server.
+func (s *Server) SetMockFuncs() {
+	s.ExtensionsFunc = func() nvml.ExtendedInterface {
+		return s
+	}
+
+	s.LookupSymbolFunc = func(symbol string) error {
+		return nil
+	}
+
+	s.InitFunc = func() nvml.Return {
+		return nvml.SUCCESS
+	}
+
+	s.ShutdownFunc = func() nvml.Return {
+		return nvml.SUCCESS
+	}
+
+	s.SystemGetDriverVersionFunc = func() (string, nvml.Return) {
+		return s.DriverVersion, nvml.SUCCESS
+	}
+
+	s.SystemGetNVMLVersionFunc = func() (string, nvml.Return) {
+		return s.NvmlVersion, nvml.SUCCESS
+	}
+
+	s.SystemGetCudaDriverVersionFunc = func() (int, nvml.Return) {
+		return s.CudaDriverVersion, nvml.SUCCESS
+	}
+
+	// Only populated slots count, so a server configured with fewer than 8
+	// GPUs neither over-reports nor hands out nil handles.
+	s.DeviceGetCountFunc = func() (int, nvml.Return) {
+		return s.deviceCount(), nvml.SUCCESS
+	}
+
+	s.DeviceGetHandleByIndexFunc = func(index int) (nvml.Device, nvml.Return) {
+		if index < 0 || index >= s.deviceCount() {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		return s.Devices[index], nvml.SUCCESS
+	}
+
+	// The parameter is named id (not uuid) so that it does not shadow the
+	// imported uuid package.
+	s.DeviceGetHandleByUUIDFunc = func(id string) (nvml.Device, nvml.Return) {
+		return s.findDevice(func(d *Device) bool { return d.UUID == id })
+	}
+
+	s.DeviceGetHandleByPciBusIdFunc = func(busID string) (nvml.Device, nvml.Return) {
+		return s.findDevice(func(d *Device) bool { return d.PciBusID == busID })
+	}
+}
+
+// createGpuInstance registers a new GPU instance with the given profile and
+// placement on the device. A nil info yields ERROR_INVALID_ARGUMENT.
+func (d *Device) createGpuInstance(info *nvml.GpuInstanceProfileInfo, placement nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) {
+	if info == nil {
+		return nil, nvml.ERROR_INVALID_ARGUMENT
+	}
+	d.Lock()
+	defer d.Unlock()
+	gi := NewGpuInstanceFromInfo(nvml.GpuInstanceInfo{
+		Device:    d,
+		Id:        d.GpuInstanceCounter,
+		ProfileId: info.Id,
+		Placement: placement,
+	}, d.MIGProfiles)
+	d.GpuInstanceCounter++
+	d.GpuInstances[gi] = struct{}{}
+	return gi, nvml.SUCCESS
+}
+
+// SetMockFuncs configures all the mock function implementations for the device.
+func (d *Device) SetMockFuncs() {
+	d.GetMinorNumberFunc = func() (int, nvml.Return) {
+		return d.Minor, nvml.SUCCESS
+	}
+
+	d.GetIndexFunc = func() (int, nvml.Return) {
+		return d.Index, nvml.SUCCESS
+	}
+
+	d.GetCudaComputeCapabilityFunc = func() (int, int, nvml.Return) {
+		return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS
+	}
+
+	d.GetUUIDFunc = func() (string, nvml.Return) {
+		return d.UUID, nvml.SUCCESS
+	}
+
+	d.GetNameFunc = func() (string, nvml.Return) {
+		return d.Name, nvml.SUCCESS
+	}
+
+	d.GetBrandFunc = func() (nvml.BrandType, nvml.Return) {
+		return d.Brand, nvml.SUCCESS
+	}
+
+	d.GetArchitectureFunc = func() (nvml.DeviceArchitecture, nvml.Return) {
+		return d.Architecture, nvml.SUCCESS
+	}
+
+	d.GetMemoryInfoFunc = func() (nvml.Memory, nvml.Return) {
+		return d.MemoryInfo, nvml.SUCCESS
+	}
+
+	d.GetPciInfoFunc = func() (nvml.PciInfo, nvml.Return) {
+		return nvml.PciInfo{PciDeviceId: d.PciDeviceId}, nvml.SUCCESS
+	}
+
+	// MigMode is accessed under the device lock, like the rest of the
+	// device's mutable state.
+	d.SetMigModeFunc = func(mode int) (nvml.Return, nvml.Return) {
+		d.Lock()
+		defer d.Unlock()
+		d.MigMode = mode
+		return nvml.SUCCESS, nvml.SUCCESS
+	}
+
+	// The mock applies mode changes immediately, so the same value is
+	// returned for both results.
+	d.GetMigModeFunc = func() (int, int, nvml.Return) {
+		d.RLock()
+		defer d.RUnlock()
+		return d.MigMode, d.MigMode, nvml.SUCCESS
+	}
+
+	d.GetGpuInstanceProfileInfoFunc = func(giProfileId int) (nvml.GpuInstanceProfileInfo, nvml.Return) {
+		if giProfileId < 0 || giProfileId >= nvml.GPU_INSTANCE_PROFILE_COUNT {
+			return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT
+		}
+		profile, ok := d.MIGProfiles.GpuInstanceProfiles[giProfileId]
+		if !ok {
+			return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
+		}
+		return profile, nvml.SUCCESS
+	}
+
+	d.GetGpuInstancePossiblePlacementsFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) {
+		if info == nil {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		return d.MIGProfiles.GpuInstancePlacements[int(info.Id)], nvml.SUCCESS
+	}
+
+	// Without an explicit placement the instance keeps the zero placement.
+	d.CreateGpuInstanceFunc = func(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) {
+		return d.createGpuInstance(info, nvml.GpuInstancePlacement{})
+	}
+
+	d.CreateGpuInstanceWithPlacementFunc = func(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) {
+		if placement == nil {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		return d.createGpuInstance(info, *placement)
+	}
+
+	d.GetGpuInstancesFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) {
+		if info == nil {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		d.RLock()
+		defer d.RUnlock()
+		var gis []nvml.GpuInstance
+		for gi := range d.GpuInstances {
+			if gi.Info.ProfileId == info.Id {
+				gis = append(gis, gi)
+			}
+		}
+		return gis, nvml.SUCCESS
+	}
+}
+
+// SetMockFuncs configures all the mock function implementations for the GPU instance.
+func (gi *GpuInstance) SetMockFuncs() {
+	gi.GetInfoFunc = func() (nvml.GpuInstanceInfo, nvml.Return) {
+		return gi.Info, nvml.SUCCESS
+	}
+
+	gi.GetComputeInstanceProfileInfoFunc = func(ciProfileId int, ciEngProfileId int) (nvml.ComputeInstanceProfileInfo, nvml.Return) {
+		if ciProfileId < 0 || ciProfileId >= nvml.COMPUTE_INSTANCE_PROFILE_COUNT {
+			return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT
+		}
+		if ciEngProfileId != nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED {
+			return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
+		}
+		// Indexing a missing outer key yields a nil inner map, whose lookup
+		// reports !ok, so a single comma-ok lookup covers both levels.
+		profile, ok := gi.MIGProfiles.ComputeInstanceProfiles[int(gi.Info.ProfileId)][ciProfileId]
+		if !ok {
+			return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
+		}
+		return profile, nvml.SUCCESS
+	}
+
+	// Placements are keyed by the GPU instance's profile ID, exactly like the
+	// compute instance profiles above. Info.Id is only a per-device creation
+	// counter and must not be used as the key.
+	gi.GetComputeInstancePossiblePlacementsFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) {
+		if info == nil {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		return gi.MIGProfiles.ComputeInstancePlacements[int(gi.Info.ProfileId)][int(info.Id)], nvml.SUCCESS
+	}
+
+	gi.CreateComputeInstanceFunc = func(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) {
+		if info == nil {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		gi.Lock()
+		defer gi.Unlock()
+		ci := NewComputeInstanceFromInfo(nvml.ComputeInstanceInfo{
+			Device:      gi.Info.Device,
+			GpuInstance: gi,
+			Id:          gi.ComputeInstanceCounter,
+			ProfileId:   info.Id,
+		})
+		gi.ComputeInstanceCounter++
+		gi.ComputeInstances[ci] = struct{}{}
+		return ci, nvml.SUCCESS
+	}
+
+	gi.GetComputeInstancesFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) {
+		if info == nil {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		gi.RLock()
+		defer gi.RUnlock()
+		var cis []nvml.ComputeInstance
+		for ci := range gi.ComputeInstances {
+			if ci.Info.ProfileId == info.Id {
+				cis = append(cis, ci)
+			}
+		}
+		return cis, nvml.SUCCESS
+	}
+
+	// Destroy unregisters the instance from its parent device.
+	gi.DestroyFunc = func() nvml.Return {
+		d := gi.Info.Device.(*Device)
+		d.Lock()
+		defer d.Unlock()
+		delete(d.GpuInstances, gi)
+		return nvml.SUCCESS
+	}
+}
+
+// SetMockFuncs configures all the mock function implementations for the compute instance.
+func (ci *ComputeInstance) SetMockFuncs() {
+	ci.GetInfoFunc = func() (nvml.ComputeInstanceInfo, nvml.Return) {
+		return ci.Info, nvml.SUCCESS
+	}
+
+	// Destroy unregisters the instance from its parent GPU instance.
+	ci.DestroyFunc = func() nvml.Return {
+		gi := ci.Info.GpuInstance.(*GpuInstance)
+		gi.Lock()
+		defer gi.Unlock()
+		delete(gi.ComputeInstances, ci)
+		return nvml.SUCCESS
+	}
+}
From 695e2eb8c5527ece6656ccbc79f0d493a2d0bdea Mon Sep 17 00:00:00 2001
From: Fabien Dupont
Date: Wed, 24 Sep 2025 11:36:38 +0200
Subject: [PATCH 3/5] feat: Add H100 and H200 GPU mock implementations using shared factory

Implements DGX H100 and H200 GPU mocks following the established shared factory pattern for consistency with existing A100/A30 implementations.

- Add H100 SXM5 80GB configuration with complete MIG profile support
- Add H200 SXM5 141GB configuration with complete MIG profile support
- Implement dgxh100 and dgxh200 packages using shared factory pattern
- Include all 7 MIG profiles (standard, REV1 media, REV2 double memory)
- Maintain full backward compatibility with legacy globals and type aliases
- Use NVIDIA-spec compliant memory allocations and SM distributions

Signed-off-by: Fabien Dupont
---
 pkg/nvml/mock/dgxh100/dgxh100.go      |  79 +++++++
 pkg/nvml/mock/dgxh100/dgxh100_test.go |  69 ++++++
 pkg/nvml/mock/dgxh200/dgxh200.go      |  79 +++++++
 pkg/nvml/mock/dgxh200/dgxh200_test.go | 172 ++++++++++++++
 pkg/nvml/mock/shared/gpus/h100.go     | 327 ++++++++++++++++++++++++++
 pkg/nvml/mock/shared/gpus/h200.go     | 327 ++++++++++++++++++++++++++
 6 files changed, 1053 insertions(+)
 create mode 100644 pkg/nvml/mock/dgxh100/dgxh100.go
 create mode 100644 pkg/nvml/mock/dgxh100/dgxh100_test.go
 create mode 100644 pkg/nvml/mock/dgxh200/dgxh200.go
 create mode 100644 pkg/nvml/mock/dgxh200/dgxh200_test.go
 create mode 100644 pkg/nvml/mock/shared/gpus/h100.go
 create mode 100644 pkg/nvml/mock/shared/gpus/h200.go

diff --git a/pkg/nvml/mock/dgxh100/dgxh100.go b/pkg/nvml/mock/dgxh100/dgxh100.go
new file mode 100644
index 0000000..aa46c46
--- /dev/null
+++ b/pkg/nvml/mock/dgxh100/dgxh100.go
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dgxh100 + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus" +) + +// Backwards compatible type aliases +type Server = shared.Server +type Device = shared.Device +type GpuInstance = shared.GpuInstance +type ComputeInstance = shared.ComputeInstance +type CudaComputeCapability = shared.CudaComputeCapability + +func New() *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpus.H100_SXM5_80GB, + GPUCount: 8, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, + }) +} + +func NewDevice(index int) *Device { + return shared.NewDeviceFromConfig(gpus.H100_SXM5_80GB, index) +} + +// NewServerWithGPU creates a new server with a specific H100 GPU variant +func NewServerWithGPU(gpuConfig shared.Config) *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpuConfig, + GPUCount: 8, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, + }) +} + +// NewDeviceWithGPU creates a new device with a specific H100 GPU variant +func NewDeviceWithGPU(gpuConfig shared.Config, index int) *Device { + return shared.NewDeviceFromConfig(gpuConfig, index) +} + +// Legacy globals for backward compatibility - expose the internal data +var ( + MIGProfiles = struct { + GpuInstanceProfiles 
map[int]nvml.GpuInstanceProfileInfo + ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo + }{ + GpuInstanceProfiles: gpus.H100_SXM5_80GB.MIGProfiles.GpuInstanceProfiles, + ComputeInstanceProfiles: gpus.H100_SXM5_80GB.MIGProfiles.ComputeInstanceProfiles, + } + + MIGPlacements = struct { + GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement + }{ + GpuInstancePossiblePlacements: gpus.H100_SXM5_80GB.MIGProfiles.GpuInstancePlacements, + ComputeInstancePossiblePlacements: gpus.H100_SXM5_80GB.MIGProfiles.ComputeInstancePlacements, + } +) diff --git a/pkg/nvml/mock/dgxh100/dgxh100_test.go b/pkg/nvml/mock/dgxh100/dgxh100_test.go new file mode 100644 index 0000000..c0bec49 --- /dev/null +++ b/pkg/nvml/mock/dgxh100/dgxh100_test.go @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package dgxh100 + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +func TestH100Server(t *testing.T) { + server := New() + + count, ret := server.DeviceGetCount() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 8, count) + + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, device) + + name, ret := device.GetName() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "NVIDIA H100 80GB HBM3", name) + + arch, ret := device.GetArchitecture() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.DeviceArchitecture(nvml.DEVICE_ARCH_HOPPER), arch) + + major, minor, ret := device.GetCudaComputeCapability() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 9, major) + require.Equal(t, 0, minor) + + memory, ret := device.GetMemoryInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint64(81920*1024*1024), memory.Total) // 80GB + + // Test H100 supports P2P in MIG (IsP2pSupported should be 1) + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(1), profileInfo.IsP2pSupported) + + // Test MIG functionality + gpuInstance, ret := device.CreateGpuInstance(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gpuInstance) + + giInfo, ret := gpuInstance.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0), giInfo.Id) + require.Equal(t, uint32(nvml.GPU_INSTANCE_PROFILE_1_SLICE), giInfo.ProfileId) +} diff --git a/pkg/nvml/mock/dgxh200/dgxh200.go b/pkg/nvml/mock/dgxh200/dgxh200.go new file mode 100644 index 0000000..fec64ee --- /dev/null +++ b/pkg/nvml/mock/dgxh200/dgxh200.go @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dgxh200 + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus" +) + +// Backwards compatible type aliases +type Server = shared.Server +type Device = shared.Device +type GpuInstance = shared.GpuInstance +type ComputeInstance = shared.ComputeInstance +type CudaComputeCapability = shared.CudaComputeCapability + +func New() *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpus.H200_SXM5_141GB, + GPUCount: 8, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, + }) +} + +func NewDevice(index int) *Device { + return shared.NewDeviceFromConfig(gpus.H200_SXM5_141GB, index) +} + +// NewServerWithGPU creates a new server with a specific H200 GPU variant +func NewServerWithGPU(gpuConfig shared.Config) *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpuConfig, + GPUCount: 8, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, + }) +} + +// NewDeviceWithGPU creates a new device with a specific H200 GPU variant +func NewDeviceWithGPU(gpuConfig shared.Config, index int) *Device { + return shared.NewDeviceFromConfig(gpuConfig, index) +} + +// Legacy globals for backward compatibility - expose the internal data +var ( + MIGProfiles = struct { + GpuInstanceProfiles 
map[int]nvml.GpuInstanceProfileInfo + ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo + }{ + GpuInstanceProfiles: gpus.H200_SXM5_141GB.MIGProfiles.GpuInstanceProfiles, + ComputeInstanceProfiles: gpus.H200_SXM5_141GB.MIGProfiles.ComputeInstanceProfiles, + } + + MIGPlacements = struct { + GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement + }{ + GpuInstancePossiblePlacements: gpus.H200_SXM5_141GB.MIGProfiles.GpuInstancePlacements, + ComputeInstancePossiblePlacements: gpus.H200_SXM5_141GB.MIGProfiles.ComputeInstancePlacements, + } +) diff --git a/pkg/nvml/mock/dgxh200/dgxh200_test.go b/pkg/nvml/mock/dgxh200/dgxh200_test.go new file mode 100644 index 0000000..e0b1ea4 --- /dev/null +++ b/pkg/nvml/mock/dgxh200/dgxh200_test.go @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package dgxh200 + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +func TestH200Server(t *testing.T) { + server := New() + + count, ret := server.DeviceGetCount() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 8, count) + + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, device) + + name, ret := device.GetName() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "NVIDIA H200 141GB HBM3e", name) + + arch, ret := device.GetArchitecture() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.DeviceArchitecture(nvml.DEVICE_ARCH_HOPPER), arch) + + major, minor, ret := device.GetCudaComputeCapability() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 9, major) + require.Equal(t, 0, minor) + + memory, ret := device.GetMemoryInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint64(144384*1024*1024), memory.Total) // 141GB + + // Test H200 supports P2P in MIG (IsP2pSupported should be 1) + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(1), profileInfo.IsP2pSupported) + + // Test MIG functionality + gpuInstance, ret := device.CreateGpuInstance(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gpuInstance) + + giInfo, ret := gpuInstance.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0), giInfo.Id) + require.Equal(t, uint32(nvml.GPU_INSTANCE_PROFILE_1_SLICE), giInfo.ProfileId) + + // Test compute instance creation + ciProfileInfo, ret := gpuInstance.GetComputeInstanceProfileInfo(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED) + require.Equal(t, nvml.SUCCESS, ret) + + computeInstance, ret := gpuInstance.CreateComputeInstance(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, 
computeInstance) + + ciInfo, ret := computeInstance.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0), ciInfo.Id) + require.Equal(t, uint32(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE), ciInfo.ProfileId) +} + +func TestH200Device(t *testing.T) { + device := NewDevice(3) + + index, ret := device.GetIndex() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 3, index) + + minor, ret := device.GetMinorNumber() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 3, minor) + + uuid, ret := device.GetUUID() + require.Equal(t, nvml.SUCCESS, ret) + require.Contains(t, uuid, "GPU-") + + brand, ret := device.GetBrand() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.BRAND_NVIDIA, brand) + + pciInfo, ret := device.GetPciInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0x233310DE), pciInfo.PciDeviceId) +} + +func TestH200MIGProfiles(t *testing.T) { + device := NewDevice(0) + + // Test the standard GPU instance profiles (the REV1/REV2 variants are not covered here) + testCases := []struct { + profile int + sliceCount uint32 + memoryMB uint64 + multiproc uint32 + }{ + {nvml.GPU_INSTANCE_PROFILE_1_SLICE, 1, 18432, 16}, + {nvml.GPU_INSTANCE_PROFILE_2_SLICE, 2, 35840, 32}, + {nvml.GPU_INSTANCE_PROFILE_3_SLICE, 3, 72704, 48}, + {nvml.GPU_INSTANCE_PROFILE_4_SLICE, 4, 72704, 64}, + {nvml.GPU_INSTANCE_PROFILE_7_SLICE, 7, 144384, 112}, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("profile_%d_slice", tc.sliceCount), func(t *testing.T) { + profileInfo, ret := device.GetGpuInstanceProfileInfo(tc.profile) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(tc.profile), profileInfo.Id) + require.Equal(t, tc.sliceCount, profileInfo.SliceCount) + require.Equal(t, tc.memoryMB, profileInfo.MemorySizeMB) + require.Equal(t, tc.multiproc, profileInfo.MultiprocessorCount) + require.Equal(t, uint32(1), profileInfo.IsP2pSupported) // H200 supports P2P + }) + } +} + +func TestH200MIGInstanceCreation(t *testing.T) { + device := NewDevice(0) + + // Test creating
multiple GPU instances of the same profile + profileInfo1, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + gi1, ret := device.CreateGpuInstance(&profileInfo1) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gi1) + + gi2, ret := device.CreateGpuInstance(&profileInfo1) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gi2) + + // Verify they have different IDs + gi1Info, ret := gi1.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + gi2Info, ret := gi2.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.NotEqual(t, gi1Info.Id, gi2Info.Id) + + // Test that we can create compute instances on each GPU instance + ciProfileInfo, ret := gi1.GetComputeInstanceProfileInfo(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED) + require.Equal(t, nvml.SUCCESS, ret) + + ci1, ret := gi1.CreateComputeInstance(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, ci1) + + ci2, ret := gi2.CreateComputeInstance(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, ci2) +} diff --git a/pkg/nvml/mock/shared/gpus/h100.go b/pkg/nvml/mock/shared/gpus/h100.go new file mode 100644 index 0000000..cd30163 --- /dev/null +++ b/pkg/nvml/mock/shared/gpus/h100.go @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package gpus + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" +) + +// H100 GPU Variants +var ( + H100_SXM5_80GB = shared.Config{ + Name: "NVIDIA H100 80GB HBM3", + Architecture: nvml.DEVICE_ARCH_HOPPER, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 81920, // 80GB + CudaMajor: 9, + CudaMinor: 0, + PciDeviceId: 0x233010DE, + MIGProfiles: h100_80gb_MIGProfiles, + } +) + +var ( + h100_80gb_MIGProfiles = shared.MIGProfileConfig{ + GpuInstanceProfiles: h100_80gb_GpuInstanceProfiles, + ComputeInstanceProfiles: h100_ComputeInstanceProfiles, + GpuInstancePlacements: h100_GpuInstancePlacements, + ComputeInstancePlacements: h100_ComputeInstancePlacements, + } +) + +var ( + h100_80gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 10240, // 10GB (MIG 1g.10gb) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 10240, // 10GB (MIG 1g.10gb+me) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 20480, // 20GB (MIG 1g.20gb) + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 1, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + 
MemorySizeMB: 20480, // 20GB (MIG 2g.20gb) + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 1, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 48, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 40960, // 40GB (MIG 3g.40gb) + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 1, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 40960, // 40GB (MIG 4g.40gb) + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 1, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 81920, // 80GB (MIG 7g.80gb) + }, + } +) + +var h100_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + 
nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 48, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 48, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + }, + }, +} + +var h100_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, +} + +var 
h100_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{ + 0: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + 1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + 2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + }, + }, + 3: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + }, + 4: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, + }, +} diff --git a/pkg/nvml/mock/shared/gpus/h200.go b/pkg/nvml/mock/shared/gpus/h200.go new file mode 100644 index 0000000..1c20a2f --- /dev/null +++ b/pkg/nvml/mock/shared/gpus/h200.go @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package gpus + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" +) + +// H200 GPU Variants +var ( + H200_SXM5_141GB = shared.Config{ + Name: "NVIDIA H200 141GB HBM3e", + Architecture: nvml.DEVICE_ARCH_HOPPER, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 144384, // 141GB + CudaMajor: 9, + CudaMinor: 0, + PciDeviceId: 0x233310DE, + MIGProfiles: h200_141gb_MIGProfiles, + } +) + +var ( + h200_141gb_MIGProfiles = shared.MIGProfileConfig{ + GpuInstanceProfiles: h200_141gb_GpuInstanceProfiles, + ComputeInstanceProfiles: h200_ComputeInstanceProfiles, + GpuInstancePlacements: h200_GpuInstancePlacements, + ComputeInstancePlacements: h200_ComputeInstancePlacements, + } +) + +var ( + h200_141gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 18432, // 18GB (MIG 1g.18gb) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 18432, // 18GB (MIG 1g.18gb+me) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 
4, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 35840, // 35GB (MIG 1g.35gb) + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 1, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 35840, // 35GB (MIG 2g.35gb) + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 1, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 48, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 72704, // 71GB (MIG 3g.71gb) + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 1, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 72704, // 71GB (MIG 4g.71gb) + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 1, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 144384, // 141GB (MIG 7g.141gb) + }, + } +) + +var h200_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + 
InstanceCount: 1, + MultiprocessorCount: 32, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 48, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 48, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + }, + }, +} + +var h200_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 
5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, +} + +var h200_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{ + 0: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + 1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + 2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + }, + }, + 3: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + }, + 4: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, + }, +} From 48cd9219d58c122b8f8c979bf944fbf1ac706748 Mon Sep 17 00:00:00 2001 From: Fabien Dupont Date: Wed, 24 Sep 2025 16:52:31 +0200 Subject: [PATCH 4/5] feat: Add B200 GPU mock 
implementation using shared factory Implements DGX B200 mock following the established shared factory pattern: - Add B200 SXM5 180GB GPU configuration with Blackwell architecture - Comprehensive MIG profiles matching NVIDIA specifications: * Memory allocations: 23GB, 45GB, 90GB, 180GB per NVIDIA MIG User Guide * REV1 (media extensions) and REV2 (expanded memory) profiles * Full P2P support in MIG mode (IsP2pSupported: 1) * 144 SMs total with 18 SMs per slice - Complete DGX B200 server implementation with 8 GPUs - Driver version 560.28.03, NVML 12.560.28.03, CUDA 12060 - Comprehensive test suite covering server, device, and MIG operations - Backward compatible legacy global variables (MIGProfiles, MIGPlacements) Memory values corrected from initial 192GB to 180GB based on official NVIDIA MIG User Guide specifications. Signed-off-by: Fabien Dupont --- pkg/nvml/mock/dgxb200/dgxb200.go | 79 ++++++ pkg/nvml/mock/dgxb200/dgxb200_test.go | 199 ++++++++++++++ pkg/nvml/mock/shared/gpus/b200.go | 361 ++++++++++++++++++++++++++ 3 files changed, 639 insertions(+) create mode 100644 pkg/nvml/mock/dgxb200/dgxb200.go create mode 100644 pkg/nvml/mock/dgxb200/dgxb200_test.go create mode 100644 pkg/nvml/mock/shared/gpus/b200.go diff --git a/pkg/nvml/mock/dgxb200/dgxb200.go b/pkg/nvml/mock/dgxb200/dgxb200.go new file mode 100644 index 0000000..0e0a586 --- /dev/null +++ b/pkg/nvml/mock/dgxb200/dgxb200.go @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Package dgxb200 provides a mock NVML server and devices for a DGX B200
+// system. Every constructor forwards to the shared mock factory in package
+// shared, using a GPU configuration from package gpus.
+package dgxb200
+
+import (
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared"
+	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus"
+)
+
+// System-level settings passed to the shared factory for every DGX B200
+// server. They are declared once so that New and NewServerWithGPU cannot
+// drift apart.
+const (
+	gpuCount          = 8
+	driverVersion     = "560.28.03"
+	nvmlVersion       = "12.560.28.03"
+	cudaDriverVersion = 12060
+)
+
+// Backwards compatible type aliases
+type Server = shared.Server
+type Device = shared.Device
+type GpuInstance = shared.GpuInstance
+type ComputeInstance = shared.ComputeInstance
+type CudaComputeCapability = shared.CudaComputeCapability
+
+// New creates a DGX B200 server using the default GPU configuration
+// (gpus.B200_SXM5_180GB).
+func New() *Server {
+	return NewServerWithGPU(gpus.B200_SXM5_180GB)
+}
+
+// NewDevice creates a device with the given index using the default GPU
+// configuration (gpus.B200_SXM5_180GB).
+func NewDevice(index int) *Device {
+	return NewDeviceWithGPU(gpus.B200_SXM5_180GB, index)
+}
+
+// NewServerWithGPU creates a new server with a specific B200 GPU variant.
+// Only the GPU configuration varies; the GPU count and the driver, NVML and
+// CUDA versions are the same as for New.
+func NewServerWithGPU(gpuConfig shared.Config) *Server {
+	return shared.NewServerFromConfig(shared.ServerConfig{
+		Config:            gpuConfig,
+		GPUCount:          gpuCount,
+		DriverVersion:     driverVersion,
+		NvmlVersion:       nvmlVersion,
+		CudaDriverVersion: cudaDriverVersion,
+	})
+}
+
+// NewDeviceWithGPU creates a new device with a specific B200 GPU variant
+func NewDeviceWithGPU(gpuConfig shared.Config, index int) *Device {
+	return shared.NewDeviceFromConfig(gpuConfig, index)
+}
+
+// Legacy globals for backward compatibility - expose the internal data.
+//
+// Both values hold the same map values as gpus.B200_SXM5_180GB.MIGProfiles
+// (maps are not copied on assignment), so a write through one is visible
+// through the other.
+var (
+	// MIGProfiles exposes the GPU instance and compute instance profile
+	// tables of the default B200 configuration.
+	MIGProfiles = struct {
+		GpuInstanceProfiles     map[int]nvml.GpuInstanceProfileInfo
+		ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo
+	}{
+		GpuInstanceProfiles:     gpus.B200_SXM5_180GB.MIGProfiles.GpuInstanceProfiles,
+		ComputeInstanceProfiles: gpus.B200_SXM5_180GB.MIGProfiles.ComputeInstanceProfiles,
+	}
+
+	// MIGPlacements exposes the GPU instance and compute instance placement
+	// tables of the default B200 configuration.
+	MIGPlacements = struct {
+		GpuInstancePossiblePlacements     map[int][]nvml.GpuInstancePlacement
+		ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement
+	}{
+		GpuInstancePossiblePlacements:     gpus.B200_SXM5_180GB.MIGProfiles.GpuInstancePlacements,
+		ComputeInstancePossiblePlacements: gpus.B200_SXM5_180GB.MIGProfiles.ComputeInstancePlacements,
+	}
+)
diff --git a/pkg/nvml/mock/dgxb200/dgxb200_test.go b/pkg/nvml/mock/dgxb200/dgxb200_test.go
new file mode 100644
index 0000000..8c08bc9
--- /dev/null
+++ b/pkg/nvml/mock/dgxb200/dgxb200_test.go
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package dgxb200
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+)
+
+// TestB200Server checks the server returned by New: the device count, the
+// name, architecture, CUDA compute capability and total memory of device 0,
+// and then the creation of a 1-slice GPU instance and a 1-slice compute
+// instance on that device.
+func TestB200Server(t *testing.T) {
+	server := New()
+
+	count, ret := server.DeviceGetCount()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, 8, count)
+
+	device, ret := server.DeviceGetHandleByIndex(0)
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.NotNil(t, device)
+
+	name, ret := device.GetName()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, "NVIDIA B200 180GB HBM3e", name)
+
+	arch, ret := device.GetArchitecture()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, nvml.DeviceArchitecture(nvml.DEVICE_ARCH_BLACKWELL), arch)
+
+	major, minor, ret := device.GetCudaComputeCapability()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, 10, major)
+	require.Equal(t, 0, minor)
+
+	memory, ret := device.GetMemoryInfo()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, uint64(184320*1024*1024), memory.Total) // 180GB (184320 MiB expressed in bytes)
+
+	// Test B200 supports P2P in MIG (IsP2pSupported should be 1)
+	profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE)
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, uint32(1), profileInfo.IsP2pSupported)
+
+	// Test MIG functionality
+	gpuInstance, ret := device.CreateGpuInstance(&profileInfo)
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.NotNil(t, gpuInstance)
+
+	// The first GPU instance created on the device is expected to get Id 0.
+	giInfo, ret := gpuInstance.GetInfo()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, uint32(0), giInfo.Id)
+	require.Equal(t, uint32(nvml.GPU_INSTANCE_PROFILE_1_SLICE), giInfo.ProfileId)
+
+	// Test compute instance creation
+	ciProfileInfo, ret := gpuInstance.GetComputeInstanceProfileInfo(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED)
+	require.Equal(t, nvml.SUCCESS, ret)
+
+	computeInstance, ret := gpuInstance.CreateComputeInstance(&ciProfileInfo)
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.NotNil(t, computeInstance)
+
+	// The first compute instance created on the GPU instance is expected to get Id 0.
+	ciInfo, ret := computeInstance.GetInfo()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, uint32(0), ciInfo.Id)
+	require.Equal(t, uint32(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE), ciInfo.ProfileId)
+}
+
+// TestB200Device checks a standalone device built with NewDevice(5): its
+// index and minor number both equal 5, its UUID contains "GPU-", and its
+// brand and PCI device ID match the expected B200 values.
+func TestB200Device(t *testing.T) {
+	device := NewDevice(5)
+
+	index, ret := device.GetIndex()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, 5, index)
+
+	minor, ret := device.GetMinorNumber()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, 5, minor)
+
+	uuid, ret := device.GetUUID()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Contains(t, uuid, "GPU-")
+
+	brand, ret := device.GetBrand()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, nvml.BRAND_NVIDIA, brand)
+
+	pciInfo, ret := device.GetPciInfo()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, uint32(0x2B0010DE), pciInfo.PciDeviceId)
+}
+
+// TestB200MIGProfiles is a table-driven check of the 1-, 2-, 3-, 4- and
+// 7-slice GPU instance profiles: slice count, memory size, multiprocessor
+// count and encoder/JPEG/OFA engine counts. The REV1 and REV2 1-slice
+// profiles are not part of the table.
+func TestB200MIGProfiles(t *testing.T) {
+	device := NewDevice(0)
+
+	// Test all GPU instance profiles
+	testCases := []struct {
+		profile    int
+		sliceCount uint32
+		memoryMB   uint64
+		multiproc  uint32
+		encoders   uint32
+		jpegs      uint32
+		ofas       uint32
+	}{
+		{nvml.GPU_INSTANCE_PROFILE_1_SLICE, 1, 23552, 18, 0, 0, 0},
+		{nvml.GPU_INSTANCE_PROFILE_2_SLICE, 2, 46080, 36, 1, 1, 1},
+		{nvml.GPU_INSTANCE_PROFILE_3_SLICE, 3, 92160, 54, 2, 2, 2},
+		{nvml.GPU_INSTANCE_PROFILE_4_SLICE, 4, 92160, 72, 2, 2, 2},
+		{nvml.GPU_INSTANCE_PROFILE_7_SLICE, 7, 184320, 126, 4, 4, 4},
+	}
+
+	for _, tc := range testCases {
+		// Subtests are named after the slice count, which is unique per row.
+		t.Run(fmt.Sprintf("profile_%d_slice", tc.sliceCount), func(t *testing.T) {
+			profileInfo, ret := device.GetGpuInstanceProfileInfo(tc.profile)
+			require.Equal(t, nvml.SUCCESS, ret)
+			require.Equal(t, uint32(tc.profile), profileInfo.Id)
+			require.Equal(t, tc.sliceCount, profileInfo.SliceCount)
+			require.Equal(t, tc.memoryMB, profileInfo.MemorySizeMB)
+			require.Equal(t, tc.multiproc, profileInfo.MultiprocessorCount)
+			require.Equal(t, tc.encoders, profileInfo.EncoderCount)
+			require.Equal(t, tc.jpegs, profileInfo.JpegCount)
+			require.Equal(t, tc.ofas, profileInfo.OfaCount)
+			require.Equal(t, uint32(1), profileInfo.IsP2pSupported) // B200 supports P2P
+		})
+	}
+}
+
+// TestB200AdvancedFeatures checks the media-engine counts of the 7-slice GPU
+// instance profile, creates a GPU instance from it, and checks the
+// multiprocessor count of the 7-slice compute instance profile.
+func TestB200AdvancedFeatures(t *testing.T) {
+	device := NewDevice(0)
+
+	// Test the media-engine counts of the full 7-slice profile.
+	// NOTE(review): the original comment described these as enhanced compared
+	// to H100/H200; no H100/H200 value is asserted here - confirm before
+	// relying on that comparison.
+	profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_7_SLICE)
+	require.Equal(t, nvml.SUCCESS, ret)
+
+	// Expected engine counts for the 7-slice profile
+	require.Equal(t, uint32(4), profileInfo.EncoderCount) // Encoder engines
+	require.Equal(t, uint32(4), profileInfo.JpegCount)    // JPEG engines
+	require.Equal(t, uint32(4), profileInfo.OfaCount)     // OFA engines
+
+	// Test GPU instance creation with advanced profile
+	gpuInstance, ret := device.CreateGpuInstance(&profileInfo)
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.NotNil(t, gpuInstance)
+
+	// Test compute instance with 7-slice profile
+	ciProfileInfo, ret := gpuInstance.GetComputeInstanceProfileInfo(nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED)
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, uint32(126), ciProfileInfo.MultiprocessorCount) // 7 slices at 18 multiprocessors each
+}
+
+// TestB200MIGInstanceManagement walks the create/destroy lifecycle: a 2-slice
+// GPU instance, a 2-slice compute instance inside it, then destruction of the
+// compute instance followed by the GPU instance.
+func TestB200MIGInstanceManagement(t *testing.T) {
+	device := NewDevice(0)
+
+	// Test creating and destroying instances
+	profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_2_SLICE)
+	require.Equal(t, nvml.SUCCESS, ret)
+
+	// Create GPU instance
+	gi, ret := device.CreateGpuInstance(&profileInfo)
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.NotNil(t, gi)
+
+	// Create compute instance
+	ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo(nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED)
+	require.Equal(t, nvml.SUCCESS, ret)
+
+	ci, ret := gi.CreateComputeInstance(&ciProfileInfo)
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.NotNil(t, ci)
+
+	// Verify compute instance info
+	ciInfo, ret := ci.GetInfo()
+	require.Equal(t, nvml.SUCCESS, ret)
+	require.Equal(t, uint32(nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE), ciInfo.ProfileId)
+
+	// Test destruction (compute instance first, then its parent GPU instance)
+	ret = ci.Destroy()
+	require.Equal(t, nvml.SUCCESS, ret)
+
+	ret = gi.Destroy()
+	require.Equal(t, nvml.SUCCESS, ret)
+}
diff --git a/pkg/nvml/mock/shared/gpus/b200.go b/pkg/nvml/mock/shared/gpus/b200.go
new file mode 100644
index 0000000..6ca30a8
--- /dev/null
+++ b/pkg/nvml/mock/shared/gpus/b200.go
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package gpus + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" +) + +// B200 GPU Variants +var ( + B200_SXM5_180GB = shared.Config{ + Name: "NVIDIA B200 180GB HBM3e", + Architecture: nvml.DEVICE_ARCH_BLACKWELL, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 184320, // 180GB + CudaMajor: 10, + CudaMinor: 0, + PciDeviceId: 0x2B0010DE, + MIGProfiles: b200_180gb_MIGProfiles, + } +) + +var ( + b200_180gb_MIGProfiles = shared.MIGProfileConfig{ + GpuInstanceProfiles: b200_180gb_GpuInstanceProfiles, + ComputeInstanceProfiles: b200_ComputeInstanceProfiles, + GpuInstancePlacements: b200_GpuInstancePlacements, + ComputeInstancePlacements: b200_ComputeInstancePlacements, + } +) + +var ( + b200_180gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 18, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 23552, // 23GB (MIG 1g.23gb) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 18, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 1, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 23552, // 23GB (MIG 1g.23gb+me) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 18, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 46080, // 45GB (MIG 1g.45gb) + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 1, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 36, + CopyEngineCount: 2, + DecoderCount: 2, + EncoderCount: 1, + JpegCount: 1, + 
OfaCount: 1, + MemorySizeMB: 46080, // 45GB (MIG 2g.45gb) + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 1, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 54, + CopyEngineCount: 3, + DecoderCount: 3, + EncoderCount: 2, + JpegCount: 2, + OfaCount: 2, + MemorySizeMB: 92160, // 90GB (MIG 3g.90gb) + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 1, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 72, + CopyEngineCount: 4, + DecoderCount: 4, + EncoderCount: 2, + JpegCount: 2, + OfaCount: 2, + MemorySizeMB: 92160, // 90GB (MIG 4g.90gb) + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 1, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 126, + CopyEngineCount: 7, + DecoderCount: 7, + EncoderCount: 4, + JpegCount: 4, + OfaCount: 4, + MemorySizeMB: 184320, // 180GB (MIG 7g.180gb) + }, + } +) + +var b200_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 18, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 18, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 18, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 18, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + 
InstanceCount: 1, + MultiprocessorCount: 36, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 18, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 36, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 54, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 18, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 36, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 72, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 18, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 36, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 54, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 126, + }, + }, +} + +var b200_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 
5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, +} + +var b200_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{ + 0: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + 1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + 2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + }, + }, + 3: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + }, + 4: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + 
{Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, + }, +} From 30e5daec23adaff8318ce11a49a88d53385d7915 Mon Sep 17 00:00:00 2001 From: Fabien Dupont Date: Wed, 24 Sep 2025 17:01:44 +0200 Subject: [PATCH 5/5] docs: Update README.md with comprehensive H100, H200, and B200 documentation Updates the mock framework documentation to include all GPU generations: - Add H100, H200, and B200 to architecture diagram and file structure - Document all GPU specifications: * H100 SXM5 80GB (Hopper, 132 SMs, CUDA 9.0, P2P MIG support) * H200 SXM5 141GB (Hopper, 132 SMs, CUDA 9.0, P2P MIG support) * B200 SXM5 180GB (Blackwell, 144 SMs, CUDA 10.0, P2P MIG support) - Add complete server model documentation with driver versions - Expand MIG support section with detailed SM allocations and P2P capabilities - Update usage examples to include all four GPU generations - Add comprehensive testing instructions for all mock implementations - Update backward compatibility section to reflect all generations The documentation now accurately reflects the complete shared factory implementation with comprehensive coverage of Ampere, Hopper, and Blackwell GPU architectures. 
Signed-off-by: Fabien Dupont --- pkg/nvml/mock/README.md | 194 +++++++++++++++++++++++++------ pkg/nvml/mock/dgxa100/dgxa100.go | 2 +- 2 files changed, 161 insertions(+), 35 deletions(-) diff --git a/pkg/nvml/mock/README.md b/pkg/nvml/mock/README.md index 46ad4f7..50f00cc 100644 --- a/pkg/nvml/mock/README.md +++ b/pkg/nvml/mock/README.md @@ -9,18 +9,30 @@ pkg/nvml/mock/ ├── shared/ │ ├── shared.go # Core shared factory and types │ └── gpus/ # GPU configuration definitions -│ ├── a100.go # A100 GPU variants -│ └── a30.go # A30 GPU variants -└── dgxa100/ # DGX A100 implementation - ├── dgxa100.go # Server and device implementation - ├── gpus.go # Legacy A100 configurations and MIG profiles - └── dgxa100_test.go # Comprehensive tests +│ ├── a100.go # A100 GPU variants (Ampere) +│ ├── a30.go # A30 GPU variants (Ampere) +│ ├── h100.go # H100 GPU variants (Hopper) +│ ├── h200.go # H200 GPU variants (Hopper) +│ └── b200.go # B200 GPU variants (Blackwell) +├── dgxa100/ # DGX A100 implementation +│ ├── dgxa100.go # Server and device implementation +│ └── dgxa100_test.go # Comprehensive tests +├── dgxh100/ # DGX H100 implementation +│ ├── dgxh100.go # Server and device implementation +│ └── dgxh100_test.go # Comprehensive tests +├── dgxh200/ # DGX H200 implementation +│ ├── dgxh200.go # Server and device implementation +│ └── dgxh200_test.go # Comprehensive tests +└── dgxb200/ # DGX B200 implementation + ├── dgxb200.go # Server and device implementation + └── dgxb200_test.go # Comprehensive tests ``` ## Core Concepts ### Shared Factory (`shared.Config`) Define the characteristics of individual GPU models including: + - Device properties (name, architecture, brand, PCI device ID) - Compute capabilities (CUDA version, compute capability) - Memory configuration @@ -28,11 +40,13 @@ Define the characteristics of individual GPU models including: ### Server Configuration (`shared.ServerConfig`) Define complete system configurations including: + - GPU configuration and count - 
Driver, NVML, and CUDA versions ### MIG Profile Configuration (`shared.MIGProfileConfig`) Define Multi-Instance GPU capabilities including: + - GPU instance profiles (slice configurations) - Compute instance profiles - Placement constraints and possibilities @@ -44,26 +58,37 @@ Define Multi-Instance GPU capabilities including: ```go import ( "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh100" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh200" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxb200" "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus" ) -// Create default A100 system +// Create default systems serverA100 := dgxa100.New() // A100-SXM4-40GB (8 GPUs) +serverH100 := dgxh100.New() // H100-SXM5-80GB (8 GPUs) +serverH200 := dgxh200.New() // H200-SXM5-141GB (8 GPUs) +serverB200 := dgxb200.New() // B200-SXM5-180GB (8 GPUs) -// Create specific A100 variants +// Create specific variants serverA100_80GB := dgxa100.NewServerWithGPU(gpus.A100_SXM4_80GB) -serverA100_PCIE := dgxa100.NewServerWithGPU(gpus.A100_PCIE_40GB) +serverH200_Custom := dgxh200.NewServerWithGPU(gpus.H200_SXM5_141GB) +serverB200_Custom := dgxb200.NewServerWithGPU(gpus.B200_SXM5_180GB) ``` ### Device Creation ```go -// Create device with default configuration -device := dgxa100.NewDevice(0) +// Create devices with default configurations +deviceA100 := dgxa100.NewDevice(0) +deviceH100 := dgxh100.NewDevice(0) +deviceH200 := dgxh200.NewDevice(0) +deviceB200 := dgxb200.NewDevice(0) -// Create device with specific GPU variant +// Create devices with specific GPU variants deviceA100_80GB := dgxa100.NewDeviceWithGPU(gpus.A100_SXM4_80GB, 0) -deviceA100_PCIE := dgxa100.NewDeviceWithGPU(gpus.A100_PCIE_40GB, 1) +deviceH200_Custom := dgxh200.NewDeviceWithGPU(gpus.H200_SXM5_141GB, 1) +deviceB200_Custom := dgxb200.NewDeviceWithGPU(gpus.B200_SXM5_180GB, 2) ``` ### Accessing GPU Configurations @@ -79,11 +104,30 @@ gpus.A100_PCIE_80GB // A100 PCIe 80GB // A30 Family 
gpus.A30_PCIE_24GB // A30 PCIe 24GB +// H100 Family +gpus.H100_SXM5_80GB // H100 SXM5 80GB + +// H200 Family +gpus.H200_SXM5_141GB // H200 SXM5 141GB + +// B200 Family +gpus.B200_SXM5_180GB // B200 SXM5 180GB + // Inspect configurations fmt.Printf("GPU: %s\n", gpus.A100_SXM4_80GB.Name) fmt.Printf("Memory: %d MB\n", gpus.A100_SXM4_80GB.MemoryMB) fmt.Printf("Architecture: %v\n", gpus.A100_SXM4_80GB.Architecture) fmt.Printf("PCI Device ID: 0x%X\n", gpus.A100_SXM4_80GB.PciDeviceId) + +// Inspect H100 configuration +fmt.Printf("GPU: %s\n", gpus.H100_SXM5_80GB.Name) +fmt.Printf("Memory: %d MB\n", gpus.H100_SXM5_80GB.MemoryMB) +fmt.Printf("CUDA Major: %d\n", gpus.H100_SXM5_80GB.CudaMajor) + +// Inspect B200 configuration +fmt.Printf("GPU: %s\n", gpus.B200_SXM5_180GB.Name) +fmt.Printf("Memory: %d MB\n", gpus.B200_SXM5_180GB.MemoryMB) +fmt.Printf("CUDA Major: %d\n", gpus.B200_SXM5_180GB.CudaMajor) ``` ## Available GPU Models @@ -127,6 +171,39 @@ fmt.Printf("PCI Device ID: 0x%X\n", gpus.A100_SXM4_80GB.PciDeviceId) - MIG P2P: Not supported (`IsP2pSupported: 0`) - MIG slices: 1, 2, 4 (no 3-slice or 7-slice support) +### H100 Family (Hopper Architecture, 132 SMs) + +- **H100 SXM5 80GB** (`gpus.H100_SXM5_80GB`) + - Form factor: SXM5 + - Memory: 80GB HBM3 + - PCI Device ID: 0x233010DE + - CUDA Capability: 9.0 + - SMs per slice: 16 (1-slice), 32 (2-slice), 48 (3-slice), 64 (4-slice), 112 (7-slice) + - MIG P2P: Supported (`IsP2pSupported: 1`) + - Includes REV1 (media extensions) and REV2 (expanded memory) profiles + +### H200 Family (Hopper Architecture, 132 SMs) + +- **H200 SXM5 141GB** (`gpus.H200_SXM5_141GB`) + - Form factor: SXM5 + - Memory: 141GB HBM3e + - PCI Device ID: 0x233310DE + - CUDA Capability: 9.0 + - SMs per slice: 16 (1-slice), 32 (2-slice), 48 (3-slice), 64 (4-slice), 112 (7-slice) + - MIG P2P: Supported (`IsP2pSupported: 1`) + - Includes REV1 (media extensions) and REV2 (expanded memory) profiles + +### B200 Family (Blackwell Architecture, 144 SMs) + +- **B200 
SXM5 180GB** (`gpus.B200_SXM5_180GB`) + - Form factor: SXM5 + - Memory: 180GB HBM3e + - PCI Device ID: 0x2B0010DE + - CUDA Capability: 10.0 + - SMs per slice: 18 (1-slice), 36 (2-slice), 54 (3-slice), 72 (4-slice), 126 (7-slice) + - MIG P2P: Supported (`IsP2pSupported: 1`) + - Includes REV1 (media extensions) and REV2 (expanded memory) profiles + ## Available Server Models ### DGX A100 Family @@ -137,6 +214,30 @@ fmt.Printf("PCI Device ID: 0x%X\n", gpus.A100_SXM4_80GB.PciDeviceId) - NVML: 12.550.54.15 - CUDA: 12040 +### DGX H100 Family + +- **DGX H100 80GB** (default) + - 8x H100 SXM5 80GB GPUs + - Driver: 550.54.15 + - NVML: 12.550.54.15 + - CUDA: 12040 + +### DGX H200 Family + +- **DGX H200 141GB** (default) + - 8x H200 SXM5 141GB GPUs + - Driver: 550.54.15 + - NVML: 12.550.54.15 + - CUDA: 12040 + +### DGX B200 Family + +- **DGX B200 180GB** (default) + - 8x B200 SXM5 180GB GPUs + - Driver: 560.28.03 + - NVML: 12.560.28.03 + - CUDA: 12060 + ## MIG (Multi-Instance GPU) Support All GPU configurations include comprehensive MIG profile definitions: @@ -144,9 +245,25 @@ All GPU configurations include comprehensive MIG profile definitions: - **A100**: No P2P support in MIG (`IsP2pSupported: 0`) - Memory profiles differ between 40GB and 80GB variants - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices) + - 108 SMs total with 14 SMs per slice - **A30**: No P2P support in MIG (`IsP2pSupported: 0`) - Supports limited MIG slice configurations (1, 2, 4 slices only) - 56 SMs total with 14 SMs per slice +- **H100**: Full P2P support in MIG (`IsP2pSupported: 1`) + - 80GB HBM3 memory with optimized slice allocations + - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices) + - 132 SMs total with 16 SMs per slice + - Includes REV1 (media extensions) and REV2 (expanded memory) profiles +- **H200**: Full P2P support in MIG (`IsP2pSupported: 1`) + - 141GB HBM3e memory with enhanced capacity + - Supports standard NVIDIA MIG slice 
configurations (1, 2, 3, 4, 7 slices) + - 132 SMs total with 16 SMs per slice + - Includes REV1 (media extensions) and REV2 (expanded memory) profiles +- **B200**: Full P2P support in MIG (`IsP2pSupported: 1`) + - 180GB HBM3e memory with next-generation capacity + - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices) + - 144 SMs total with 18 SMs per slice + - Includes REV1 (media extensions) and REV2 (expanded memory) profiles ### MIG Operations @@ -175,6 +292,7 @@ ci, ret := gi.CreateComputeInstance(&ciProfileInfo) ## Testing The framework includes comprehensive tests covering: + - Server creation and device enumeration - Device properties and capabilities - MIG mode operations and lifecycle @@ -186,11 +304,15 @@ The framework includes comprehensive tests covering: # Run all mock tests go test ./pkg/nvml/mock/... -# Run A100 specific tests +# Run generation specific tests go test -v ./pkg/nvml/mock/dgxa100/ +go test -v ./pkg/nvml/mock/dgxh100/ +go test -v ./pkg/nvml/mock/dgxh200/ +go test -v ./pkg/nvml/mock/dgxb200/ # Run specific test go test -v ./pkg/nvml/mock/dgxa100/ -run TestMIGProfilesExist +go test -v ./pkg/nvml/mock/dgxh100/ -run TestMIGProfilesExist ``` ## Extending the Framework @@ -198,6 +320,7 @@ go test -v ./pkg/nvml/mock/dgxa100/ -run TestMIGProfilesExist ### Adding GPU Variants Add new configurations to the appropriate file in `shared/gpus/`: + ```go var A100_PCIE_24GB = shared.Config{ Name: "NVIDIA A100-PCIE-24GB", @@ -213,34 +336,35 @@ var A100_PCIE_24GB = shared.Config{ ### Adding GPU Generations -1. **Create new package** (e.g., `dgxa100/`) -2. **Define GPU configurations** in `shared/gpus/a100.go` +1. **Create new package** (e.g., `dgxb200/`) +2. **Define GPU configurations** in `shared/gpus/b200.go` 3. **Define MIG profiles** with appropriate memory and SM allocations 4. **Implement server and device factory functions** 5. 
**Add comprehensive tests** -Example structure for A100 generation: +Example structure for B200 generation: + ```go -// In shared/gpus/a100.go -var A100_SXM4_80GB = shared.Config{ - Name: "NVIDIA A100 SXM4 80GB", - Architecture: nvml.DEVICE_ARCH_AMPERE, +// In shared/gpus/b200.go +var B200_SXM5_180GB = shared.Config{ + Name: "NVIDIA B200 180GB HBM3e", + Architecture: nvml.DEVICE_ARCH_BLACKWELL, Brand: nvml.BRAND_NVIDIA, - MemoryMB: 81920, - CudaMajor: 8, + MemoryMB: 184320, // 180GB + CudaMajor: 10, CudaMinor: 0, - PciDeviceId: 0x20B210DE, - MIGProfiles: a100MIGProfiles, + PciDeviceId: 0x2B0010DE, + MIGProfiles: b200_180gb_MIGProfiles, } -// In dgxa100/dgxa100.go +// In dgxb200/dgxb200.go func New() *Server { return shared.NewServerFromConfig(shared.ServerConfig{ - Config: gpus.A100_SXM4_80GB, - GPUCount: 4, - DriverVersion: "550.54.15", - NvmlVersion: "12.550.54.15", - CudaDriverVersion: 12040, + Config: gpus.B200_SXM5_180GB, + GPUCount: 8, + DriverVersion: "560.28.03", + NvmlVersion: "12.560.28.03", + CudaDriverVersion: 12060, }) } ``` @@ -248,11 +372,13 @@ func New() *Server { ## Backward Compatibility The framework maintains full backward compatibility: -- All existing `dgxa100.New()` calls continue to work unchanged -- Legacy global variables (`MIGProfiles`, `MIGPlacements`) are preserved + +- All existing `dgxa100.New()`, `dgxh100.New()`, `dgxh200.New()`, `dgxb200.New()` calls continue to work unchanged +- Legacy global variables (`MIGProfiles`, `MIGPlacements`) are preserved for all generations - Device names maintain "Mock" prefix for test compatibility - All existing tests pass without modification -- A100 configurations now reference `shared/gpus` package +- All GPU configurations reference `shared/gpus` package for consistency +- Type aliases ensure seamless transition from generation-specific types ## Performance Considerations diff --git a/pkg/nvml/mock/dgxa100/dgxa100.go b/pkg/nvml/mock/dgxa100/dgxa100.go index fc2384d..896cf35 100644 --- 
a/pkg/nvml/mock/dgxa100/dgxa100.go +++ b/pkg/nvml/mock/dgxa100/dgxa100.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.