diff --git a/pkg/nvml/mock/README.md b/pkg/nvml/mock/README.md new file mode 100644 index 0000000..50f00cc --- /dev/null +++ b/pkg/nvml/mock/README.md @@ -0,0 +1,396 @@
+# NVML Mock Framework
+
+This package provides mock implementations of NVIDIA's NVML (NVIDIA Management Library) for testing and development purposes. The framework uses a shared factory system to define GPU configurations that can be easily extended and customized.
+
+## Architecture
+
+```
+pkg/nvml/mock/
+├── shared/
+│   ├── shared.go          # Core shared factory and types
+│   └── gpus/              # GPU configuration definitions
+│       ├── a100.go        # A100 GPU variants (Ampere)
+│       ├── a30.go         # A30 GPU variants (Ampere)
+│       ├── h100.go        # H100 GPU variants (Hopper)
+│       ├── h200.go        # H200 GPU variants (Hopper)
+│       └── b200.go        # B200 GPU variants (Blackwell)
+├── dgxa100/               # DGX A100 implementation
+│   ├── dgxa100.go         # Server and device implementation
+│   └── dgxa100_test.go    # Comprehensive tests
+├── dgxh100/               # DGX H100 implementation
+│   ├── dgxh100.go         # Server and device implementation
+│   └── dgxh100_test.go    # Comprehensive tests
+├── dgxh200/               # DGX H200 implementation
+│   ├── dgxh200.go         # Server and device implementation
+│   └── dgxh200_test.go    # Comprehensive tests
+└── dgxb200/               # DGX B200 implementation
+    ├── dgxb200.go         # Server and device implementation
+    └── dgxb200_test.go    # Comprehensive tests
+```
+
+## Core Concepts
+
+### Shared Factory (`shared.Config`)
+Defines the characteristics of an individual GPU model, including:
+
+- Device properties (name, architecture, brand, PCI device ID)
+- Compute capabilities (CUDA version, compute capability)
+- Memory configuration
+- MIG (Multi-Instance GPU) profiles and placements
+
+### Server Configuration (`shared.ServerConfig`)
+Defines a complete system configuration, including:
+
+- GPU configuration and count
+- Driver, NVML, and CUDA versions
+
+### MIG Profile Configuration (`shared.MIGProfileConfig`)
+Defines Multi-Instance GPU capabilities, including:
+
+- GPU instance profiles (slice configurations)
+- Compute instance profiles
+- Placement constraints and possibilities
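+
+A server is assembled from a `shared.ServerConfig` that pairs a GPU definition with system-level versions. As a minimal sketch of building one directly (the field names match those used by the per-generation packages below; the version strings are illustrative and mirror the DGX A100 defaults):
+
+```go
+cfg := shared.ServerConfig{
+    Config:            gpus.A100_SXM4_40GB, // GPU model to replicate
+    GPUCount:          8,                   // number of mock devices
+    DriverVersion:     "550.54.15",
+    NvmlVersion:       "12.550.54.15",
+    CudaDriverVersion: 12040,
+}
+server := shared.NewServerFromConfig(cfg)
+```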
+
+## Usage Examples
+
+### Basic Usage
+
+```go
+import (
+    "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
+    "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh100"
+    "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh200"
+    "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxb200"
+    "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus"
+)
+
+// Create default systems
+serverA100 := dgxa100.New() // A100-SXM4-40GB (8 GPUs)
+serverH100 := dgxh100.New() // H100-SXM5-80GB (8 GPUs)
+serverH200 := dgxh200.New() // H200-SXM5-141GB (8 GPUs)
+serverB200 := dgxb200.New() // B200-SXM5-180GB (8 GPUs)
+
+// Create specific variants
+serverA100_80GB := dgxa100.NewServerWithGPU(gpus.A100_SXM4_80GB)
+serverH200_Custom := dgxh200.NewServerWithGPU(gpus.H200_SXM5_141GB)
+serverB200_Custom := dgxb200.NewServerWithGPU(gpus.B200_SXM5_180GB)
+```
+
+### Device Creation
+
+```go
+// Create devices with default configurations
+deviceA100 := dgxa100.NewDevice(0)
+deviceH100 := dgxh100.NewDevice(0)
+deviceH200 := dgxh200.NewDevice(0)
+deviceB200 := dgxb200.NewDevice(0)
+
+// Create devices with specific GPU variants
+deviceA100_80GB := dgxa100.NewDeviceWithGPU(gpus.A100_SXM4_80GB, 0)
+deviceH200_Custom := dgxh200.NewDeviceWithGPU(gpus.H200_SXM5_141GB, 1)
+deviceB200_Custom := dgxb200.NewDeviceWithGPU(gpus.B200_SXM5_180GB, 2)
+```
+
+### Accessing GPU Configurations
+
+```go
+// Available GPU configurations
+// A100 Family
+gpus.A100_SXM4_40GB  // A100 SXM4 40GB
+gpus.A100_SXM4_80GB  // A100 SXM4 80GB
+gpus.A100_PCIE_40GB  // A100 PCIe 40GB
+gpus.A100_PCIE_80GB  // A100 PCIe 80GB
+
+// A30 Family
+gpus.A30_PCIE_24GB   // A30 PCIe 24GB
+
+// H100 Family
+gpus.H100_SXM5_80GB  // H100 SXM5 80GB
+
+// H200 Family
+gpus.H200_SXM5_141GB // H200 SXM5 141GB
+
+// B200 Family
+gpus.B200_SXM5_180GB // B200 SXM5 180GB
+
+// Inspect configurations
+fmt.Printf("GPU: %s\n", gpus.A100_SXM4_80GB.Name)
+fmt.Printf("Memory: %d MB\n", gpus.A100_SXM4_80GB.MemoryMB)
+fmt.Printf("Architecture: %v\n", gpus.A100_SXM4_80GB.Architecture)
+fmt.Printf("PCI Device ID: 0x%X\n", gpus.A100_SXM4_80GB.PciDeviceId)
+
+// Inspect H100 configuration
+fmt.Printf("GPU: %s\n", gpus.H100_SXM5_80GB.Name)
+fmt.Printf("Memory: %d MB\n", gpus.H100_SXM5_80GB.MemoryMB)
+fmt.Printf("CUDA Major: %d\n", gpus.H100_SXM5_80GB.CudaMajor)
+
+// Inspect B200 configuration
+fmt.Printf("GPU: %s\n", gpus.B200_SXM5_180GB.Name)
+fmt.Printf("Memory: %d MB\n", gpus.B200_SXM5_180GB.MemoryMB)
+fmt.Printf("CUDA Major: %d\n", gpus.B200_SXM5_180GB.CudaMajor)
+```
+
+## Available GPU Models
+
+### A100 Family (Ampere Architecture, 108 SMs)
+
+- **A100 SXM4 40GB** (`gpus.A100_SXM4_40GB`)
+  - Form factor: SXM4
+  - Memory: 40GB HBM2
+  - PCI Device ID: 0x20B010DE
+  - CUDA Capability: 8.0
+  - SMs per instance: 14 (1-slice), 28 (2-slice), 42 (3-slice), 56 (4-slice), 98 (7-slice)
+  - MIG P2P: Not supported (`IsP2pSupported: 0`)
+
+- **A100 SXM4 80GB** (`gpus.A100_SXM4_80GB`)
+  - Form factor: SXM4
+  - Memory: 80GB HBM2e
+  - PCI Device ID: 0x20B210DE
+  - CUDA Capability: 8.0
+
+- **A100 PCIe 40GB** (`gpus.A100_PCIE_40GB`)
+  - Form factor: PCIe
+  - Memory: 40GB HBM2
+  - PCI Device ID: 0x20F110DE
+  - CUDA Capability: 8.0
+
+- **A100 PCIe 80GB** (`gpus.A100_PCIE_80GB`)
+  - Form factor: PCIe
+  - Memory: 80GB HBM2e
+  - PCI Device ID: 0x20B510DE
+  - CUDA Capability: 8.0
+
+### A30 Family (Ampere Architecture, 56 SMs)
+
+- **A30 PCIe 24GB** (`gpus.A30_PCIE_24GB`)
+  - Form factor: PCIe
+  - Memory: 24GB HBM2
+  - PCI Device ID: 0x20B710DE
+  - CUDA Capability: 8.0
+  - SMs per instance: 14 (1-slice), 28 (2-slice), 56 (4-slice)
+  - MIG P2P: Not supported (`IsP2pSupported: 0`)
+  - MIG slices: 1, 2, 4 (no 3-slice or 7-slice support)
+
+### H100 Family (Hopper Architecture, 132 SMs)
+
+- **H100 SXM5 80GB** (`gpus.H100_SXM5_80GB`)
+  - Form factor: SXM5
+  - Memory: 80GB HBM3
+  - PCI Device ID: 0x233010DE
+  - CUDA Capability: 9.0
+  - SMs per instance: 16 (1-slice), 32 (2-slice), 48 (3-slice), 64 (4-slice), 112 (7-slice)
+  - MIG P2P: Supported (`IsP2pSupported: 1`)
+  - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+
+### H200 Family (Hopper Architecture, 132 SMs)
+
+- **H200 SXM5 141GB** (`gpus.H200_SXM5_141GB`)
+  - Form factor: SXM5
+  - Memory: 141GB HBM3e
+  - PCI Device ID: 0x233310DE
+  - CUDA Capability: 9.0
+  - SMs per instance: 16 (1-slice), 32 (2-slice), 48 (3-slice), 64 (4-slice), 112 (7-slice)
+  - MIG P2P: Supported (`IsP2pSupported: 1`)
+  - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+
+### B200 Family (Blackwell Architecture, 144 SMs)
+
+- **B200 SXM5 180GB** (`gpus.B200_SXM5_180GB`)
+  - Form factor: SXM5
+  - Memory: 180GB HBM3e
+  - PCI Device ID: 0x2B0010DE
+  - CUDA Capability: 10.0
+  - SMs per instance: 18 (1-slice), 36 (2-slice), 54 (3-slice), 72 (4-slice), 126 (7-slice)
+  - MIG P2P: Supported (`IsP2pSupported: 1`)
+  - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
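+
+The per-model MIG tables listed above can also be read directly off each config. A brief sketch of looking up one profile (the `MIGProfiles.GpuInstanceProfiles` map layout matches the legacy globals exposed by each generation package):
+
+```go
+profiles := gpus.A100_SXM4_40GB.MIGProfiles.GpuInstanceProfiles
+info := profiles[nvml.GPU_INSTANCE_PROFILE_1_SLICE]
+// e.g. 4864 MB and 14 SMs for the 1g.5gb profile on the 40GB A100
+fmt.Printf("1-slice: %d MB, %d SMs\n", info.MemorySizeMB, info.MultiprocessorCount)
+```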
+
+## Available Server Models
+
+### DGX A100 Family
+
+- **DGX A100 40GB** (default)
+  - 8x A100 SXM4 40GB GPUs
+  - Driver: 550.54.15
+  - NVML: 12.550.54.15
+  - CUDA: 12040
+
+### DGX H100 Family
+
+- **DGX H100 80GB** (default)
+  - 8x H100 SXM5 80GB GPUs
+  - Driver: 550.54.15
+  - NVML: 12.550.54.15
+  - CUDA: 12040
+
+### DGX H200 Family
+
+- **DGX H200 141GB** (default)
+  - 8x H200 SXM5 141GB GPUs
+  - Driver: 550.54.15
+  - NVML: 12.550.54.15
+  - CUDA: 12040
+
+### DGX B200 Family
+
+- **DGX B200 180GB** (default)
+  - 8x B200 SXM5 180GB GPUs
+  - Driver: 560.28.03
+  - NVML: 12.560.28.03
+  - CUDA: 12060
+
+## MIG (Multi-Instance GPU) Support
+
+All GPU configurations include comprehensive MIG profile definitions:
+
+- **A100**: No P2P support in MIG (`IsP2pSupported: 0`)
+  - Memory profiles differ between 40GB and 80GB variants
+  - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices)
+  - 108 SMs total with 14 SMs per slice
+- **A30**: No P2P support in MIG (`IsP2pSupported: 0`)
+  - Supports limited MIG slice configurations (1, 2, 4 slices only)
+  - 56 SMs total with 14 SMs per slice
+- **H100**: Full P2P support in MIG (`IsP2pSupported: 1`)
+  - 80GB HBM3 memory with optimized slice allocations
+  - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices)
+  - 132 SMs total with 16 SMs per slice
+  - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+- **H200**: Full P2P support in MIG (`IsP2pSupported: 1`)
+  - 141GB HBM3e memory with enhanced capacity
+  - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices)
+  - 132 SMs total with 16 SMs per slice
+  - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+- **B200**: Full P2P support in MIG (`IsP2pSupported: 1`)
+  - 180GB HBM3e memory with next-generation capacity
+  - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices)
+  - 144 SMs total with 18 SMs per slice
+  - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+
+### MIG Operations
+
+```go
+// Create server with MIG support
+server := dgxa100.New()
+device, _ := server.DeviceGetHandleByIndex(0)
+
+// Enable MIG mode
+device.SetMigMode(1)
+
+// Get available GPU instance profiles
+profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE)
+if ret != nvml.SUCCESS {
+    // handle error
+}
+
+// Create GPU instance
+gi, ret := device.CreateGpuInstance(&profileInfo)
+
+// Create compute instance within GPU instance
+ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo(
+    nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+    nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED,
+)
+ci, ret := gi.CreateComputeInstance(&ciProfileInfo)
+```
+
+## Testing
+
+The framework includes comprehensive tests covering:
+
+- Server creation and device enumeration
+- Device properties and capabilities
+- MIG mode operations and lifecycle
+- GPU and compute instance management
+- Memory and PCI information
+- Multi-device scenarios
+
+```bash
+# Run all mock tests
+go test ./pkg/nvml/mock/...
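+
+# Run all mock tests with coverage (standard go tooling)
+go test -cover ./pkg/nvml/mock/...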
+ +# Run generation specific tests +go test -v ./pkg/nvml/mock/dgxa100/ +go test -v ./pkg/nvml/mock/dgxh100/ +go test -v ./pkg/nvml/mock/dgxh200/ +go test -v ./pkg/nvml/mock/dgxb200/ + +# Run specific test +go test -v ./pkg/nvml/mock/dgxa100/ -run TestMIGProfilesExist +go test -v ./pkg/nvml/mock/dgxh100/ -run TestMIGProfilesExist +``` + +## Extending the Framework + +### Adding GPU Variants + +Add new configurations to the appropriate file in `shared/gpus/`: + +```go +var A100_PCIE_24GB = shared.Config{ + Name: "NVIDIA A100-PCIE-24GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 24576, // 24GB + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20F010DE, + MIGProfiles: a100_24gb_MIGProfiles, +} +``` + +### Adding GPU Generations + +1. **Create new package** (e.g., `dgxb200/`) +2. **Define GPU configurations** in `shared/gpus/b200.go` +3. **Define MIG profiles** with appropriate memory and SM allocations +4. **Implement server and device factory functions** +5. **Add comprehensive tests** + +Example structure for B200 generation: + +```go +// In shared/gpus/b200.go +var B200_SXM5_180GB = shared.Config{ + Name: "NVIDIA B200 180GB HBM3e", + Architecture: nvml.DEVICE_ARCH_BLACKWELL, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 184320, // 180GB + CudaMajor: 10, + CudaMinor: 0, + PciDeviceId: 0x2B0010DE, + MIGProfiles: b200_180gb_MIGProfiles, +} + +// In dgxb200/dgxb200.go +func New() *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpus.B200_SXM5_180GB, + GPUCount: 8, + DriverVersion: "560.28.03", + NvmlVersion: "12.560.28.03", + CudaDriverVersion: 12060, + }) +} +``` + +## Backward Compatibility + +The framework maintains full backward compatibility: + +- All existing `dgxa100.New()`, `dgxh100.New()`, `dgxh200.New()`, `dgxb200.New()` calls continue to work unchanged +- Legacy global variables (`MIGProfiles`, `MIGPlacements`) are preserved for all generations +- Device names maintain "Mock" prefix for test compatibility +- All existing tests pass without modification +- All GPU configurations reference `shared/gpus` package for consistency +- Type aliases ensure seamless transition from generation-specific types + +## Performance Considerations + +- Configurations are defined as static variables (no runtime overhead) +- Device creation uses shared factory (fast) +- MIG profiles are shared between devices of the same type +- Mock functions use direct field access (minimal latency) + +## Implementation Notes + +- **Thread Safety**: Device implementations include proper mutex usage +- **Memory Management**: No memory leaks in device/instance lifecycle +- **Error Handling**: Proper NVML return codes for all operations +- **Standards Compliance**: Follows official NVML API patterns and behaviors +- **Separation of Concerns**: GPU configs in `shared/gpus`, server logic in package-specific files diff --git a/pkg/nvml/mock/dgxa100/dgxa100.go b/pkg/nvml/mock/dgxa100/dgxa100.go index af65037..896cf35 100644 --- a/pkg/nvml/mock/dgxa100/dgxa100.go +++ b/pkg/nvml/mock/dgxa100/dgxa100.go @@ -17,365 +17,63 @@ package dgxa100 import ( - "fmt" - "sync" - - "github.com/google/uuid" - "github.com/NVIDIA/go-nvml/pkg/nvml" - "github.com/NVIDIA/go-nvml/pkg/nvml/mock" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus" ) -type Server struct { - mock.Interface - mock.ExtendedInterface - Devices [8]nvml.Device - DriverVersion string - NvmlVersion string - CudaDriverVersion int -} -type Device struct { - 
mock.Device - sync.RWMutex - UUID string - Name string - Brand nvml.BrandType - Architecture nvml.DeviceArchitecture - PciBusID string - Minor int - Index int - CudaComputeCapability CudaComputeCapability - MigMode int - GpuInstances map[*GpuInstance]struct{} - GpuInstanceCounter uint32 - MemoryInfo nvml.Memory -} - -type GpuInstance struct { - mock.GpuInstance - sync.RWMutex - Info nvml.GpuInstanceInfo - ComputeInstances map[*ComputeInstance]struct{} - ComputeInstanceCounter uint32 -} - -type ComputeInstance struct { - mock.ComputeInstance - Info nvml.ComputeInstanceInfo -} - -type CudaComputeCapability struct { - Major int - Minor int -} - -var _ nvml.Interface = (*Server)(nil) -var _ nvml.Device = (*Device)(nil) -var _ nvml.GpuInstance = (*GpuInstance)(nil) -var _ nvml.ComputeInstance = (*ComputeInstance)(nil) +// Backwards compatible type aliases +type Server = shared.Server +type Device = shared.Device +type GpuInstance = shared.GpuInstance +type ComputeInstance = shared.ComputeInstance +type CudaComputeCapability = shared.CudaComputeCapability func New() *Server { - server := &Server{ - Devices: [8]nvml.Device{ - NewDevice(0), - NewDevice(1), - NewDevice(2), - NewDevice(3), - NewDevice(4), - NewDevice(5), - NewDevice(6), - NewDevice(7), - }, + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpus.A100_SXM4_40GB, + GPUCount: 8, DriverVersion: "550.54.15", NvmlVersion: "12.550.54.15", CudaDriverVersion: 12040, - } - server.setMockFuncs() - return server + }) } func NewDevice(index int) *Device { - device := &Device{ - UUID: "GPU-" + uuid.New().String(), - Name: "Mock NVIDIA A100-SXM4-40GB", - Brand: nvml.BRAND_NVIDIA, - Architecture: nvml.DEVICE_ARCH_AMPERE, - PciBusID: fmt.Sprintf("0000:%02x:00.0", index), - Minor: index, - Index: index, - CudaComputeCapability: CudaComputeCapability{ - Major: 8, - Minor: 0, - }, - GpuInstances: make(map[*GpuInstance]struct{}), - GpuInstanceCounter: 0, - MemoryInfo: nvml.Memory{Total: 42949672960, Free: 0, Used: 0}, - } - device.setMockFuncs() - return device -} - -func NewGpuInstance(info nvml.GpuInstanceInfo) *GpuInstance { - gi := &GpuInstance{ - Info: info, - ComputeInstances: make(map[*ComputeInstance]struct{}), - ComputeInstanceCounter: 0, - } - gi.setMockFuncs() - return gi -} - -func NewComputeInstance(info nvml.ComputeInstanceInfo) *ComputeInstance { - ci := &ComputeInstance{ - Info: info, - } - ci.setMockFuncs() - return ci -} - -func (s *Server) setMockFuncs() { - s.ExtensionsFunc = func() nvml.ExtendedInterface { - return s - } - - s.LookupSymbolFunc = func(symbol string) error { - return nil - } - - s.InitFunc = func() nvml.Return { - return nvml.SUCCESS - } - - s.ShutdownFunc = func() nvml.Return { - return nvml.SUCCESS - } - - s.SystemGetDriverVersionFunc = func() (string, nvml.Return) { - return s.DriverVersion, nvml.SUCCESS - } - - s.SystemGetNVMLVersionFunc = func() (string, nvml.Return) { - return s.NvmlVersion, nvml.SUCCESS - } - - s.SystemGetCudaDriverVersionFunc = func() (int, nvml.Return) { - return s.CudaDriverVersion, nvml.SUCCESS - } - - s.DeviceGetCountFunc = func() (int, nvml.Return) { - return len(s.Devices), nvml.SUCCESS - } - - s.DeviceGetHandleByIndexFunc = func(index int) (nvml.Device, nvml.Return) { - if index < 0 || index >= len(s.Devices) { - return nil, nvml.ERROR_INVALID_ARGUMENT - } - return s.Devices[index], nvml.SUCCESS - } - - s.DeviceGetHandleByUUIDFunc = func(uuid string) (nvml.Device, nvml.Return) { - for _, d := range s.Devices { - if uuid == d.(*Device).UUID { - return d, nvml.SUCCESS - } 
- } - return nil, nvml.ERROR_INVALID_ARGUMENT - } - - s.DeviceGetHandleByPciBusIdFunc = func(busID string) (nvml.Device, nvml.Return) { - for _, d := range s.Devices { - if busID == d.(*Device).PciBusID { - return d, nvml.SUCCESS - } - } - return nil, nvml.ERROR_INVALID_ARGUMENT - } + return shared.NewDeviceFromConfig(gpus.A100_SXM4_40GB, index) } -func (d *Device) setMockFuncs() { - d.GetMinorNumberFunc = func() (int, nvml.Return) { - return d.Minor, nvml.SUCCESS - } - - d.GetIndexFunc = func() (int, nvml.Return) { - return d.Index, nvml.SUCCESS - } - - d.GetCudaComputeCapabilityFunc = func() (int, int, nvml.Return) { - return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS - } - - d.GetUUIDFunc = func() (string, nvml.Return) { - return d.UUID, nvml.SUCCESS - } - - d.GetNameFunc = func() (string, nvml.Return) { - return d.Name, nvml.SUCCESS - } - - d.GetBrandFunc = func() (nvml.BrandType, nvml.Return) { - return d.Brand, nvml.SUCCESS - } - - d.GetArchitectureFunc = func() (nvml.DeviceArchitecture, nvml.Return) { - return d.Architecture, nvml.SUCCESS - } - - d.GetMemoryInfoFunc = func() (nvml.Memory, nvml.Return) { - return d.MemoryInfo, nvml.SUCCESS - } - - d.GetPciInfoFunc = func() (nvml.PciInfo, nvml.Return) { - p := nvml.PciInfo{ - PciDeviceId: 0x20B010DE, - } - return p, nvml.SUCCESS - } - - d.SetMigModeFunc = func(mode int) (nvml.Return, nvml.Return) { - d.MigMode = mode - return nvml.SUCCESS, nvml.SUCCESS - } - - d.GetMigModeFunc = func() (int, int, nvml.Return) { - return d.MigMode, d.MigMode, nvml.SUCCESS - } - - d.GetGpuInstanceProfileInfoFunc = func(giProfileId int) (nvml.GpuInstanceProfileInfo, nvml.Return) { - if giProfileId < 0 || giProfileId >= nvml.GPU_INSTANCE_PROFILE_COUNT { - return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT - } - - if _, exists := MIGProfiles.GpuInstanceProfiles[giProfileId]; !exists { - return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } - - return MIGProfiles.GpuInstanceProfiles[giProfileId], nvml.SUCCESS - } - - d.GetGpuInstancePossiblePlacementsFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) { - return MIGPlacements.GpuInstancePossiblePlacements[int(info.Id)], nvml.SUCCESS - } - - d.CreateGpuInstanceFunc = func(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) { - d.Lock() - defer d.Unlock() - giInfo := nvml.GpuInstanceInfo{ - Device: d, - Id: d.GpuInstanceCounter, - ProfileId: info.Id, - } - d.GpuInstanceCounter++ - gi := NewGpuInstance(giInfo) - d.GpuInstances[gi] = struct{}{} - return gi, nvml.SUCCESS - } - - d.CreateGpuInstanceWithPlacementFunc = func(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) { - d.Lock() - defer d.Unlock() - giInfo := nvml.GpuInstanceInfo{ - Device: d, - Id: d.GpuInstanceCounter, - ProfileId: info.Id, - Placement: *placement, - } - d.GpuInstanceCounter++ - gi := NewGpuInstance(giInfo) - d.GpuInstances[gi] = struct{}{} - return gi, nvml.SUCCESS - } - - d.GetGpuInstancesFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) { - d.RLock() - defer d.RUnlock() - var gis []nvml.GpuInstance - for gi := range d.GpuInstances { - if gi.Info.ProfileId == info.Id { - gis = append(gis, gi) - } - } - return gis, nvml.SUCCESS - } +// NewServerWithGPU creates a new server with a specific A100 GPU variant +func NewServerWithGPU(gpuConfig shared.Config) *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpuConfig, + 
GPUCount: 8, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, + }) } -func (gi *GpuInstance) setMockFuncs() { - gi.GetInfoFunc = func() (nvml.GpuInstanceInfo, nvml.Return) { - return gi.Info, nvml.SUCCESS - } - - gi.GetComputeInstanceProfileInfoFunc = func(ciProfileId int, ciEngProfileId int) (nvml.ComputeInstanceProfileInfo, nvml.Return) { - if ciProfileId < 0 || ciProfileId >= nvml.COMPUTE_INSTANCE_PROFILE_COUNT { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT - } - - if ciEngProfileId != nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } - - giProfileId := int(gi.Info.ProfileId) - - if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId]; !exists { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } - - if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]; !exists { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } - - return MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], nvml.SUCCESS - } - - gi.GetComputeInstancePossiblePlacementsFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) { - return MIGPlacements.ComputeInstancePossiblePlacements[int(gi.Info.Id)][int(info.Id)], nvml.SUCCESS - } - - gi.CreateComputeInstanceFunc = func(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) { - gi.Lock() - defer gi.Unlock() - ciInfo := nvml.ComputeInstanceInfo{ - Device: gi.Info.Device, - GpuInstance: gi, - Id: gi.ComputeInstanceCounter, - ProfileId: info.Id, - } - gi.ComputeInstanceCounter++ - ci := NewComputeInstance(ciInfo) - gi.ComputeInstances[ci] = struct{}{} - return ci, nvml.SUCCESS - } - - gi.GetComputeInstancesFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) { - gi.RLock() - defer gi.RUnlock() - var cis []nvml.ComputeInstance - for ci := range gi.ComputeInstances { - if ci.Info.ProfileId == info.Id { - cis = append(cis, ci) - } - } - return cis, nvml.SUCCESS - } - - gi.DestroyFunc = func() nvml.Return { - d := gi.Info.Device.(*Device) - d.Lock() - defer d.Unlock() - delete(d.GpuInstances, gi) - return nvml.SUCCESS - } +// NewDeviceWithGPU creates a new device with a specific A100 GPU variant +func NewDeviceWithGPU(gpuConfig shared.Config, index int) *Device { + return shared.NewDeviceFromConfig(gpuConfig, index) } -func (ci *ComputeInstance) setMockFuncs() { - ci.GetInfoFunc = func() (nvml.ComputeInstanceInfo, nvml.Return) { - return ci.Info, nvml.SUCCESS +// Legacy globals for backward compatibility - expose the internal data +var ( + MIGProfiles = struct { + GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo + ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo + }{ + GpuInstanceProfiles: gpus.A100_SXM4_40GB.MIGProfiles.GpuInstanceProfiles, + ComputeInstanceProfiles: gpus.A100_SXM4_40GB.MIGProfiles.ComputeInstanceProfiles, } - ci.DestroyFunc = func() nvml.Return { - gi := ci.Info.GpuInstance.(*GpuInstance) - gi.Lock() - defer gi.Unlock() - delete(gi.ComputeInstances, ci) - return nvml.SUCCESS + MIGPlacements = struct { + GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement + }{ + GpuInstancePossiblePlacements: gpus.A100_SXM4_40GB.MIGProfiles.GpuInstancePlacements, + ComputeInstancePossiblePlacements: 
gpus.A100_SXM4_40GB.MIGProfiles.ComputeInstancePlacements, } -} +) diff --git a/pkg/nvml/mock/dgxa100/dgxa100_test.go b/pkg/nvml/mock/dgxa100/dgxa100_test.go new file mode 100644 index 0000000..3f852df --- /dev/null +++ b/pkg/nvml/mock/dgxa100/dgxa100_test.go @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dgxa100 + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// TestServerCreation verifies server creation and basic properties +func TestServerCreation(t *testing.T) { + server := New() + require.NotNil(t, server) + + // Test interface compliance + require.Implements(t, (*nvml.Interface)(nil), server) + require.Implements(t, (*nvml.ExtendedInterface)(nil), server) + + // Test device count + count, ret := server.DeviceGetCount() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 8, count) + + // Test system information + driver, ret := server.SystemGetDriverVersion() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "550.54.15", driver) + + nvmlVer, ret := server.SystemGetNVMLVersion() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "12.550.54.15", nvmlVer) + + cudaVer, ret := server.SystemGetCudaDriverVersion() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 12040, cudaVer) +} + +// TestDeviceHandling verifies device access and indexing +func TestDeviceHandling(t *testing.T) { + server := New() + + // Test valid device indices + for i := 0; i < 8; i++ { + device, ret := server.DeviceGetHandleByIndex(i) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, device) + + // Test device index + index, ret := device.GetIndex() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, i, index) + + // Test minor number + minor, ret := device.GetMinorNumber() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, i, minor) + } + + // Test invalid device indices + _, ret := server.DeviceGetHandleByIndex(-1) + require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) + + _, ret = server.DeviceGetHandleByIndex(8) + require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) +} + +// TestDeviceProperties verifies all device properties +func TestDeviceProperties(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, device) + + // Test device name + name, ret := device.GetName() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "Mock NVIDIA A100-SXM4-40GB", name) + + // Test architecture + arch, ret := device.GetArchitecture() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.DeviceArchitecture(nvml.DEVICE_ARCH_AMPERE), arch) + + // Test brand + brand, ret := device.GetBrand() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.BRAND_NVIDIA, brand) + + // Test CUDA compute capability + major, minor, ret := device.GetCudaComputeCapability() + require.Equal(t, nvml.SUCCESS, ret) + 
require.Equal(t, 8, major) + require.Equal(t, 0, minor) + + // Test memory info (40GB) + memory, ret := device.GetMemoryInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint64(42949672960), memory.Total) + + // Test PCI device ID + pciInfo, ret := device.GetPciInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0x20B010DE), pciInfo.PciDeviceId) + + // Test UUID is set + uuid, ret := device.GetUUID() + require.Equal(t, nvml.SUCCESS, ret) + require.NotEmpty(t, uuid) + require.Contains(t, uuid, "GPU-") +} + +// TestDeviceAccessByUUID verifies UUID-based device access +func TestDeviceAccessByUUID(t *testing.T) { + server := New() + + // Get device by index and its UUID + originalDevice, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + uuid, ret := originalDevice.GetUUID() + require.Equal(t, nvml.SUCCESS, ret) + require.NotEmpty(t, uuid) + + // Get device by UUID + deviceByUUID, ret := server.DeviceGetHandleByUUID(uuid) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, originalDevice, deviceByUUID) + + // Test invalid UUID + _, ret = server.DeviceGetHandleByUUID("invalid-uuid") + require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) +} + +// TestDeviceAccessByPciBusId verifies PCI bus ID-based device access +func TestDeviceAccessByPciBusId(t *testing.T) { + server := New() + + // Test each device's PCI bus ID + for i := 0; i < 8; i++ { + originalDevice, ret := server.DeviceGetHandleByIndex(i) + require.Equal(t, nvml.SUCCESS, ret) + + expectedPciBusID := fmt.Sprintf("0000:%02x:00.0", i) + + // Get device by PCI bus ID + deviceByPci, ret := server.DeviceGetHandleByPciBusId(expectedPciBusID) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, originalDevice, deviceByPci) + } + + // Test invalid PCI bus ID + _, ret := server.DeviceGetHandleByPciBusId("invalid-pci-id") + require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) +} + +// TestMIGModeOperations verifies MIG mode handling +func TestMIGModeOperations(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Initially MIG should be disabled + current, pending, ret := device.GetMigMode() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 0, current) + require.Equal(t, 0, pending) + + // Enable MIG mode + currentRet, pendingRet := device.SetMigMode(1) + require.Equal(t, nvml.SUCCESS, currentRet) + require.Equal(t, nvml.SUCCESS, pendingRet) + + // Verify MIG is enabled + current, pending, ret = device.GetMigMode() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 1, current) + require.Equal(t, 1, pending) + + // Disable MIG mode + currentRet, pendingRet = device.SetMigMode(0) + require.Equal(t, nvml.SUCCESS, currentRet) + require.Equal(t, nvml.SUCCESS, pendingRet) + + // Verify MIG is disabled + current, pending, ret = device.GetMigMode() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 0, current) + require.Equal(t, 0, pending) +} + +// TestMIGProfilesExist verifies MIG profile configuration exists +func TestMIGProfilesExist(t *testing.T) { + // Test that MIGProfiles variable is accessible + require.NotNil(t, MIGProfiles) + require.NotNil(t, MIGProfiles.GpuInstanceProfiles) + require.NotNil(t, MIGProfiles.ComputeInstanceProfiles) + + // Test that MIGPlacements variable is accessible + require.NotNil(t, MIGPlacements) + require.NotNil(t, MIGPlacements.GpuInstancePossiblePlacements) + require.NotNil(t, MIGPlacements.ComputeInstancePossiblePlacements) + + // Test expected profile types 
exist + expectedProfiles := []int{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + nvml.GPU_INSTANCE_PROFILE_2_SLICE, + nvml.GPU_INSTANCE_PROFILE_3_SLICE, + nvml.GPU_INSTANCE_PROFILE_4_SLICE, + nvml.GPU_INSTANCE_PROFILE_7_SLICE, + } + + for _, profileId := range expectedProfiles { + profile, exists := MIGProfiles.GpuInstanceProfiles[profileId] + require.True(t, exists, "Profile %d should exist", profileId) + require.Equal(t, uint32(profileId), profile.Id) + require.Greater(t, profile.MemorySizeMB, uint64(0)) + } +} + +// TestGpuInstanceProfileInfo verifies GPU instance profile access +func TestGpuInstanceProfileInfo(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Test valid profile access + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(nvml.GPU_INSTANCE_PROFILE_1_SLICE), profileInfo.Id) + require.Equal(t, uint32(1), profileInfo.SliceCount) + require.Equal(t, uint64(4864), profileInfo.MemorySizeMB) // 1g.5gb + + // Test 7-slice profile + profileInfo7, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_7_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(nvml.GPU_INSTANCE_PROFILE_7_SLICE), profileInfo7.Id) + require.Equal(t, uint32(7), profileInfo7.SliceCount) + require.Equal(t, uint64(40192), profileInfo7.MemorySizeMB) // 7g.40gb + + // Test invalid profile + _, ret = device.GetGpuInstanceProfileInfo(-1) + require.Equal(t, nvml.ERROR_INVALID_ARGUMENT, ret) + + // Test unsupported profile (use a valid range but unsupported profile) + _, ret = device.GetGpuInstanceProfileInfo(5) // Valid range but not in MIGProfiles + require.Equal(t, nvml.ERROR_NOT_SUPPORTED, ret) +} + +// TestGpuInstancePlacements verifies GPU instance placement access +func TestGpuInstancePlacements(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Test 1-slice placements + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + placements, ret := device.GetGpuInstancePossiblePlacements(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.Len(t, placements, 7) // Should have 7 possible placements for 1-slice + + // Test 7-slice placements + profileInfo7, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_7_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + placements7, ret := device.GetGpuInstancePossiblePlacements(&profileInfo7) + require.Equal(t, nvml.SUCCESS, ret) + require.Len(t, placements7, 1) // Should have 1 placement for 7-slice (full GPU) + require.Equal(t, uint32(0), placements7[0].Start) + require.Equal(t, uint32(8), placements7[0].Size) +} + +// TestGpuInstanceLifecycle verifies complete GPU instance lifecycle +func TestGpuInstanceLifecycle(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Get 1-slice profile + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + // Create GPU instance + gi, ret := device.CreateGpuInstance(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gi) + + // Test GPU instance info + giInfo, ret := gi.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) 
+ require.Equal(t, device, giInfo.Device) + require.Equal(t, profileInfo.Id, giInfo.ProfileId) + require.Equal(t, uint32(0), giInfo.Id) // First instance should have ID 0 + + // Get GPU instances for this profile + instances, ret := device.GetGpuInstances(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.Len(t, instances, 1) + require.Equal(t, gi, instances[0]) + + // Destroy GPU instance + ret = gi.Destroy() + require.Equal(t, nvml.SUCCESS, ret) + + // Verify instance is removed + instances, ret = device.GetGpuInstances(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.Len(t, instances, 0) +} + +// TestGpuInstanceWithPlacement verifies GPU instance creation with placement +func TestGpuInstanceWithPlacement(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Get profile and placement + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + placements, ret := device.GetGpuInstancePossiblePlacements(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotEmpty(t, placements) + + // Create GPU instance with specific placement + gi, ret := device.CreateGpuInstanceWithPlacement(&profileInfo, &placements[0]) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gi) + + // Verify placement in instance info + giInfo, ret := gi.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, placements[0], giInfo.Placement) + + // Clean up + ret = gi.Destroy() + require.Equal(t, nvml.SUCCESS, ret) +} + +// TestComputeInstanceLifecycle verifies complete compute instance lifecycle +func TestComputeInstanceLifecycle(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Create GPU instance first + giProfileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + gi, ret := device.CreateGpuInstance(&giProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gi) + + // Get compute instance profile + ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo( + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED, + ) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE), ciProfileInfo.Id) + + // Test invalid engine profile + _, ret = gi.GetComputeInstanceProfileInfo( + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + 999, // Invalid engine profile + ) + require.Equal(t, nvml.ERROR_NOT_SUPPORTED, ret) + + // Get compute instance placements + _, ret = gi.GetComputeInstancePossiblePlacements(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + // Note: Original implementation has empty placements (TODO comment) + + // Create compute instance + ci, ret := gi.CreateComputeInstance(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, ci) + + // Test compute instance info + ciInfo, ret := ci.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, device, ciInfo.Device) + require.Equal(t, gi, ciInfo.GpuInstance) + require.Equal(t, ciProfileInfo.Id, ciInfo.ProfileId) + require.Equal(t, uint32(0), ciInfo.Id) // First instance should have ID 0 + + // Get compute instances for this profile + instances, ret := gi.GetComputeInstances(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.Len(t, instances, 1) + require.Equal(t, ci, instances[0]) + + // Destroy compute 
instance + ret = ci.Destroy() + require.Equal(t, nvml.SUCCESS, ret) + + // Verify compute instance is removed + instances, ret = gi.GetComputeInstances(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.Len(t, instances, 0) + + // Destroy GPU instance + ret = gi.Destroy() + require.Equal(t, nvml.SUCCESS, ret) +} + +// TestInitShutdownLifecycle verifies init/shutdown behavior +func TestInitShutdownLifecycle(t *testing.T) { + server := New() + + // Test init + ret := server.Init() + require.Equal(t, nvml.SUCCESS, ret) + + // Test lookup symbol + err := server.LookupSymbol("nvmlInit") + require.NoError(t, err) + + // Test extensions + ext := server.Extensions() + require.NotNil(t, ext) + require.Equal(t, server, ext) + + // Test shutdown + ret = server.Shutdown() + require.Equal(t, nvml.SUCCESS, ret) +} + +// TestMultipleDevices verifies all devices are unique and correctly indexed +func TestMultipleDevices(t *testing.T) { + server := New() + + devices := make([]nvml.Device, 8) + uuids := make(map[string]bool) + + // Get all devices and verify uniqueness + for i := 0; i < 8; i++ { + device, ret := server.DeviceGetHandleByIndex(i) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, device) + + devices[i] = device + + // Verify UUID is unique + uuid, ret := device.GetUUID() + require.Equal(t, nvml.SUCCESS, ret) + require.NotEmpty(t, uuid) + require.False(t, uuids[uuid], "UUID %s should be unique", uuid) + uuids[uuid] = true + + // Verify device properties are consistent + index, ret := device.GetIndex() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, i, index) + + minor, ret := device.GetMinorNumber() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, i, minor) + + name, ret := device.GetName() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "Mock NVIDIA A100-SXM4-40GB", name) + } + + // Verify all devices are distinct objects + for i := 0; i < 8; i++ { + for j := i + 1; j < 8; j++ { + require.NotEqual(t, devices[i], devices[j], "Devices %d and %d should be different objects", i, j) + } + } +} + +// TestA100SpecificCharacteristics tests A100-specific values +func TestA100SpecificCharacteristics(t *testing.T) { + server := New() + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + + // Test A100 doesn't support P2P in MIG (IsP2pSupported should be 0) + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0), profileInfo.IsP2pSupported) + + // Test A100 memory values are correct + profile1 := MIGProfiles.GpuInstanceProfiles[nvml.GPU_INSTANCE_PROFILE_1_SLICE] + require.Equal(t, uint64(4864), profile1.MemorySizeMB) // 1g.5gb + + profile7 := MIGProfiles.GpuInstanceProfiles[nvml.GPU_INSTANCE_PROFILE_7_SLICE] + require.Equal(t, uint64(40192), profile7.MemorySizeMB) // 7g.40gb + + // Test A100 architecture + arch, ret := device.GetArchitecture() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.DeviceArchitecture(nvml.DEVICE_ARCH_AMPERE), arch) + + // Test A100 CUDA compute capability + major, minor, ret := device.GetCudaComputeCapability() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 8, major) // Ampere + require.Equal(t, 0, minor) + + // Test A100 PCI device ID + pciInfo, ret := device.GetPciInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0x20B010DE), pciInfo.PciDeviceId) // A100-SXM4-40GB +} diff --git a/pkg/nvml/mock/dgxa100/mig-profile.go 
b/pkg/nvml/mock/dgxa100/mig-profile.go deleted file mode 100644 index c4df4c8..0000000 --- a/pkg/nvml/mock/dgxa100/mig-profile.go +++ /dev/null @@ -1,471 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package dgxa100 - -import ( - "github.com/NVIDIA/go-nvml/pkg/nvml" -) - -// MIGProfiles holds the profile information for GIs and CIs in this mock server. -// We should consider auto-generating this object in the future. -var MIGProfiles = struct { - GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo - ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo -}{ - GpuInstanceProfiles: map[int]nvml.GpuInstanceProfileInfo{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, - IsP2pSupported: 0, - SliceCount: 1, - InstanceCount: 7, - MultiprocessorCount: 14, - CopyEngineCount: 1, - DecoderCount: 0, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 4864, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, - IsP2pSupported: 0, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - CopyEngineCount: 1, - DecoderCount: 1, - EncoderCount: 0, - JpegCount: 1, - OfaCount: 1, - MemorySizeMB: 4864, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, - IsP2pSupported: 0, - SliceCount: 1, - InstanceCount: 4, - MultiprocessorCount: 14, - CopyEngineCount: 1, - DecoderCount: 1, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 9856, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, - IsP2pSupported: 0, - SliceCount: 2, - InstanceCount: 3, - MultiprocessorCount: 28, - CopyEngineCount: 2, - DecoderCount: 1, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 9856, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, - IsP2pSupported: 0, - SliceCount: 3, - InstanceCount: 2, - MultiprocessorCount: 42, - CopyEngineCount: 3, - DecoderCount: 2, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 19968, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, - IsP2pSupported: 0, - SliceCount: 4, - InstanceCount: 1, - MultiprocessorCount: 56, - CopyEngineCount: 4, - DecoderCount: 2, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 19968, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, - IsP2pSupported: 0, - SliceCount: 7, - InstanceCount: 1, - MultiprocessorCount: 98, - CopyEngineCount: 7, - DecoderCount: 5, - EncoderCount: 0, - JpegCount: 1, - OfaCount: 1, - MemorySizeMB: 40192, - }, - }, - ComputeInstanceProfiles: map[int]map[int]nvml.ComputeInstanceProfileInfo{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - SharedCopyEngineCount: 1, 
- SharedDecoderCount: 0, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - SharedCopyEngineCount: 1, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - SharedCopyEngineCount: 1, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 2, - MultiprocessorCount: 14, - SharedCopyEngineCount: 2, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 1, - MultiprocessorCount: 28, - SharedCopyEngineCount: 2, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 3, - MultiprocessorCount: 14, - SharedCopyEngineCount: 3, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 1, - MultiprocessorCount: 28, - SharedCopyEngineCount: 3, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, - SliceCount: 3, - InstanceCount: 1, - MultiprocessorCount: 42, - SharedCopyEngineCount: 3, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 4, - MultiprocessorCount: 14, - SharedCopyEngineCount: 4, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 2, - MultiprocessorCount: 28, - SharedCopyEngineCount: 4, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, - SliceCount: 4, - InstanceCount: 1, - MultiprocessorCount: 56, - SharedCopyEngineCount: 4, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 7, - MultiprocessorCount: 14, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 3, 
- MultiprocessorCount: 28, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, - SliceCount: 3, - InstanceCount: 2, - MultiprocessorCount: 42, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, - SliceCount: 4, - InstanceCount: 1, - MultiprocessorCount: 56, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, - SliceCount: 7, - InstanceCount: 1, - MultiprocessorCount: 98, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - }, - }, -} - -// MIGPlacements holds the placement information for GIs and CIs in this mock server. -// We should consider auto-generating this object in the future. -var MIGPlacements = struct { - GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement - ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement -}{ - GpuInstancePossiblePlacements: map[int][]nvml.GpuInstancePlacement{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - { - Start: 0, - Size: 1, - }, - { - Start: 1, - Size: 1, - }, - { - Start: 2, - Size: 1, - }, - { - Start: 3, - Size: 1, - }, - { - Start: 4, - Size: 1, - }, - { - Start: 5, - Size: 1, - }, - { - Start: 6, - Size: 1, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - { - Start: 0, - Size: 1, - }, - { - Start: 1, - Size: 1, - }, - { - Start: 2, - Size: 1, - }, - { - Start: 3, - Size: 1, - }, - { - Start: 4, - Size: 1, - }, - { - Start: 5, - Size: 1, - }, - { - Start: 6, - Size: 1, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - { - Start: 0, - Size: 2, - }, - { - Start: 2, - Size: 2, - }, - { - Start: 4, - Size: 2, - }, - { - Start: 6, - Size: 2, - }, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - { - Start: 0, - Size: 2, - }, - { - Start: 2, - Size: 2, - }, - { - Start: 4, - Size: 2, - }, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - { - Start: 0, - Size: 4, - }, - { - Start: 4, - Size: 4, - }, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - { - Start: 0, - Size: 4, - }, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - { - Start: 0, - Size: 8, - }, - }, - }, - // TODO: Fill out ComputeInstancePossiblePlacements - ComputeInstancePossiblePlacements: map[int]map[int][]nvml.ComputeInstancePlacement{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - 
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {}, - }, - }, -} diff --git a/pkg/nvml/mock/dgxb200/dgxb200.go b/pkg/nvml/mock/dgxb200/dgxb200.go new file mode 100644 index 0000000..0e0a586 --- /dev/null +++ b/pkg/nvml/mock/dgxb200/dgxb200.go @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dgxb200 + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus" +) + +// Backwards compatible type aliases +type Server = shared.Server +type Device = shared.Device +type GpuInstance = shared.GpuInstance +type ComputeInstance = shared.ComputeInstance +type CudaComputeCapability = shared.CudaComputeCapability + +func New() *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpus.B200_SXM5_180GB, + GPUCount: 8, + DriverVersion: "560.28.03", + NvmlVersion: "12.560.28.03", + CudaDriverVersion: 12060, + }) +} + +func NewDevice(index int) *Device { + return shared.NewDeviceFromConfig(gpus.B200_SXM5_180GB, index) +} + +// NewServerWithGPU creates a new server with a specific B200 GPU variant +func NewServerWithGPU(gpuConfig shared.Config) *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpuConfig, + GPUCount: 8, + DriverVersion: "560.28.03", + NvmlVersion: "12.560.28.03", + CudaDriverVersion: 12060, + }) +} + +// NewDeviceWithGPU creates a new device with a specific B200 GPU variant +func NewDeviceWithGPU(gpuConfig shared.Config, index int) *Device { + return shared.NewDeviceFromConfig(gpuConfig, index) +} + +// Legacy globals for backward compatibility - expose the internal data +var ( + MIGProfiles = struct { + GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo + ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo + }{ + GpuInstanceProfiles: gpus.B200_SXM5_180GB.MIGProfiles.GpuInstanceProfiles, + ComputeInstanceProfiles: gpus.B200_SXM5_180GB.MIGProfiles.ComputeInstanceProfiles, + } + + MIGPlacements = struct { + GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement + }{ + GpuInstancePossiblePlacements: gpus.B200_SXM5_180GB.MIGProfiles.GpuInstancePlacements, + ComputeInstancePossiblePlacements: gpus.B200_SXM5_180GB.MIGProfiles.ComputeInstancePlacements, + } +) diff --git a/pkg/nvml/mock/dgxb200/dgxb200_test.go b/pkg/nvml/mock/dgxb200/dgxb200_test.go new file mode 100644 index 0000000..8c08bc9 --- /dev/null +++ b/pkg/nvml/mock/dgxb200/dgxb200_test.go @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dgxb200 + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +func TestB200Server(t *testing.T) { + server := New() + + count, ret := server.DeviceGetCount() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 8, count) + + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, device) + + name, ret := device.GetName() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "NVIDIA B200 180GB HBM3e", name) + + arch, ret := device.GetArchitecture() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.DeviceArchitecture(nvml.DEVICE_ARCH_BLACKWELL), arch) + + major, minor, ret := device.GetCudaComputeCapability() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 10, major) + require.Equal(t, 0, minor) + + memory, ret := device.GetMemoryInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint64(184320*1024*1024), memory.Total) // 180GB + + // Test B200 supports P2P in MIG (IsP2pSupported should be 1) + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(1), profileInfo.IsP2pSupported) + + // Test MIG functionality + gpuInstance, ret := device.CreateGpuInstance(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gpuInstance) + + giInfo, ret := gpuInstance.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0), giInfo.Id) + require.Equal(t, uint32(nvml.GPU_INSTANCE_PROFILE_1_SLICE), giInfo.ProfileId) + + // Test compute instance creation + ciProfileInfo, ret := gpuInstance.GetComputeInstanceProfileInfo(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED) + require.Equal(t, nvml.SUCCESS, ret) + + computeInstance, ret := gpuInstance.CreateComputeInstance(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, computeInstance) + + ciInfo, ret := computeInstance.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0), ciInfo.Id) + require.Equal(t, uint32(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE), ciInfo.ProfileId) +} + +func TestB200Device(t *testing.T) { + device := NewDevice(5) + + index, ret := device.GetIndex() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 5, index) + + minor, ret := device.GetMinorNumber() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 5, minor) + + uuid, ret := device.GetUUID() + require.Equal(t, nvml.SUCCESS, ret) + require.Contains(t, uuid, "GPU-") + + brand, ret := device.GetBrand() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.BRAND_NVIDIA, brand) + + pciInfo, ret := device.GetPciInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0x2B0010DE), pciInfo.PciDeviceId) +} + +func TestB200MIGProfiles(t *testing.T) { + device := NewDevice(0) + + // Test all GPU instance profiles + testCases := []struct { + profile int + sliceCount uint32 + memoryMB uint64 + multiproc uint32 + encoders uint32 + jpegs uint32 + ofas uint32 + }{ + 
{nvml.GPU_INSTANCE_PROFILE_1_SLICE, 1, 23552, 18, 0, 0, 0}, + {nvml.GPU_INSTANCE_PROFILE_2_SLICE, 2, 46080, 36, 1, 1, 1}, + {nvml.GPU_INSTANCE_PROFILE_3_SLICE, 3, 92160, 54, 2, 2, 2}, + {nvml.GPU_INSTANCE_PROFILE_4_SLICE, 4, 92160, 72, 2, 2, 2}, + {nvml.GPU_INSTANCE_PROFILE_7_SLICE, 7, 184320, 126, 4, 4, 4}, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("profile_%d_slice", tc.sliceCount), func(t *testing.T) { + profileInfo, ret := device.GetGpuInstanceProfileInfo(tc.profile) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(tc.profile), profileInfo.Id) + require.Equal(t, tc.sliceCount, profileInfo.SliceCount) + require.Equal(t, tc.memoryMB, profileInfo.MemorySizeMB) + require.Equal(t, tc.multiproc, profileInfo.MultiprocessorCount) + require.Equal(t, tc.encoders, profileInfo.EncoderCount) + require.Equal(t, tc.jpegs, profileInfo.JpegCount) + require.Equal(t, tc.ofas, profileInfo.OfaCount) + require.Equal(t, uint32(1), profileInfo.IsP2pSupported) // B200 supports P2P + }) + } +} + +func TestB200AdvancedFeatures(t *testing.T) { + device := NewDevice(0) + + // Test that B200 has enhanced encoder/decoder capabilities compared to H100/H200 + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_7_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + // B200 should have more advanced multimedia engines + require.Equal(t, uint32(4), profileInfo.EncoderCount) // More encoders than H100/H200 + require.Equal(t, uint32(4), profileInfo.JpegCount) // JPEG engines + require.Equal(t, uint32(4), profileInfo.OfaCount) // OFA engines + + // Test GPU instance creation with advanced profile + gpuInstance, ret := device.CreateGpuInstance(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gpuInstance) + + // Test compute instance with 7-slice profile + ciProfileInfo, ret := gpuInstance.GetComputeInstanceProfileInfo(nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(126), ciProfileInfo.MultiprocessorCount) // High multiprocessor count for B200 +} + +func TestB200MIGInstanceManagement(t *testing.T) { + device := NewDevice(0) + + // Test creating and destroying instances + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_2_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + // Create GPU instance + gi, ret := device.CreateGpuInstance(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gi) + + // Create compute instance + ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo(nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED) + require.Equal(t, nvml.SUCCESS, ret) + + ci, ret := gi.CreateComputeInstance(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, ci) + + // Verify compute instance info + ciInfo, ret := ci.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE), ciInfo.ProfileId) + + // Test destruction + ret = ci.Destroy() + require.Equal(t, nvml.SUCCESS, ret) + + ret = gi.Destroy() + require.Equal(t, nvml.SUCCESS, ret) +} diff --git a/pkg/nvml/mock/dgxh100/dgxh100.go b/pkg/nvml/mock/dgxh100/dgxh100.go new file mode 100644 index 0000000..aa46c46 --- /dev/null +++ b/pkg/nvml/mock/dgxh100/dgxh100.go @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dgxh100 + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus" +) + +// Backwards compatible type aliases +type Server = shared.Server +type Device = shared.Device +type GpuInstance = shared.GpuInstance +type ComputeInstance = shared.ComputeInstance +type CudaComputeCapability = shared.CudaComputeCapability + +func New() *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpus.H100_SXM5_80GB, + GPUCount: 8, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, + }) +} + +func NewDevice(index int) *Device { + return shared.NewDeviceFromConfig(gpus.H100_SXM5_80GB, index) +} + +// NewServerWithGPU creates a new server with a specific H100 GPU variant +func NewServerWithGPU(gpuConfig shared.Config) *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpuConfig, + GPUCount: 8, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, + }) +} + +// NewDeviceWithGPU creates a new device with a specific H100 GPU variant +func NewDeviceWithGPU(gpuConfig shared.Config, index int) *Device { + return shared.NewDeviceFromConfig(gpuConfig, index) +} + +// Legacy globals for backward compatibility - expose the internal data +var ( + MIGProfiles = struct { + GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo + ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo + }{ + GpuInstanceProfiles: gpus.H100_SXM5_80GB.MIGProfiles.GpuInstanceProfiles, + ComputeInstanceProfiles: gpus.H100_SXM5_80GB.MIGProfiles.ComputeInstanceProfiles, + } + + MIGPlacements = struct { + GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement + }{ + GpuInstancePossiblePlacements: gpus.H100_SXM5_80GB.MIGProfiles.GpuInstancePlacements, + ComputeInstancePossiblePlacements: gpus.H100_SXM5_80GB.MIGProfiles.ComputeInstancePlacements, + } +) diff --git a/pkg/nvml/mock/dgxh100/dgxh100_test.go b/pkg/nvml/mock/dgxh100/dgxh100_test.go new file mode 100644 index 0000000..c0bec49 --- /dev/null +++ b/pkg/nvml/mock/dgxh100/dgxh100_test.go @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package dgxh100 + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +func TestH100Server(t *testing.T) { + server := New() + + count, ret := server.DeviceGetCount() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 8, count) + + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, device) + + name, ret := device.GetName() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "NVIDIA H100 80GB HBM3", name) + + arch, ret := device.GetArchitecture() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.DeviceArchitecture(nvml.DEVICE_ARCH_HOPPER), arch) + + major, minor, ret := device.GetCudaComputeCapability() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 9, major) + require.Equal(t, 0, minor) + + memory, ret := device.GetMemoryInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint64(81920*1024*1024), memory.Total) // 80GB + + // Test H100 supports P2P in MIG (IsP2pSupported should be 1) + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(1), profileInfo.IsP2pSupported) + + // Test MIG functionality + gpuInstance, ret := device.CreateGpuInstance(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gpuInstance) + + giInfo, ret := gpuInstance.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0), giInfo.Id) + require.Equal(t, uint32(nvml.GPU_INSTANCE_PROFILE_1_SLICE), giInfo.ProfileId) +} diff --git a/pkg/nvml/mock/dgxh200/dgxh200.go b/pkg/nvml/mock/dgxh200/dgxh200.go new file mode 100644 index 0000000..fec64ee --- /dev/null +++ b/pkg/nvml/mock/dgxh200/dgxh200.go @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package dgxh200 + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus" +) + +// Backwards compatible type aliases +type Server = shared.Server +type Device = shared.Device +type GpuInstance = shared.GpuInstance +type ComputeInstance = shared.ComputeInstance +type CudaComputeCapability = shared.CudaComputeCapability + +func New() *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpus.H200_SXM5_141GB, + GPUCount: 8, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, + }) +} + +func NewDevice(index int) *Device { + return shared.NewDeviceFromConfig(gpus.H200_SXM5_141GB, index) +} + +// NewServerWithGPU creates a new server with a specific H200 GPU variant +func NewServerWithGPU(gpuConfig shared.Config) *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpuConfig, + GPUCount: 8, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, + }) +} + +// NewDeviceWithGPU creates a new device with a specific H200 GPU variant +func NewDeviceWithGPU(gpuConfig shared.Config, index int) *Device { + return shared.NewDeviceFromConfig(gpuConfig, index) +} + +// Legacy globals for backward compatibility - expose the internal data +var ( + MIGProfiles = struct { + GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo + ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo + }{ + GpuInstanceProfiles: gpus.H200_SXM5_141GB.MIGProfiles.GpuInstanceProfiles, + ComputeInstanceProfiles: gpus.H200_SXM5_141GB.MIGProfiles.ComputeInstanceProfiles, + } + + MIGPlacements = struct { + GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement + }{ + GpuInstancePossiblePlacements: gpus.H200_SXM5_141GB.MIGProfiles.GpuInstancePlacements, + ComputeInstancePossiblePlacements: gpus.H200_SXM5_141GB.MIGProfiles.ComputeInstancePlacements, + } +) diff --git a/pkg/nvml/mock/dgxh200/dgxh200_test.go b/pkg/nvml/mock/dgxh200/dgxh200_test.go new file mode 100644 index 0000000..e0b1ea4 --- /dev/null +++ b/pkg/nvml/mock/dgxh200/dgxh200_test.go @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package dgxh200 + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +func TestH200Server(t *testing.T) { + server := New() + + count, ret := server.DeviceGetCount() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 8, count) + + device, ret := server.DeviceGetHandleByIndex(0) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, device) + + name, ret := device.GetName() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, "NVIDIA H200 141GB HBM3e", name) + + arch, ret := device.GetArchitecture() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.DeviceArchitecture(nvml.DEVICE_ARCH_HOPPER), arch) + + major, minor, ret := device.GetCudaComputeCapability() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 9, major) + require.Equal(t, 0, minor) + + memory, ret := device.GetMemoryInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint64(144384*1024*1024), memory.Total) // 141GB + + // Test H200 supports P2P in MIG (IsP2pSupported should be 1) + profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(1), profileInfo.IsP2pSupported) + + // Test MIG functionality + gpuInstance, ret := device.CreateGpuInstance(&profileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gpuInstance) + + giInfo, ret := gpuInstance.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0), giInfo.Id) + require.Equal(t, uint32(nvml.GPU_INSTANCE_PROFILE_1_SLICE), giInfo.ProfileId) + + // Test compute instance creation + ciProfileInfo, ret := gpuInstance.GetComputeInstanceProfileInfo(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED) + require.Equal(t, nvml.SUCCESS, ret) + + computeInstance, ret := gpuInstance.CreateComputeInstance(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, computeInstance) + + ciInfo, ret := computeInstance.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0), ciInfo.Id) + require.Equal(t, uint32(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE), ciInfo.ProfileId) +} + +func TestH200Device(t *testing.T) { + device := NewDevice(3) + + index, ret := device.GetIndex() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 3, index) + + minor, ret := device.GetMinorNumber() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, 3, minor) + + uuid, ret := device.GetUUID() + require.Equal(t, nvml.SUCCESS, ret) + require.Contains(t, uuid, "GPU-") + + brand, ret := device.GetBrand() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, nvml.BRAND_NVIDIA, brand) + + pciInfo, ret := device.GetPciInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.Equal(t, uint32(0x233310DE), pciInfo.PciDeviceId) +} + +func TestH200MIGProfiles(t *testing.T) { + device := NewDevice(0) + + // Test all GPU instance profiles + testCases := []struct { + profile int + sliceCount uint32 + memoryMB uint64 + multiproc uint32 + }{ + {nvml.GPU_INSTANCE_PROFILE_1_SLICE, 1, 18432, 16}, + {nvml.GPU_INSTANCE_PROFILE_2_SLICE, 2, 35840, 32}, + {nvml.GPU_INSTANCE_PROFILE_3_SLICE, 3, 72704, 48}, + {nvml.GPU_INSTANCE_PROFILE_4_SLICE, 4, 72704, 64}, + {nvml.GPU_INSTANCE_PROFILE_7_SLICE, 7, 144384, 112}, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("profile_%d_slice", tc.sliceCount), func(t *testing.T) { + profileInfo, ret := device.GetGpuInstanceProfileInfo(tc.profile) + require.Equal(t, nvml.SUCCESS, ret) + 
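+			// The mock returns these values straight from the static profile
+			// table in shared/gpus/h200.go, so each field should match exactly.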
require.Equal(t, uint32(tc.profile), profileInfo.Id) + require.Equal(t, tc.sliceCount, profileInfo.SliceCount) + require.Equal(t, tc.memoryMB, profileInfo.MemorySizeMB) + require.Equal(t, tc.multiproc, profileInfo.MultiprocessorCount) + require.Equal(t, uint32(1), profileInfo.IsP2pSupported) // H200 supports P2P + }) + } +} + +func TestH200MIGInstanceCreation(t *testing.T) { + device := NewDevice(0) + + // Test creating multiple GPU instances of different profiles + profileInfo1, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + require.Equal(t, nvml.SUCCESS, ret) + + gi1, ret := device.CreateGpuInstance(&profileInfo1) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gi1) + + gi2, ret := device.CreateGpuInstance(&profileInfo1) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, gi2) + + // Verify they have different IDs + gi1Info, ret := gi1.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + gi2Info, ret := gi2.GetInfo() + require.Equal(t, nvml.SUCCESS, ret) + require.NotEqual(t, gi1Info.Id, gi2Info.Id) + + // Test that we can create compute instances on each GPU instance + ciProfileInfo, ret := gi1.GetComputeInstanceProfileInfo(nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED) + require.Equal(t, nvml.SUCCESS, ret) + + ci1, ret := gi1.CreateComputeInstance(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, ci1) + + ci2, ret := gi2.CreateComputeInstance(&ciProfileInfo) + require.Equal(t, nvml.SUCCESS, ret) + require.NotNil(t, ci2) +} diff --git a/pkg/nvml/mock/shared/gpus/a100.go b/pkg/nvml/mock/shared/gpus/a100.go new file mode 100644 index 0000000..89b122d --- /dev/null +++ b/pkg/nvml/mock/shared/gpus/a100.go @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package gpus + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" +) + +// A100 GPU Variants with different memory profiles and PCI device IDs +var ( + A100_PCIE_40GB = shared.Config{ + Name: "NVIDIA A100-PCIE-40GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 40960, + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20F110DE, + MIGProfiles: a100_40gb_MIGProfiles, + } + A100_PCIE_80GB = shared.Config{ + Name: "NVIDIA A100-PCIE-80GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 81920, + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20B510DE, + MIGProfiles: a100_80gb_MIGProfiles, + } + A100_SXM4_40GB = shared.Config{ + Name: "Mock NVIDIA A100-SXM4-40GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 40960, + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20B010DE, + MIGProfiles: a100_40gb_MIGProfiles, + } + A100_SXM4_80GB = shared.Config{ + Name: "NVIDIA A100-SXM4-80GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 81920, + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20B210DE, + MIGProfiles: a100_80gb_MIGProfiles, + } +) + +var ( + a100_40gb_MIGProfiles = shared.MIGProfileConfig{ + GpuInstanceProfiles: a100_40gb_GpuInstanceProfiles, + ComputeInstanceProfiles: a100_ComputeInstanceProfiles, + GpuInstancePlacements: a100_GpuInstancePlacements, + ComputeInstancePlacements: a100_ComputeInstancePlacements, + } + a100_80gb_MIGProfiles = shared.MIGProfileConfig{ + GpuInstanceProfiles: a100_80gb_GpuInstanceProfiles, + ComputeInstanceProfiles: a100_ComputeInstanceProfiles, + GpuInstancePlacements: a100_GpuInstancePlacements, + ComputeInstancePlacements: a100_ComputeInstancePlacements, + } +) + +var ( + a100_40gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 4864, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 4864, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 0, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 28, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 0, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 42, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 0, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + 
JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 0, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 96, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 40192, + }, + } + a100_80gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 0, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 28, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 0, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 42, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 40192, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 0, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 40192, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 0, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 98, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 80384, + }, + } +) + +var a100_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + 
MultiprocessorCount: 48,
+		},
+	},
+	nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			Id:                  nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+			SliceCount:          1,
+			InstanceCount:       4,
+			MultiprocessorCount: 16,
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+			Id:                  nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE,
+			SliceCount:          2,
+			InstanceCount:       2,
+			MultiprocessorCount: 32,
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {
+			Id:                  nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE,
+			SliceCount:          4,
+			InstanceCount:       1,
+			MultiprocessorCount: 64,
+		},
+	},
+	nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			Id:                  nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+			SliceCount:          1,
+			InstanceCount:       7,
+			MultiprocessorCount: 16,
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+			Id:                  nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE,
+			SliceCount:          2,
+			InstanceCount:       3,
+			MultiprocessorCount: 32,
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {
+			Id:                  nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE,
+			SliceCount:          3,
+			InstanceCount:       2,
+			MultiprocessorCount: 48,
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {
+			Id:                  nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE,
+			SliceCount:          7,
+			InstanceCount:       1,
+			MultiprocessorCount: 112,
+		},
+	},
+}
+
+var a100_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+		{Start: 0, Size: 1},
+		{Start: 1, Size: 1},
+		{Start: 2, Size: 1},
+		{Start: 3, Size: 1},
+		{Start: 4, Size: 1},
+		{Start: 5, Size: 1},
+		{Start: 6, Size: 1},
+	},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+		{Start: 0, Size: 2},
+		{Start: 2, Size: 2},
+		{Start: 4, Size: 2},
+	},
+	nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
+		{Start: 0, Size: 3},
+		{Start: 4, Size: 3},
+	},
+	nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+		{Start: 0, Size: 4},
+	},
+	nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
+		{Start: 0, Size: 8}, // the 7-slice instance spans all 8 memory slices
+	},
+}
+
+// Keyed by GPU instance profile ID: 0 = 1-slice, 1 = 2-slice, 2 = 3-slice, 3 = 4-slice, 4 = 7-slice.
+var a100_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{
+	0: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			{Start: 0, Size: 1},
+		},
+	},
+	1: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			{Start: 0, Size: 1},
+			{Start: 1, Size: 1},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+			{Start: 0, Size: 2},
+		},
+	},
+	2: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			{Start: 0, Size: 1},
+			{Start: 1, Size: 1},
+			{Start: 2, Size: 1},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+			{Start: 0, Size: 2},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {
+			{Start: 0, Size: 3},
+		},
+	},
+	3: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			{Start: 0, Size: 1},
+			{Start: 1, Size: 1},
+			{Start: 2, Size: 1},
+			{Start: 3, Size: 1},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+			{Start: 0, Size: 2},
+			{Start: 2, Size: 2},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {
+			{Start: 0, Size: 4},
+		},
+	},
+	4: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			{Start: 0, Size: 1},
+			{Start: 1, Size: 1},
+			{Start: 2, Size: 1},
+			{Start: 3, Size: 1},
+			{Start: 4, Size: 1},
+			{Start: 5, Size: 1},
+			{Start: 6, Size: 1},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+			{Start: 0, Size: 2},
+			{Start: 2, Size: 2},
+			{Start: 4, Size: 2},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {
+			{Start: 0, Size: 3},
+			{Start: 4, Size: 3},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {
+			{Start: 0, Size: 8}, // Size 8 kept for compatibility with the existing dgxa100 tests
+		},
+	},
+}
diff --git a/pkg/nvml/mock/shared/gpus/a30.go b/pkg/nvml/mock/shared/gpus/a30.go
new file mode 100644
index 0000000..17085ba
--- /dev/null
+++ b/pkg/nvml/mock/shared/gpus/a30.go
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 
2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package gpus
+
+import (
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared"
+)
+
+// A30 GPU Variants
+var (
+	A30_PCIE_24GB = shared.Config{
+		Name:         "NVIDIA A30-PCIE-24GB",
+		Architecture: nvml.DEVICE_ARCH_AMPERE,
+		Brand:        nvml.BRAND_NVIDIA,
+		MemoryMB:     24576,
+		CudaMajor:    8,
+		CudaMinor:    0,
+		PciDeviceId:  0x20B710DE,
+		MIGProfiles:  a30_24gb_MIGProfiles,
+	}
+)
+
+var a30_24gb_MIGProfiles = shared.MIGProfileConfig{
+	GpuInstanceProfiles:       a30_24gb_GpuInstanceProfiles,
+	ComputeInstanceProfiles:   a30_ComputeInstanceProfiles,
+	GpuInstancePlacements:     a30_GpuInstancePlacements,
+	ComputeInstancePlacements: a30_ComputeInstancePlacements,
+}
+
+var a30_24gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+		Id:                  nvml.GPU_INSTANCE_PROFILE_1_SLICE,
+		IsP2pSupported:      0,
+		SliceCount:          1,
+		InstanceCount:       4,
+		MultiprocessorCount: 14,
+		CopyEngineCount:     1,
+		DecoderCount:        0,
+		EncoderCount:        0,
+		JpegCount:           0,
+		OfaCount:            0,
+		MemorySizeMB:        5836,
+	},
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
+		Id:                  nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1,
+		IsP2pSupported:      0,
+		SliceCount:          1,
+		InstanceCount:       1,
+		MultiprocessorCount: 14,
+		CopyEngineCount:     1,
+		DecoderCount:        1,
+		EncoderCount:        0,
+		JpegCount:           1,
+		OfaCount:            1,
+		MemorySizeMB:        5836,
+	},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+		Id:                  nvml.GPU_INSTANCE_PROFILE_2_SLICE,
+		IsP2pSupported:      0,
+		SliceCount:          2,
+		InstanceCount:       2,
+		MultiprocessorCount: 28,
+		CopyEngineCount:     2,
+		DecoderCount:        2,
+		EncoderCount:        0,
+		JpegCount:           0,
+		OfaCount:            0,
+		MemorySizeMB:        11672,
+	},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: {
+		Id:                  nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1,
+		IsP2pSupported:      0,
+		SliceCount:          2,
+		InstanceCount:       1,
+		MultiprocessorCount: 28,
+		CopyEngineCount:     2,
+		DecoderCount:        2,
+		EncoderCount:        0,
+		JpegCount:           1,
+		OfaCount:            1,
+		MemorySizeMB:        11672,
+	},
+	nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+		Id:                  nvml.GPU_INSTANCE_PROFILE_4_SLICE,
+		IsP2pSupported:      0,
+		SliceCount:          4,
+		InstanceCount:       1,
+		MultiprocessorCount: 56,
+		CopyEngineCount:     4,
+		DecoderCount:        4,
+		EncoderCount:        0,
+		JpegCount:           1,
+		OfaCount:            1,
+		MemorySizeMB:        23344,
+	},
+}
+
+var a30_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			Id:                  nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+			SliceCount:          1,
+			InstanceCount:       1,
+			MultiprocessorCount: 14,
+		},
+	},
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			Id:                  nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+			SliceCount:          1,
+			InstanceCount:       1,
+			MultiprocessorCount: 14,
+		},
+	},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			Id:                  nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+			SliceCount:          1,
+			InstanceCount:       2,
+			MultiprocessorCount: 
14, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 28, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 14, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 28, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 28, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + }, + }, +} + +var a30_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, +} + +var a30_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + }, +} diff --git a/pkg/nvml/mock/shared/gpus/b200.go b/pkg/nvml/mock/shared/gpus/b200.go new file mode 100644 index 0000000..6ca30a8 --- /dev/null +++ b/pkg/nvml/mock/shared/gpus/b200.go @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package gpus + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" +) + +// B200 GPU Variants +var ( + B200_SXM5_180GB = shared.Config{ + Name: "NVIDIA B200 180GB HBM3e", + Architecture: nvml.DEVICE_ARCH_BLACKWELL, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 184320, // 180GB + CudaMajor: 10, + CudaMinor: 0, + PciDeviceId: 0x2B0010DE, + MIGProfiles: b200_180gb_MIGProfiles, + } +) + +var ( + b200_180gb_MIGProfiles = shared.MIGProfileConfig{ + GpuInstanceProfiles: b200_180gb_GpuInstanceProfiles, + ComputeInstanceProfiles: b200_ComputeInstanceProfiles, + GpuInstancePlacements: b200_GpuInstancePlacements, + ComputeInstancePlacements: b200_ComputeInstancePlacements, + } +) + +var ( + b200_180gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 18, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 23552, // 23GB (MIG 1g.23gb) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 18, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 1, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 23552, // 23GB (MIG 1g.23gb+me) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 18, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 46080, // 45GB (MIG 1g.45gb) + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 1, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 36, + CopyEngineCount: 2, + DecoderCount: 2, + EncoderCount: 1, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 46080, // 45GB (MIG 2g.45gb) + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 1, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 54, + CopyEngineCount: 3, + DecoderCount: 3, + EncoderCount: 2, + JpegCount: 2, + OfaCount: 2, + MemorySizeMB: 92160, // 90GB (MIG 3g.90gb) + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 1, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 72, + CopyEngineCount: 4, + DecoderCount: 4, + EncoderCount: 2, + JpegCount: 2, + OfaCount: 2, + MemorySizeMB: 92160, // 90GB (MIG 4g.90gb) + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 1, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 126, + CopyEngineCount: 7, + DecoderCount: 7, + EncoderCount: 4, + JpegCount: 4, + OfaCount: 4, + MemorySizeMB: 184320, // 180GB (MIG 7g.180gb) + }, + } +) + +var b200_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 18, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + 
MultiprocessorCount: 18, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 18, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 18, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 36, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 18, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 36, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 54, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 18, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 36, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 72, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 18, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 36, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 54, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 126, + }, + }, +} + +var b200_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, +} + +var b200_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{ + 0: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + 1: { + 
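+		// GPU instance profile 1 (2-slice): two 1-slice CIs or one 2-slice CI.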
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + 2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + }, + }, + 3: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + }, + 4: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, + }, +} diff --git a/pkg/nvml/mock/shared/gpus/h100.go b/pkg/nvml/mock/shared/gpus/h100.go new file mode 100644 index 0000000..cd30163 --- /dev/null +++ b/pkg/nvml/mock/shared/gpus/h100.go @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package gpus + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" +) + +// H100 GPU Variants +var ( + H100_SXM5_80GB = shared.Config{ + Name: "NVIDIA H100 80GB HBM3", + Architecture: nvml.DEVICE_ARCH_HOPPER, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 81920, // 80GB + CudaMajor: 9, + CudaMinor: 0, + PciDeviceId: 0x233010DE, + MIGProfiles: h100_80gb_MIGProfiles, + } +) + +var ( + h100_80gb_MIGProfiles = shared.MIGProfileConfig{ + GpuInstanceProfiles: h100_80gb_GpuInstanceProfiles, + ComputeInstanceProfiles: h100_ComputeInstanceProfiles, + GpuInstancePlacements: h100_GpuInstancePlacements, + ComputeInstancePlacements: h100_ComputeInstancePlacements, + } +) + +var ( + h100_80gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 10240, // 10GB (MIG 1g.10gb) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 10240, // 10GB (MIG 1g.10gb+me) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 20480, // 20GB (MIG 1g.20gb) + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 1, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 20480, // 20GB (MIG 2g.20gb) + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 1, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 48, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 40960, // 40GB (MIG 3g.40gb) + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 1, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 40960, // 40GB (MIG 4g.40gb) + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 1, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 81920, // 80GB (MIG 7g.80gb) + }, + } +) + +var h100_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 
2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 48, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 48, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + }, + }, +} + +var h100_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, +} + +var h100_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{ + 0: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + 1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + 2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + }, + }, + 3: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + }, + 4: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + 
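+			// Multi-slice compute instances reuse the same start/size tiling as
+			// the GPU instance placements above.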
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, + }, +} diff --git a/pkg/nvml/mock/shared/gpus/h200.go b/pkg/nvml/mock/shared/gpus/h200.go new file mode 100644 index 0000000..1c20a2f --- /dev/null +++ b/pkg/nvml/mock/shared/gpus/h200.go @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package gpus + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared" +) + +// H200 GPU Variants +var ( + H200_SXM5_141GB = shared.Config{ + Name: "NVIDIA H200 141GB HBM3e", + Architecture: nvml.DEVICE_ARCH_HOPPER, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 144384, // 141GB + CudaMajor: 9, + CudaMinor: 0, + PciDeviceId: 0x233310DE, + MIGProfiles: h200_141gb_MIGProfiles, + } +) + +var ( + h200_141gb_MIGProfiles = shared.MIGProfileConfig{ + GpuInstanceProfiles: h200_141gb_GpuInstanceProfiles, + ComputeInstanceProfiles: h200_ComputeInstanceProfiles, + GpuInstancePlacements: h200_GpuInstancePlacements, + ComputeInstancePlacements: h200_ComputeInstancePlacements, + } +) + +var ( + h200_141gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 18432, // 18GB (MIG 1g.18gb) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 18432, // 18GB (MIG 1g.18gb+me) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 35840, // 35GB (MIG 1g.35gb) + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 1, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 35840, // 35GB (MIG 2g.35gb) + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 1, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 48, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 72704, // 71GB (MIG 3g.71gb) + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: 
nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 1, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 72704, // 71GB (MIG 4g.71gb) + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 1, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 144384, // 141GB (MIG 7g.141gb) + }, + } +) + +var h200_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 48, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 48, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + }, + }, +} + +var h200_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, +} + +var 
+var h200_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{
+	nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			{Start: 0, Size: 1},
+		},
+	},
+	nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			{Start: 0, Size: 1},
+			{Start: 1, Size: 1},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+			{Start: 0, Size: 2},
+		},
+	},
+	nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			{Start: 0, Size: 1},
+			{Start: 1, Size: 1},
+			{Start: 2, Size: 1},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+			{Start: 0, Size: 2},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {
+			{Start: 0, Size: 3},
+		},
+	},
+	nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			{Start: 0, Size: 1},
+			{Start: 1, Size: 1},
+			{Start: 2, Size: 1},
+			{Start: 3, Size: 1},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+			{Start: 0, Size: 2},
+			{Start: 2, Size: 2},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {
+			{Start: 0, Size: 4},
+		},
+	},
+	nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+			{Start: 0, Size: 1},
+			{Start: 1, Size: 1},
+			{Start: 2, Size: 1},
+			{Start: 3, Size: 1},
+			{Start: 4, Size: 1},
+			{Start: 5, Size: 1},
+			{Start: 6, Size: 1},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+			{Start: 0, Size: 2},
+			{Start: 2, Size: 2},
+			{Start: 4, Size: 2},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {
+			{Start: 0, Size: 3},
+			{Start: 4, Size: 3},
+		},
+		nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {
+			{Start: 0, Size: 7},
+		},
+	},
+}
diff --git a/pkg/nvml/mock/shared/shared.go b/pkg/nvml/mock/shared/shared.go
new file mode 100644
index 0000000..75d93c6
--- /dev/null
+++ b/pkg/nvml/mock/shared/shared.go
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package shared
+
+import (
+	"fmt"
+	"sync"
+
+	"github.com/google/uuid"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"github.com/NVIDIA/go-nvml/pkg/nvml/mock"
+)
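+
+// The mock types form a containment hierarchy that mirrors NVML's MIG object
+// model: a Server owns Devices, a Device owns GpuInstances, and a GpuInstance
+// owns ComputeInstances. Each type embeds the corresponding generated mock.*
+// type and wires up its *Func fields in SetMockFuncs. Servers carry a fixed
+// eight-slot device array, mirroring the 8-GPU DGX systems these mocks model;
+// configurations with fewer GPUs leave the trailing slots nil.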
+
+// Config contains the minimal configuration needed for a GPU generation
+type Config struct {
+	Name         string
+	Architecture nvml.DeviceArchitecture
+	Brand        nvml.BrandType
+	MemoryMB     uint64
+	CudaMajor    int
+	CudaMinor    int
+	PciDeviceId  uint32
+	MIGProfiles  MIGProfileConfig
+}
+
+// ServerConfig contains the minimal configuration needed for a server
+type ServerConfig struct {
+	Config            Config
+	GPUCount          int
+	DriverVersion     string
+	NvmlVersion       string
+	CudaDriverVersion int
+}
+
+// MIGProfileConfig contains the MIG profile configuration for a GPU
+type MIGProfileConfig struct {
+	GpuInstanceProfiles       map[int]nvml.GpuInstanceProfileInfo
+	ComputeInstanceProfiles   map[int]map[int]nvml.ComputeInstanceProfileInfo
+	GpuInstancePlacements     map[int][]nvml.GpuInstancePlacement
+	ComputeInstancePlacements map[int]map[int][]nvml.ComputeInstancePlacement
+}
+
+// Server provides a reusable server implementation
+type Server struct {
+	mock.Interface
+	mock.ExtendedInterface
+	Devices           [8]nvml.Device
+	DriverVersion     string
+	NvmlVersion       string
+	CudaDriverVersion int
+}
+
+// Device provides a reusable device implementation
+type Device struct {
+	mock.Device
+	sync.RWMutex
+	UUID                  string
+	Name                  string
+	Brand                 nvml.BrandType
+	Architecture          nvml.DeviceArchitecture
+	PciBusID              string
+	Minor                 int
+	Index                 int
+	CudaComputeCapability CudaComputeCapability
+	MigMode               int
+	GpuInstances          map[*GpuInstance]struct{}
+	GpuInstanceCounter    uint32
+	MemoryInfo            nvml.Memory
+	PciDeviceId           uint32
+	MIGProfiles           MIGProfileConfig
+}
+
+// GpuInstance provides a reusable GPU instance implementation
+type GpuInstance struct {
+	mock.GpuInstance
+	sync.RWMutex
+	Info                   nvml.GpuInstanceInfo
+	ComputeInstances       map[*ComputeInstance]struct{}
+	ComputeInstanceCounter uint32
+	MIGProfiles            MIGProfileConfig
+}
+
+// ComputeInstance provides a reusable compute instance implementation
+type ComputeInstance struct {
+	mock.ComputeInstance
+	Info nvml.ComputeInstanceInfo
+}
+
+// CudaComputeCapability represents a CUDA compute capability version
+type CudaComputeCapability struct {
+	Major int
+	Minor int
+}
+
+var _ nvml.Interface = (*Server)(nil)
+var _ nvml.Device = (*Device)(nil)
+var _ nvml.GpuInstance = (*GpuInstance)(nil)
+var _ nvml.ComputeInstance = (*ComputeInstance)(nil)
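+
+// As a sketch, a fully custom server can be assembled by filling in a
+// ServerConfig directly; the GPU count and version strings below are
+// illustrative values, not taken from a real system:
+//
+//	server := shared.NewServerFromConfig(shared.ServerConfig{
+//		Config:            gpus.H200_SXM5_141GB, // from the shared/gpus package
+//		GPUCount:          4,
+//		DriverVersion:     "550.54.15",
+//		NvmlVersion:       "12.550.54.15",
+//		CudaDriverVersion: 12040,
+//	})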
+
+// NewServerFromConfig creates a new server from the provided configuration
+func NewServerFromConfig(config ServerConfig) *Server {
+	var devices [8]nvml.Device
+	for i := 0; i < config.GPUCount && i < 8; i++ {
+		devices[i] = NewDeviceFromConfig(config.Config, i)
+	}
+
+	server := &Server{
+		Devices:           devices,
+		DriverVersion:     config.DriverVersion,
+		NvmlVersion:       config.NvmlVersion,
+		CudaDriverVersion: config.CudaDriverVersion,
+	}
+	server.SetMockFuncs()
+	return server
+}
+
+// NewDeviceFromConfig creates a new device from the provided GPU configuration
+func NewDeviceFromConfig(config Config, index int) *Device {
+	device := &Device{
+		UUID:         "GPU-" + uuid.New().String(),
+		Name:         config.Name,
+		Brand:        config.Brand,
+		Architecture: config.Architecture,
+		PciBusID:     fmt.Sprintf("0000:%02x:00.0", index),
+		Minor:        index,
+		Index:        index,
+		CudaComputeCapability: CudaComputeCapability{
+			Major: config.CudaMajor,
+			Minor: config.CudaMinor,
+		},
+		GpuInstances:       make(map[*GpuInstance]struct{}),
+		GpuInstanceCounter: 0,
+		MemoryInfo:         nvml.Memory{Total: config.MemoryMB * 1024 * 1024, Free: 0, Used: 0},
+		PciDeviceId:        config.PciDeviceId,
+		MIGProfiles:        config.MIGProfiles,
+	}
+	device.SetMockFuncs()
+	return device
+}
+
+// NewGpuInstanceFromInfo creates a new GPU instance
+func NewGpuInstanceFromInfo(info nvml.GpuInstanceInfo, profiles MIGProfileConfig) *GpuInstance {
+	gi := &GpuInstance{
+		Info:                   info,
+		ComputeInstances:       make(map[*ComputeInstance]struct{}),
+		ComputeInstanceCounter: 0,
+		MIGProfiles:            profiles,
+	}
+	gi.SetMockFuncs()
+	return gi
+}
+
+// NewComputeInstanceFromInfo creates a new compute instance
+func NewComputeInstanceFromInfo(info nvml.ComputeInstanceInfo) *ComputeInstance {
+	ci := &ComputeInstance{
+		Info: info,
+	}
+	ci.SetMockFuncs()
+	return ci
+}
+
+// SetMockFuncs configures all the mock function implementations for the server
+func (s *Server) SetMockFuncs() {
+	s.ExtensionsFunc = func() nvml.ExtendedInterface {
+		return s
+	}
+
+	s.LookupSymbolFunc = func(symbol string) error {
+		return nil
+	}
+
+	s.InitFunc = func() nvml.Return {
+		return nvml.SUCCESS
+	}
+
+	s.ShutdownFunc = func() nvml.Return {
+		return nvml.SUCCESS
+	}
+
+	s.SystemGetDriverVersionFunc = func() (string, nvml.Return) {
+		return s.DriverVersion, nvml.SUCCESS
+	}
+
+	s.SystemGetNVMLVersionFunc = func() (string, nvml.Return) {
+		return s.NvmlVersion, nvml.SUCCESS
+	}
+
+	s.SystemGetCudaDriverVersionFunc = func() (int, nvml.Return) {
+		return s.CudaDriverVersion, nvml.SUCCESS
+	}
+
+	s.DeviceGetCountFunc = func() (int, nvml.Return) {
+		return len(s.Devices), nvml.SUCCESS
+	}
+
+	s.DeviceGetHandleByIndexFunc = func(index int) (nvml.Device, nvml.Return) {
+		if index < 0 || index >= len(s.Devices) {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		return s.Devices[index], nvml.SUCCESS
+	}
+
+	s.DeviceGetHandleByUUIDFunc = func(uuid string) (nvml.Device, nvml.Return) {
+		for _, d := range s.Devices {
+			if device, ok := d.(*Device); ok && uuid == device.UUID {
+				return d, nvml.SUCCESS
+			}
+		}
+		return nil, nvml.ERROR_INVALID_ARGUMENT
+	}
+
+	s.DeviceGetHandleByPciBusIdFunc = func(busID string) (nvml.Device, nvml.Return) {
+		for _, d := range s.Devices {
+			if device, ok := d.(*Device); ok && busID == device.PciBusID {
+				return d, nvml.SUCCESS
+			}
+		}
+		return nil, nvml.ERROR_INVALID_ARGUMENT
+	}
+}
+
+// SetMockFuncs configures all the mock function implementations for the device
+func (d *Device) SetMockFuncs() {
+	d.GetMinorNumberFunc = func() (int, nvml.Return) {
+		return d.Minor, nvml.SUCCESS
+	}
+
+	d.GetIndexFunc = func() (int, nvml.Return) {
+		return d.Index, nvml.SUCCESS
+	}
+
+	d.GetCudaComputeCapabilityFunc = func() (int, int, nvml.Return) {
+		return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS
+	}
+
+	d.GetUUIDFunc = func() (string, nvml.Return) {
+		return d.UUID, nvml.SUCCESS
+	}
+
+	d.GetNameFunc = func() (string, nvml.Return) {
+		return d.Name, nvml.SUCCESS
+	}
+
+	d.GetBrandFunc = func() (nvml.BrandType, nvml.Return) {
+		return d.Brand, nvml.SUCCESS
+	}
+
+	d.GetArchitectureFunc = func() (nvml.DeviceArchitecture, nvml.Return) {
+		return d.Architecture, nvml.SUCCESS
+	}
+
+	d.GetMemoryInfoFunc = func() (nvml.Memory, nvml.Return) {
+		return d.MemoryInfo, nvml.SUCCESS
+	}
+
+	d.GetPciInfoFunc = func() (nvml.PciInfo, nvml.Return) {
+		p := nvml.PciInfo{
+			PciDeviceId: d.PciDeviceId,
+		}
+		return p, nvml.SUCCESS
+	}
+
+	d.SetMigModeFunc = func(mode int) (nvml.Return, nvml.Return) {
+		d.MigMode = mode
+		return nvml.SUCCESS, nvml.SUCCESS
+	}
+
+	d.GetMigModeFunc = func() (int, int, nvml.Return) {
+		return d.MigMode, d.MigMode, nvml.SUCCESS
+	}
+
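+	// Profile lookups distinguish a malformed profile id (ERROR_INVALID_ARGUMENT)
+	// from a well-formed id that this GPU simply does not support
+	// (ERROR_NOT_SUPPORTED); the split is intended to approximate real NVML
+	// error semantics.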
+	d.GetGpuInstanceProfileInfoFunc = func(giProfileId int) (nvml.GpuInstanceProfileInfo, nvml.Return) {
+		if giProfileId < 0 || giProfileId >= nvml.GPU_INSTANCE_PROFILE_COUNT {
+			return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT
+		}
+
+		if _, exists := d.MIGProfiles.GpuInstanceProfiles[giProfileId]; !exists {
+			return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
+		}
+
+		return d.MIGProfiles.GpuInstanceProfiles[giProfileId], nvml.SUCCESS
+	}
+
+	d.GetGpuInstancePossiblePlacementsFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) {
+		return d.MIGProfiles.GpuInstancePlacements[int(info.Id)], nvml.SUCCESS
+	}
+
+	d.CreateGpuInstanceFunc = func(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) {
+		d.Lock()
+		defer d.Unlock()
+		giInfo := nvml.GpuInstanceInfo{
+			Device:    d,
+			Id:        d.GpuInstanceCounter,
+			ProfileId: info.Id,
+		}
+		d.GpuInstanceCounter++
+		gi := NewGpuInstanceFromInfo(giInfo, d.MIGProfiles)
+		d.GpuInstances[gi] = struct{}{}
+		return gi, nvml.SUCCESS
+	}
+
+	d.CreateGpuInstanceWithPlacementFunc = func(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) {
+		d.Lock()
+		defer d.Unlock()
+		giInfo := nvml.GpuInstanceInfo{
+			Device:    d,
+			Id:        d.GpuInstanceCounter,
+			ProfileId: info.Id,
+			Placement: *placement,
+		}
+		d.GpuInstanceCounter++
+		gi := NewGpuInstanceFromInfo(giInfo, d.MIGProfiles)
+		d.GpuInstances[gi] = struct{}{}
+		return gi, nvml.SUCCESS
+	}
+
+	d.GetGpuInstancesFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) {
+		d.RLock()
+		defer d.RUnlock()
+		var gis []nvml.GpuInstance
+		for gi := range d.GpuInstances {
+			if gi.Info.ProfileId == info.Id {
+				gis = append(gis, gi)
+			}
+		}
+		return gis, nvml.SUCCESS
+	}
+}
+
+// SetMockFuncs configures all the mock function implementations for the GPU instance
+func (gi *GpuInstance) SetMockFuncs() {
+	gi.GetInfoFunc = func() (nvml.GpuInstanceInfo, nvml.Return) {
+		return gi.Info, nvml.SUCCESS
+	}
+
+	gi.GetComputeInstanceProfileInfoFunc = func(ciProfileId int, ciEngProfileId int) (nvml.ComputeInstanceProfileInfo, nvml.Return) {
+		if ciProfileId < 0 || ciProfileId >= nvml.COMPUTE_INSTANCE_PROFILE_COUNT {
+			return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT
+		}
+
+		if ciEngProfileId != nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED {
+			return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
+		}
+
+		giProfileId := int(gi.Info.ProfileId)
+
+		if _, exists := gi.MIGProfiles.ComputeInstanceProfiles[giProfileId]; !exists {
+			return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
+		}
+
+		if _, exists := gi.MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]; !exists {
+			return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
+		}
+
+		return gi.MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], nvml.SUCCESS
+	}
+
+	gi.GetComputeInstancePossiblePlacementsFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) {
+		return gi.MIGProfiles.ComputeInstancePlacements[int(gi.Info.ProfileId)][int(info.Id)], nvml.SUCCESS
+	}
+
+	gi.CreateComputeInstanceFunc = func(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) {
+		gi.Lock()
+		defer gi.Unlock()
+		ciInfo := nvml.ComputeInstanceInfo{
+			Device:      gi.Info.Device,
+			GpuInstance: gi,
+			Id:          gi.ComputeInstanceCounter,
+			ProfileId:   info.Id,
+		}
+		gi.ComputeInstanceCounter++
+		ci := NewComputeInstanceFromInfo(ciInfo)
+		gi.ComputeInstances[ci] = struct{}{}
+		return ci, nvml.SUCCESS
+	}
+
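+	// Enumeration takes the read lock while the create/destroy paths take the
+	// write lock, so a single mock hierarchy can be exercised from concurrent
+	// tests.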
+	gi.GetComputeInstancesFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) {
+		gi.RLock()
+		defer gi.RUnlock()
+		var cis []nvml.ComputeInstance
+		for ci := range gi.ComputeInstances {
+			if ci.Info.ProfileId == info.Id {
+				cis = append(cis, ci)
+			}
+		}
+		return cis, nvml.SUCCESS
+	}
+
+	gi.DestroyFunc = func() nvml.Return {
+		d := gi.Info.Device.(*Device)
+		d.Lock()
+		defer d.Unlock()
+		delete(d.GpuInstances, gi)
+		return nvml.SUCCESS
+	}
+}
+
+// SetMockFuncs configures all the mock function implementations for the compute instance
+func (ci *ComputeInstance) SetMockFuncs() {
+	ci.GetInfoFunc = func() (nvml.ComputeInstanceInfo, nvml.Return) {
+		return ci.Info, nvml.SUCCESS
+	}
+
+	ci.DestroyFunc = func() nvml.Return {
+		gi := ci.Info.GpuInstance.(*GpuInstance)
+		gi.Lock()
+		defer gi.Unlock()
+		delete(gi.ComputeInstances, ci)
+		return nvml.SUCCESS
+	}
+}
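+
+// A minimal end-to-end sketch of the MIG flow these mocks support; the
+// variable names and the chosen profiles are illustrative only:
+//
+//	dev, _ := server.DeviceGetHandleByIndex(0)
+//	giInfo, _ := dev.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE)
+//	gi, _ := dev.CreateGpuInstance(&giInfo)
+//	ciInfo, _ := gi.GetComputeInstanceProfileInfo(
+//		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED)
+//	ci, _ := gi.CreateComputeInstance(&ciInfo)
+//	_ = ci.Destroy()
+//	_ = gi.Destroy()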