Commit 51f3c75

docs: Update README.md with comprehensive H100, H200, and B200 documentation
Updates the mock framework documentation to include all GPU generations:

- Add H100, H200, and B200 to architecture diagram and file structure
- Document all GPU specifications:
  * H100 SXM5 80GB (Hopper, 132 SMs, CUDA 9.0, P2P MIG support)
  * H200 SXM5 141GB (Hopper, 132 SMs, CUDA 9.0, P2P MIG support)
  * B200 SXM5 180GB (Blackwell, 144 SMs, CUDA 10.0, P2P MIG support)
- Add complete server model documentation with driver versions
- Expand MIG support section with detailed SM allocations and P2P capabilities
- Update usage examples to include all four GPU generations
- Add comprehensive testing instructions for all mock implementations
- Update backward compatibility section to reflect all generations

The documentation now accurately reflects the complete shared factory implementation, with comprehensive coverage of the Ampere, Hopper, and Blackwell GPU architectures.

Signed-off-by: Fabien Dupont <[email protected]>
1 parent 48cd921 commit 51f3c75

File tree

1 file changed: +153 −34 lines

1 file changed

+153
-34
lines changed

pkg/nvml/mock/README.md

Lines changed: 153 additions & 34 deletions
@@ -9,12 +9,23 @@ pkg/nvml/mock/
 ├── shared/
 │   ├── shared.go           # Core shared factory and types
 │   └── gpus/               # GPU configuration definitions
-│       ├── a100.go         # A100 GPU variants
-│       └── a30.go          # A30 GPU variants
-└── dgxa100/                # DGX A100 implementation
-    ├── dgxa100.go          # Server and device implementation
-    ├── gpus.go             # Legacy A100 configurations and MIG profiles
-    └── dgxa100_test.go     # Comprehensive tests
+│       ├── a100.go         # A100 GPU variants (Ampere)
+│       ├── a30.go          # A30 GPU variants (Ampere)
+│       ├── h100.go         # H100 GPU variants (Hopper)
+│       ├── h200.go         # H200 GPU variants (Hopper)
+│       └── b200.go         # B200 GPU variants (Blackwell)
+├── dgxa100/                # DGX A100 implementation
+│   ├── dgxa100.go          # Server and device implementation
+│   └── dgxa100_test.go     # Comprehensive tests
+├── dgxh100/                # DGX H100 implementation
+│   ├── dgxh100.go          # Server and device implementation
+│   └── dgxh100_test.go     # Comprehensive tests
+├── dgxh200/                # DGX H200 implementation
+│   ├── dgxh200.go          # Server and device implementation
+│   └── dgxh200_test.go     # Comprehensive tests
+└── dgxb200/                # DGX B200 implementation
+    ├── dgxb200.go          # Server and device implementation
+    └── dgxb200_test.go     # Comprehensive tests
 ```

 ## Core Concepts
@@ -44,26 +55,37 @@ Define Multi-Instance GPU capabilities including:
 ```go
 import (
     "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
+    "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh100"
+    "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh200"
+    "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxb200"
     "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus"
 )

-// Create default A100 system
+// Create default systems
 serverA100 := dgxa100.New() // A100-SXM4-40GB (8 GPUs)
+serverH100 := dgxh100.New() // H100-SXM5-80GB (8 GPUs)
+serverH200 := dgxh200.New() // H200-SXM5-141GB (8 GPUs)
+serverB200 := dgxb200.New() // B200-SXM5-180GB (8 GPUs)

-// Create specific A100 variants
+// Create specific variants
 serverA100_80GB := dgxa100.NewServerWithGPU(gpus.A100_SXM4_80GB)
-serverA100_PCIE := dgxa100.NewServerWithGPU(gpus.A100_PCIE_40GB)
+serverH200_Custom := dgxh200.NewServerWithGPU(gpus.H200_SXM5_141GB)
+serverB200_Custom := dgxb200.NewServerWithGPU(gpus.B200_SXM5_180GB)
 ```

 ### Device Creation

 ```go
-// Create device with default configuration
-device := dgxa100.NewDevice(0)
+// Create devices with default configurations
+deviceA100 := dgxa100.NewDevice(0)
+deviceH100 := dgxh100.NewDevice(0)
+deviceH200 := dgxh200.NewDevice(0)
+deviceB200 := dgxb200.NewDevice(0)

-// Create device with specific GPU variant
+// Create devices with specific GPU variants
 deviceA100_80GB := dgxa100.NewDeviceWithGPU(gpus.A100_SXM4_80GB, 0)
-deviceA100_PCIE := dgxa100.NewDeviceWithGPU(gpus.A100_PCIE_40GB, 1)
+deviceH200_Custom := dgxh200.NewDeviceWithGPU(gpus.H200_SXM5_141GB, 1)
+deviceB200_Custom := dgxb200.NewDeviceWithGPU(gpus.B200_SXM5_180GB, 2)
 ```

 ### Accessing GPU Configurations
@@ -79,11 +101,30 @@ gpus.A100_PCIE_80GB // A100 PCIe 80GB
 // A30 Family
 gpus.A30_PCIE_24GB // A30 PCIe 24GB

+// H100 Family
+gpus.H100_SXM5_80GB // H100 SXM5 80GB
+
+// H200 Family
+gpus.H200_SXM5_141GB // H200 SXM5 141GB
+
+// B200 Family
+gpus.B200_SXM5_180GB // B200 SXM5 180GB
+
 // Inspect configurations
 fmt.Printf("GPU: %s\n", gpus.A100_SXM4_80GB.Name)
 fmt.Printf("Memory: %d MB\n", gpus.A100_SXM4_80GB.MemoryMB)
 fmt.Printf("Architecture: %v\n", gpus.A100_SXM4_80GB.Architecture)
 fmt.Printf("PCI Device ID: 0x%X\n", gpus.A100_SXM4_80GB.PciDeviceId)
+
+// Inspect H100 configuration
+fmt.Printf("GPU: %s\n", gpus.H100_SXM5_80GB.Name)
+fmt.Printf("Memory: %d MB\n", gpus.H100_SXM5_80GB.MemoryMB)
+fmt.Printf("CUDA Major: %d\n", gpus.H100_SXM5_80GB.CudaMajor)
+
+// Inspect B200 configuration
+fmt.Printf("GPU: %s\n", gpus.B200_SXM5_180GB.Name)
+fmt.Printf("Memory: %d MB\n", gpus.B200_SXM5_180GB.MemoryMB)
+fmt.Printf("CUDA Major: %d\n", gpus.B200_SXM5_180GB.CudaMajor)
 ```

 ## Available GPU Models
@@ -127,6 +168,39 @@ fmt.Printf("PCI Device ID: 0x%X\n", gpus.A100_SXM4_80GB.PciDeviceId)
   - MIG P2P: Not supported (`IsP2pSupported: 0`)
   - MIG slices: 1, 2, 4 (no 3-slice or 7-slice support)

+### H100 Family (Hopper Architecture, 132 SMs)
+
+- **H100 SXM5 80GB** (`gpus.H100_SXM5_80GB`)
+  - Form factor: SXM5
+  - Memory: 80GB HBM3
+  - PCI Device ID: 0x233010DE
+  - CUDA Capability: 9.0
+  - SMs per slice: 16 (1-slice), 32 (2-slice), 48 (3-slice), 64 (4-slice), 112 (7-slice)
+  - MIG P2P: Supported (`IsP2pSupported: 1`)
+  - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+
+### H200 Family (Hopper Architecture, 132 SMs)
+
+- **H200 SXM5 141GB** (`gpus.H200_SXM5_141GB`)
+  - Form factor: SXM5
+  - Memory: 141GB HBM3e
+  - PCI Device ID: 0x233310DE
+  - CUDA Capability: 9.0
+  - SMs per slice: 16 (1-slice), 32 (2-slice), 48 (3-slice), 64 (4-slice), 112 (7-slice)
+  - MIG P2P: Supported (`IsP2pSupported: 1`)
+  - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+
+### B200 Family (Blackwell Architecture, 144 SMs)
+
+- **B200 SXM5 180GB** (`gpus.B200_SXM5_180GB`)
+  - Form factor: SXM5
+  - Memory: 180GB HBM3e
+  - PCI Device ID: 0x2B0010DE
+  - CUDA Capability: 10.0
+  - SMs per slice: 18 (1-slice), 36 (2-slice), 54 (3-slice), 72 (4-slice), 126 (7-slice)
+  - MIG P2P: Supported (`IsP2pSupported: 1`)
+  - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+
 ## Available Server Models

 ### DGX A100 Family
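
For reference, the per-generation specs above can be read straight off the `shared/gpus` configurations. A minimal sketch (not part of this diff), assuming the `shared.Config` fields shown earlier in the README (`Name`, `MemoryMB`, `CudaMajor`, `CudaMinor`, `PciDeviceId`) are exported as plain struct values:

```go
package main

import (
	"fmt"

	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus"
)

func main() {
	// One representative configuration per GPU family documented above.
	configs := []shared.Config{
		gpus.A100_SXM4_80GB,
		gpus.H100_SXM5_80GB,
		gpus.H200_SXM5_141GB,
		gpus.B200_SXM5_180GB,
	}
	for _, cfg := range configs {
		// Prints name, memory, CUDA capability, and PCI device ID for each config.
		fmt.Printf("%-30s %7d MB  CUDA %d.%d  PCI 0x%X\n",
			cfg.Name, cfg.MemoryMB, cfg.CudaMajor, cfg.CudaMinor, cfg.PciDeviceId)
	}
}
```

The output should line up with the memory sizes, CUDA capabilities, and PCI device IDs listed in the family tables above.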
@@ -137,16 +211,56 @@ fmt.Printf("PCI Device ID: 0x%X\n", gpus.A100_SXM4_80GB.PciDeviceId)
   - NVML: 12.550.54.15
   - CUDA: 12040

+### DGX H100 Family
+
+- **DGX H100 80GB** (default)
+  - 8x H100 SXM5 80GB GPUs
+  - Driver: 550.54.15
+  - NVML: 12.550.54.15
+  - CUDA: 12040
+
+### DGX H200 Family
+
+- **DGX H200 141GB** (default)
+  - 8x H200 SXM5 141GB GPUs
+  - Driver: 550.54.15
+  - NVML: 12.550.54.15
+  - CUDA: 12040
+
+### DGX B200 Family
+
+- **DGX B200 180GB** (default)
+  - 8x B200 SXM5 180GB GPUs
+  - Driver: 560.28.03
+  - NVML: 12.560.28.03
+  - CUDA: 12060
+
 ## MIG (Multi-Instance GPU) Support

 All GPU configurations include comprehensive MIG profile definitions:

 - **A100**: No P2P support in MIG (`IsP2pSupported: 0`)
   - Memory profiles differ between 40GB and 80GB variants
   - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices)
+  - 108 SMs total with 14 SMs per slice
 - **A30**: No P2P support in MIG (`IsP2pSupported: 0`)
   - Supports limited MIG slice configurations (1, 2, 4 slices only)
   - 56 SMs total with 14 SMs per slice
+- **H100**: Full P2P support in MIG (`IsP2pSupported: 1`)
+  - 80GB HBM3 memory with optimized slice allocations
+  - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices)
+  - 132 SMs total with 16 SMs per slice
+  - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+- **H200**: Full P2P support in MIG (`IsP2pSupported: 1`)
+  - 141GB HBM3e memory with enhanced capacity
+  - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices)
+  - 132 SMs total with 16 SMs per slice
+  - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+- **B200**: Full P2P support in MIG (`IsP2pSupported: 1`)
+  - 180GB HBM3e memory with next-generation capacity
+  - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices)
+  - 144 SMs total with 18 SMs per slice
+  - Includes REV1 (media extensions) and REV2 (expanded memory) profiles

 ### MIG Operations

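To illustrate the server models documented in the hunk above, a small sketch (not from this commit) that reads the driver, NVML, and CUDA versions back through the standard NVML entry points. It assumes each generation's server satisfies `nvml.Interface` and wires up `SystemGetDriverVersion`, `SystemGetNVMLVersion`, and `SystemGetCudaDriverVersion` the same way the upstream `dgxa100` mock does:

```go
package main

import (
	"fmt"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxb200"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh100"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh200"
)

func main() {
	// One mock server per documented generation.
	servers := map[string]nvml.Interface{
		"DGX A100": dgxa100.New(),
		"DGX H100": dgxh100.New(),
		"DGX H200": dgxh200.New(),
		"DGX B200": dgxb200.New(),
	}
	for name, srv := range servers {
		driver, _ := srv.SystemGetDriverVersion()
		nvmlVersion, _ := srv.SystemGetNVMLVersion()
		cudaVersion, _ := srv.SystemGetCudaDriverVersion()
		// Expected to match the tables above, e.g. 560.28.03 / 12.560.28.03 / 12060 for DGX B200.
		fmt.Printf("%s: driver %s, NVML %s, CUDA %d\n", name, driver, nvmlVersion, cudaVersion)
	}
}
```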
@@ -186,11 +300,15 @@ The framework includes comprehensive tests covering:
 # Run all mock tests
 go test ./pkg/nvml/mock/...

-# Run A100 specific tests
+# Run generation specific tests
 go test -v ./pkg/nvml/mock/dgxa100/
+go test -v ./pkg/nvml/mock/dgxh100/
+go test -v ./pkg/nvml/mock/dgxh200/
+go test -v ./pkg/nvml/mock/dgxb200/

 # Run specific test
 go test -v ./pkg/nvml/mock/dgxa100/ -run TestMIGProfilesExist
+go test -v ./pkg/nvml/mock/dgxh100/ -run TestMIGProfilesExist
 ```
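
Consumers can also exercise the mocks from their own test suites. A hypothetical sketch (not part of this commit), assuming the per-generation servers expose `DeviceGetCount`, `DeviceGetHandleByIndex`, and the device-level `GetName` the way the upstream `dgxa100` mock does:

```go
package mock_test

import (
	"testing"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxb200"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh100"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh200"
)

func TestAllGenerationsEnumerateDevices(t *testing.T) {
	servers := map[string]nvml.Interface{
		"dgxa100": dgxa100.New(),
		"dgxh100": dgxh100.New(),
		"dgxh200": dgxh200.New(),
		"dgxb200": dgxb200.New(),
	}
	for name, srv := range servers {
		count, ret := srv.DeviceGetCount()
		if ret != nvml.SUCCESS {
			t.Fatalf("%s: DeviceGetCount failed: %v", name, ret)
		}
		if count != 8 {
			t.Errorf("%s: expected 8 GPUs, got %d", name, count)
		}
		for i := 0; i < count; i++ {
			// Every index should resolve to a device with a non-empty name.
			dev, ret := srv.DeviceGetHandleByIndex(i)
			if ret != nvml.SUCCESS {
				t.Fatalf("%s: DeviceGetHandleByIndex(%d) failed: %v", name, i, ret)
			}
			if devName, ret := dev.GetName(); ret != nvml.SUCCESS || devName == "" {
				t.Errorf("%s: device %d has no name (ret=%v)", name, i, ret)
			}
		}
	}
}
```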

 ## Extending the Framework
@@ -213,46 +331,47 @@ var A100_PCIE_24GB = shared.Config{

 ### Adding GPU Generations

-1. **Create new package** (e.g., `dgxa100/`)
-2. **Define GPU configurations** in `shared/gpus/a100.go`
+1. **Create new package** (e.g., `dgxb200/`)
+2. **Define GPU configurations** in `shared/gpus/b200.go`
 3. **Define MIG profiles** with appropriate memory and SM allocations
 4. **Implement server and device factory functions**
 5. **Add comprehensive tests**

-Example structure for A100 generation:
+Example structure for B200 generation:
 ```go
-// In shared/gpus/a100.go
-var A100_SXM4_80GB = shared.Config{
-    Name:         "NVIDIA A100 SXM4 80GB",
-    Architecture: nvml.DEVICE_ARCH_AMPERE,
+// In shared/gpus/b200.go
+var B200_SXM5_180GB = shared.Config{
+    Name:         "NVIDIA B200 180GB HBM3e",
+    Architecture: nvml.DEVICE_ARCH_BLACKWELL,
     Brand:        nvml.BRAND_NVIDIA,
-    MemoryMB:     81920,
-    CudaMajor:    8,
+    MemoryMB:     184320, // 180GB
+    CudaMajor:    10,
     CudaMinor:    0,
-    PciDeviceId:  0x20B210DE,
-    MIGProfiles:  a100MIGProfiles,
+    PciDeviceId:  0x2B0010DE,
+    MIGProfiles:  b200_180gb_MIGProfiles,
 }

-// In dgxa100/dgxa100.go
+// In dgxb200/dgxb200.go
 func New() *Server {
     return shared.NewServerFromConfig(shared.ServerConfig{
-        Config:            gpus.A100_SXM4_80GB,
-        GPUCount:          4,
-        DriverVersion:     "550.54.15",
-        NvmlVersion:       "12.550.54.15",
-        CudaDriverVersion: 12040,
+        Config:            gpus.B200_SXM5_180GB,
+        GPUCount:          8,
+        DriverVersion:     "560.28.03",
+        NvmlVersion:       "12.560.28.03",
+        CudaDriverVersion: 12060,
     })
 }
 ```

 ## Backward Compatibility

 The framework maintains full backward compatibility:
-- All existing `dgxa100.New()` calls continue to work unchanged
-- Legacy global variables (`MIGProfiles`, `MIGPlacements`) are preserved
+- All existing `dgxa100.New()`, `dgxh100.New()`, `dgxh200.New()`, `dgxb200.New()` calls continue to work unchanged
+- Legacy global variables (`MIGProfiles`, `MIGPlacements`) are preserved for all generations
 - Device names maintain "Mock" prefix for test compatibility
 - All existing tests pass without modification
-- A100 configurations now reference `shared/gpus` package
+- All GPU configurations reference `shared/gpus` package for consistency
+- Type aliases ensure seamless transition from generation-specific types

 ## Performance Considerations

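Tying the "Adding GPU Generations" steps above together, a hypothetical sketch of adding an H100 PCIe variant on top of the shared factory. The `H100_PCIE_80GB` name, its PCI device ID, the `h100MIGProfiles` variable, and the GPU count are placeholders invented for illustration; the `shared.Config` and `shared.ServerConfig` fields follow the B200 example in the diff above:

```go
// In shared/gpus/h100.go (hypothetical addition)
var H100_PCIE_80GB = shared.Config{
	Name:         "NVIDIA H100 PCIe 80GB",
	Architecture: nvml.DEVICE_ARCH_HOPPER,
	Brand:        nvml.BRAND_NVIDIA,
	MemoryMB:     81920, // 80GB
	CudaMajor:    9,
	CudaMinor:    0,
	PciDeviceId:  0x0, // placeholder: substitute the real PCI device ID
	MIGProfiles:  h100MIGProfiles, // reuse the existing H100 profile set
}

// In a new server package, where Server aliases the shared server type
func New() *Server {
	return shared.NewServerFromConfig(shared.ServerConfig{
		Config:            gpus.H100_PCIE_80GB,
		GPUCount:          4,
		DriverVersion:     "550.54.15",
		NvmlVersion:       "12.550.54.15",
		CudaDriverVersion: 12040,
	})
}
```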