@@ -9,12 +9,23 @@ pkg/nvml/mock/
├── shared/
│   ├── shared.go           # Core shared factory and types
│   └── gpus/               # GPU configuration definitions
- │       ├── a100.go         # A100 GPU variants
- │       └── a30.go          # A30 GPU variants
- └── dgxa100/                # DGX A100 implementation
-     ├── dgxa100.go          # Server and device implementation
-     ├── gpus.go             # Legacy A100 configurations and MIG profiles
-     └── dgxa100_test.go     # Comprehensive tests
+ │       ├── a100.go         # A100 GPU variants (Ampere)
+ │       ├── a30.go          # A30 GPU variants (Ampere)
+ │       ├── h100.go         # H100 GPU variants (Hopper)
+ │       ├── h200.go         # H200 GPU variants (Hopper)
+ │       └── b200.go         # B200 GPU variants (Blackwell)
+ ├── dgxa100/                # DGX A100 implementation
+ │   ├── dgxa100.go          # Server and device implementation
+ │   └── dgxa100_test.go     # Comprehensive tests
+ ├── dgxh100/                # DGX H100 implementation
+ │   ├── dgxh100.go          # Server and device implementation
+ │   └── dgxh100_test.go     # Comprehensive tests
+ ├── dgxh200/                # DGX H200 implementation
+ │   ├── dgxh200.go          # Server and device implementation
+ │   └── dgxh200_test.go     # Comprehensive tests
+ └── dgxb200/                # DGX B200 implementation
+     ├── dgxb200.go          # Server and device implementation
+     └── dgxb200_test.go     # Comprehensive tests
```

## Core Concepts
@@ -44,26 +55,37 @@ Define Multi-Instance GPU capabilities including:
```go
import (
    "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
+   "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh100"
+   "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh200"
+   "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxb200"
    "github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus"
)

- // Create default A100 system
+ // Create default systems
serverA100 := dgxa100.New() // A100-SXM4-40GB (8 GPUs)
+ serverH100 := dgxh100.New() // H100-SXM5-80GB (8 GPUs)
+ serverH200 := dgxh200.New() // H200-SXM5-141GB (8 GPUs)
+ serverB200 := dgxb200.New() // B200-SXM5-180GB (8 GPUs)

- // Create specific A100 variants
+ // Create specific variants
serverA100_80GB := dgxa100.NewServerWithGPU(gpus.A100_SXM4_80GB)
- serverA100_PCIE := dgxa100.NewServerWithGPU(gpus.A100_PCIE_40GB)
+ serverH200_Custom := dgxh200.NewServerWithGPU(gpus.H200_SXM5_141GB)
+ serverB200_Custom := dgxb200.NewServerWithGPU(gpus.B200_SXM5_180GB)
```

### Device Creation

```go
- // Create device with default configuration
- device := dgxa100.NewDevice(0)
+ // Create devices with default configurations
+ deviceA100 := dgxa100.NewDevice(0)
+ deviceH100 := dgxh100.NewDevice(0)
+ deviceH200 := dgxh200.NewDevice(0)
+ deviceB200 := dgxb200.NewDevice(0)

- // Create device with specific GPU variant
+ // Create devices with specific GPU variants
deviceA100_80GB := dgxa100.NewDeviceWithGPU(gpus.A100_SXM4_80GB, 0)
- deviceA100_PCIE := dgxa100.NewDeviceWithGPU(gpus.A100_PCIE_40GB, 1)
+ deviceH200_Custom := dgxh200.NewDeviceWithGPU(gpus.H200_SXM5_141GB, 1)
+ deviceB200_Custom := dgxb200.NewDeviceWithGPU(gpus.B200_SXM5_180GB, 2)
```

### Accessing GPU Configurations
@@ -79,11 +101,30 @@ gpus.A100_PCIE_80GB // A100 PCIe 80GB
// A30 Family
gpus.A30_PCIE_24GB // A30 PCIe 24GB

+ // H100 Family
+ gpus.H100_SXM5_80GB // H100 SXM5 80GB
+
+ // H200 Family
+ gpus.H200_SXM5_141GB // H200 SXM5 141GB
+
+ // B200 Family
+ gpus.B200_SXM5_180GB // B200 SXM5 180GB
+
// Inspect configurations
fmt.Printf("GPU: %s\n", gpus.A100_SXM4_80GB.Name)
fmt.Printf("Memory: %d MB\n", gpus.A100_SXM4_80GB.MemoryMB)
fmt.Printf("Architecture: %v\n", gpus.A100_SXM4_80GB.Architecture)
fmt.Printf("PCI Device ID: 0x%X\n", gpus.A100_SXM4_80GB.PciDeviceId)
+
+ // Inspect H100 configuration
+ fmt.Printf("GPU: %s\n", gpus.H100_SXM5_80GB.Name)
+ fmt.Printf("Memory: %d MB\n", gpus.H100_SXM5_80GB.MemoryMB)
+ fmt.Printf("CUDA Major: %d\n", gpus.H100_SXM5_80GB.CudaMajor)
+
+ // Inspect B200 configuration
+ fmt.Printf("GPU: %s\n", gpus.B200_SXM5_180GB.Name)
+ fmt.Printf("Memory: %d MB\n", gpus.B200_SXM5_180GB.MemoryMB)
+ fmt.Printf("CUDA Major: %d\n", gpus.B200_SXM5_180GB.CudaMajor)
```

## Available GPU Models
@@ -127,6 +168,39 @@ fmt.Printf("PCI Device ID: 0x%X\n", gpus.A100_SXM4_80GB.PciDeviceId)
- MIG P2P: Not supported (`IsP2pSupported: 0`)
- MIG slices: 1, 2, 4 (no 3-slice or 7-slice support)

+ ### H100 Family (Hopper Architecture, 132 SMs)
+
+ - **H100 SXM5 80GB** (`gpus.H100_SXM5_80GB`)
+   - Form factor: SXM5
+   - Memory: 80GB HBM3
+   - PCI Device ID: 0x233010DE
+   - CUDA Capability: 9.0
+   - SMs per slice: 16 (1-slice), 32 (2-slice), 48 (3-slice), 64 (4-slice), 112 (7-slice)
+   - MIG P2P: Supported (`IsP2pSupported: 1`)
+   - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+
+ ### H200 Family (Hopper Architecture, 132 SMs)
+
+ - **H200 SXM5 141GB** (`gpus.H200_SXM5_141GB`)
+   - Form factor: SXM5
+   - Memory: 141GB HBM3e
+   - PCI Device ID: 0x233310DE
+   - CUDA Capability: 9.0
+   - SMs per slice: 16 (1-slice), 32 (2-slice), 48 (3-slice), 64 (4-slice), 112 (7-slice)
+   - MIG P2P: Supported (`IsP2pSupported: 1`)
+   - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+
+ ### B200 Family (Blackwell Architecture, 144 SMs)
+
+ - **B200 SXM5 180GB** (`gpus.B200_SXM5_180GB`)
+   - Form factor: SXM5
+   - Memory: 180GB HBM3e
+   - PCI Device ID: 0x2B0010DE
+   - CUDA Capability: 10.0
+   - SMs per slice: 18 (1-slice), 36 (2-slice), 54 (3-slice), 72 (4-slice), 126 (7-slice)
+   - MIG P2P: Supported (`IsP2pSupported: 1`)
+   - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+
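The variants above can be compared programmatically through the `shared/gpus` package. A minimal sketch, using the `shared.Config` fields shown in the extension example later in this document (treat the exact field set as illustrative):

```go
package main

import (
	"fmt"

	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/shared/gpus"
)

func main() {
	// Print the headline specs for one variant per family.
	for _, cfg := range []shared.Config{
		gpus.A100_SXM4_80GB,
		gpus.H100_SXM5_80GB,
		gpus.H200_SXM5_141GB,
		gpus.B200_SXM5_180GB,
	} {
		fmt.Printf("%-30s %7d MB  CC %d.%d  PCI 0x%X\n",
			cfg.Name, cfg.MemoryMB, cfg.CudaMajor, cfg.CudaMinor, cfg.PciDeviceId)
	}
}
```
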
## Available Server Models
### DGX A100 Family
@@ -137,16 +211,56 @@ fmt.Printf("PCI Device ID: 0x%X\n", gpus.A100_SXM4_80GB.PciDeviceId)
  - NVML: 12.550.54.15
  - CUDA: 12040

+ ### DGX H100 Family
+
+ - **DGX H100 80GB** (default)
+   - 8x H100 SXM5 80GB GPUs
+   - Driver: 550.54.15
+   - NVML: 12.550.54.15
+   - CUDA: 12040
+
+ ### DGX H200 Family
+
+ - **DGX H200 141GB** (default)
+   - 8x H200 SXM5 141GB GPUs
+   - Driver: 550.54.15
+   - NVML: 12.550.54.15
+   - CUDA: 12040
+
+ ### DGX B200 Family
+
+ - **DGX B200 180GB** (default)
+   - 8x B200 SXM5 180GB GPUs
+   - Driver: 560.28.03
+   - NVML: 12.560.28.03
+   - CUDA: 12060
+
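Each mock server reports the driver, NVML, and CUDA versions listed above through the standard NVML system queries. A minimal sketch, assuming the new packages expose the same server surface as `dgxa100`:

```go
package main

import (
	"fmt"

	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxb200" // assumed to mirror the dgxa100 server API
)

func main() {
	// DGX A100 defaults: 550.54.15 / 12.550.54.15 / 12040.
	a100 := dgxa100.New()
	driver, _ := a100.SystemGetDriverVersion()
	nvmlVer, _ := a100.SystemGetNVMLVersion()
	cuda, _ := a100.SystemGetCudaDriverVersion()
	fmt.Printf("DGX A100: driver=%s nvml=%s cuda=%d\n", driver, nvmlVer, cuda)

	// DGX B200 defaults: 560.28.03 / 12.560.28.03 / 12060.
	b200 := dgxb200.New()
	driver, _ = b200.SystemGetDriverVersion()
	nvmlVer, _ = b200.SystemGetNVMLVersion()
	cuda, _ = b200.SystemGetCudaDriverVersion()
	fmt.Printf("DGX B200: driver=%s nvml=%s cuda=%d\n", driver, nvmlVer, cuda)
}
```
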
## MIG (Multi-Instance GPU) Support

All GPU configurations include comprehensive MIG profile definitions:

- **A100**: No P2P support in MIG (`IsP2pSupported: 0`)
  - Memory profiles differ between 40GB and 80GB variants
  - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices)
+   - 108 SMs total with 14 SMs per slice
- **A30**: No P2P support in MIG (`IsP2pSupported: 0`)
  - Supports limited MIG slice configurations (1, 2, 4 slices only)
  - 56 SMs total with 14 SMs per slice
+ - **H100**: Full P2P support in MIG (`IsP2pSupported: 1`)
+   - 80GB HBM3 memory with optimized slice allocations
+   - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices)
+   - 132 SMs total with 16 SMs per slice
+   - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+ - **H200**: Full P2P support in MIG (`IsP2pSupported: 1`)
+   - 141GB HBM3e memory with enhanced capacity
+   - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices)
+   - 132 SMs total with 16 SMs per slice
+   - Includes REV1 (media extensions) and REV2 (expanded memory) profiles
+ - **B200**: Full P2P support in MIG (`IsP2pSupported: 1`)
+   - 180GB HBM3e memory with next-generation capacity
+   - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices)
+   - 144 SMs total with 18 SMs per slice
+   - Includes REV1 (media extensions) and REV2 (expanded memory) profiles

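The per-generation differences above (SMs per slice and MIG P2P support) can be read back from the mock profile data. A minimal sketch, assuming each generation keeps a legacy `MIGProfiles` global in the same shape as today's `dgxa100` package (`GpuInstanceProfiles` keyed by the `nvml.GPU_INSTANCE_PROFILE_*` constants):

```go
package main

import (
	"fmt"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh100" // assumed to mirror dgxa100's globals
)

func main() {
	// A100 1-slice GPU instance profile: expect 14 SMs and IsP2pSupported == 0.
	a100 := dgxa100.MIGProfiles.GpuInstanceProfiles[nvml.GPU_INSTANCE_PROFILE_1_SLICE]
	fmt.Printf("A100 1-slice: %d SMs, P2P=%d\n", a100.MultiprocessorCount, a100.IsP2pSupported)

	// H100 1-slice GPU instance profile: expect 16 SMs and IsP2pSupported == 1.
	h100 := dgxh100.MIGProfiles.GpuInstanceProfiles[nvml.GPU_INSTANCE_PROFILE_1_SLICE]
	fmt.Printf("H100 1-slice: %d SMs, P2P=%d\n", h100.MultiprocessorCount, h100.IsP2pSupported)
}
```
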
### MIG Operations
@@ -186,11 +300,15 @@ The framework includes comprehensive tests covering:
# Run all mock tests
go test ./pkg/nvml/mock/...

- # Run A100 specific tests
+ # Run generation-specific tests
go test -v ./pkg/nvml/mock/dgxa100/
+ go test -v ./pkg/nvml/mock/dgxh100/
+ go test -v ./pkg/nvml/mock/dgxh200/
+ go test -v ./pkg/nvml/mock/dgxb200/

# Run specific test
go test -v ./pkg/nvml/mock/dgxa100/ -run TestMIGProfilesExist
+ go test -v ./pkg/nvml/mock/dgxh100/ -run TestMIGProfilesExist
```

## Extending the Framework
@@ -213,46 +331,47 @@ var A100_PCIE_24GB = shared.Config{

### Adding GPU Generations

- 1. **Create new package** (e.g., `dgxa100/`)
- 2. **Define GPU configurations** in `shared/gpus/a100.go`
+ 1. **Create new package** (e.g., `dgxb200/`)
+ 2. **Define GPU configurations** in `shared/gpus/b200.go`
3. **Define MIG profiles** with appropriate memory and SM allocations
4. **Implement server and device factory functions**
5. **Add comprehensive tests**

- Example structure for A100 generation:
+ Example structure for B200 generation:
```go
- // In shared/gpus/a100.go
- var A100_SXM4_80GB = shared.Config{
-     Name:         "NVIDIA A100 SXM4 80GB",
-     Architecture: nvml.DEVICE_ARCH_AMPERE,
+ // In shared/gpus/b200.go
+ var B200_SXM5_180GB = shared.Config{
+     Name:         "NVIDIA B200 180GB HBM3e",
+     Architecture: nvml.DEVICE_ARCH_BLACKWELL,
    Brand:        nvml.BRAND_NVIDIA,
-     MemoryMB:     81920,
-     CudaMajor:    8,
+     MemoryMB:     184320, // 180GB
+     CudaMajor:    10,
    CudaMinor:    0,
-     PciDeviceId:  0x20B210DE,
-     MIGProfiles:  a100MIGProfiles,
+     PciDeviceId:  0x2B0010DE,
+     MIGProfiles:  b200_180gb_MIGProfiles,
}

- // In dgxa100/dgxa100.go
+ // In dgxb200/dgxb200.go
func New() *Server {
    return shared.NewServerFromConfig(shared.ServerConfig{
-         Config:            gpus.A100_SXM4_80GB,
-         GPUCount:          4,
-         DriverVersion:     "550.54.15",
-         NvmlVersion:       "12.550.54.15",
-         CudaDriverVersion: 12040,
+         Config:            gpus.B200_SXM5_180GB,
+         GPUCount:          8,
+         DriverVersion:     "560.28.03",
+         NvmlVersion:       "12.560.28.03",
+         CudaDriverVersion: 12060,
    })
}
```

## Backward Compatibility

The framework maintains full backward compatibility:
- - All existing `dgxa100.New()` calls continue to work unchanged
- - Legacy global variables (`MIGProfiles`, `MIGPlacements`) are preserved
+ - All existing `dgxa100.New()`, `dgxh100.New()`, `dgxh200.New()`, and `dgxb200.New()` calls continue to work unchanged
+ - Legacy global variables (`MIGProfiles`, `MIGPlacements`) are preserved for all generations
- Device names maintain the "Mock" prefix for test compatibility
- All existing tests pass without modification
- - A100 configurations now reference the `shared/gpus` package
+ - All GPU configurations reference the `shared/gpus` package for consistency
+ - Type aliases ensure a seamless transition from generation-specific types

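A minimal sketch of these guarantees, assuming the preserved `dgxa100` entry points keep their current signatures:

```go
package main

import (
	"fmt"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
)

func main() {
	// The existing constructor is unchanged and the server still satisfies nvml.Interface.
	server := dgxa100.New()
	var _ nvml.Interface = server

	count, ret := server.DeviceGetCount()
	fmt.Printf("devices=%d ret=%v\n", count, ret)

	// Legacy globals remain available alongside the shared/gpus configurations.
	_ = dgxa100.MIGProfiles
	_ = dgxa100.MIGPlacements
}
```
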
## Performance Considerations