diff --git a/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs b/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs index ed727f84f8..5875fdb4ab 100644 --- a/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs +++ b/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs @@ -7,8 +7,10 @@ namespace VirtualClient.Actions using System.Collections.Generic; using System.Diagnostics; using System.Linq; + using System.Runtime.InteropServices; using System.Threading; using System.Threading.Tasks; + using Microsoft.CodeAnalysis.CSharp.Syntax; using Microsoft.Extensions.DependencyInjection; using Moq; using Newtonsoft.Json.Linq; @@ -26,36 +28,18 @@ public class SuperBenchmarkExecutorTests private DependencyPath mockPackage; [SetUp] - public void SetupDefaultBehavior() + public void SetupTests() { this.mockFixture = new MockFixture(); - this.mockFixture.Setup(PlatformID.Unix); - this.mockPackage = new DependencyPath("SuperBenchmark", this.mockFixture.PlatformSpecifics.GetPackagePath("superbenchmark")); - - this.mockFixture.PackageManager.OnGetPackage().ReturnsAsync(this.mockPackage); - - this.mockFixture.File.Reset(); - this.mockFixture.File.Setup(f => f.Exists(It.IsAny())) - .Returns(true); - this.mockFixture.Directory.Setup(f => f.Exists(It.IsAny())) - .Returns(true); - this.mockFixture.Directory.Setup(f => f.Exists(It.IsRegex("superbenchmark"))) - .Returns(false); - - this.mockFixture.FileSystem.SetupGet(fs => fs.File).Returns(this.mockFixture.File.Object); - - this.mockFixture.Parameters = new Dictionary() - { - { nameof(SuperBenchmarkExecutor.Version), "0.0.1" }, - { nameof(SuperBenchmarkExecutor.ContainerVersion), "testContainer" }, - { nameof(SuperBenchmarkExecutor.ConfigurationFile), "Test.yaml" }, - { nameof(SuperBenchmarkExecutor.Username), "testuser" } - }; } [Test] - public void 
SuperBenchmarkStateIsSerializeable() + [TestCase(Architecture.X64)] + [TestCase(Architecture.Arm64)] + public void SuperBenchmarkStateIsSerializeable(Architecture architecture) { + SetupDefaultMockBehavior(architecture); + State state = new State(new Dictionary { ["SuperBenchmarkInitialized"] = true @@ -69,8 +53,10 @@ public void SuperBenchmarkStateIsSerializeable() } [Test] - public async Task SuperBenchmarkExecutorClonesTheExpectedRepoContents() + public async Task SuperBenchmarkExecutorClonesTheExpectedRepoContentsOnX64Architecture() { + SetupDefaultMockBehavior(Architecture.X64); + ProcessStartInfo expectedInfo = new ProcessStartInfo(); this.mockFixture.Parameters = new Dictionary() { @@ -111,8 +97,54 @@ public async Task SuperBenchmarkExecutorClonesTheExpectedRepoContents() } [Test] - public async Task SuperBenchmarkExecutorUsesTheExpectedScriptFilesOnExecution() + public async Task SuperBenchmarkExecutorPullsTheExpectedDockerImageContentsOnArm64Architecture() + { + SetupDefaultMockBehavior(Architecture.Arm64); + + ProcessStartInfo expectedInfo = new ProcessStartInfo(); + this.mockFixture.Parameters = new Dictionary() + { + { nameof(SuperBenchmarkExecutor.Version), "1.2.3" }, + { nameof(SuperBenchmarkExecutor.ContainerVersion), "testContainer" }, + { nameof(SuperBenchmarkExecutor.ConfigurationFile), "Test.yaml" }, + { nameof(SuperBenchmarkExecutor.Username), "testuser" } + }; + string expectedCommand = $"sudo docker pull {this.mockFixture.Parameters[nameof(SuperBenchmarkExecutor.ContainerVersion)]}"; + + bool commandExecuted = false; + this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) => + { + if (expectedCommand == $"{exe} {arguments}") + { + commandExecuted = true; + } + + return new InMemoryProcess + { + StartInfo = new ProcessStartInfo + { + FileName = exe, + Arguments = arguments + }, + ExitCode = 0, + OnStart = () => true, + OnHasExited = () => true + }; + }; + + using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new 
TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters)) + { + await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false); + } + + Assert.IsTrue(commandExecuted); + } + + [Test] + public async Task SuperBenchmarkExecutorUsesTheExpectedScriptFilesOnExecutionOnX64Architecture() { + SetupDefaultMockBehavior(Architecture.X64); + ProcessStartInfo expectedInfo = new ProcessStartInfo(); string expectedCommand = $"sudo bash initialize.sh testuser"; @@ -146,8 +178,10 @@ public async Task SuperBenchmarkExecutorUsesTheExpectedScriptFilesOnExecution() } [Test] - public async Task SuperBenchmarkExecutorDeploySuperBenchContainer() + public async Task SuperBenchmarkExecutorDeploySuperBenchContainerOnX64Architecture() { + SetupDefaultMockBehavior(Architecture.X64); + ProcessStartInfo expectedInfo = new ProcessStartInfo(); string expectedCommand = $"sb deploy --host-list localhost -i testContainer"; @@ -181,8 +215,48 @@ public async Task SuperBenchmarkExecutorDeploySuperBenchContainer() } [Test] - public async Task SuperBenchmarkExecutorRunsTheExpectedWorkloadCommand() + public async Task SuperBenchmarkExecutorRunsDockerContainerInDetachedModeForSetupOnArm64Architecture() { + SetupDefaultMockBehavior(Architecture.Arm64); + + ProcessStartInfo expectedInfo = new ProcessStartInfo(); + string expectedPath = this.mockFixture.PlatformSpecifics.Combine(this.mockFixture.PlatformSpecifics.PackagesDirectory, "superbenchmark"); + string expectedCommand = $"sudo docker run -itd --name=sb-dev --privileged --net=host --ipc=host --gpus=all -w /root -v {expectedPath}:/mnt testContainer"; + + bool commandExecuted = false; + this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) => + { + if (expectedCommand == $"{exe} {arguments}") + { + commandExecuted = true; + } + + return new InMemoryProcess + { + StartInfo = new ProcessStartInfo + { + FileName = exe, + Arguments = arguments + }, + ExitCode = 0, + OnStart = () => 
true, + OnHasExited = () => true + }; + }; + + using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters)) + { + await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false); + } + + Assert.IsTrue(commandExecuted); + } + + [Test] + public async Task SuperBenchmarkExecutorRunsTheExpectedWorkloadCommandOnX64Architecture() + { + SetupDefaultMockBehavior(Architecture.X64); + ProcessStartInfo expectedInfo = new ProcessStartInfo(); string expectedCommand = $"sb run --host-list localhost -c Test.yaml"; @@ -216,8 +290,47 @@ public async Task SuperBenchmarkExecutorRunsTheExpectedWorkloadCommand() } [Test] - public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallation() + public async Task SuperBenchmarkExecutorRunsTheExpectedWorkloadCommandOnArm64Architecture() { + SetupDefaultMockBehavior(Architecture.Arm64); + + ProcessStartInfo expectedInfo = new ProcessStartInfo(); + string expectedCommand = $"sudo docker exec sb-dev sb run --no-docker -l localhost -c /mnt/Test.yaml --output-dir /mnt/outputs/"; // Mirrors the Arm64 command line built by SuperBenchmarkExecutor (results are written under the /mnt bind mount). + + bool commandExecuted = false; + this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) => + { + if (expectedCommand == $"{exe} {arguments}") + { + commandExecuted = true; + } + + return new InMemoryProcess + { + StartInfo = new ProcessStartInfo + { + FileName = exe, + Arguments = arguments + }, + ExitCode = 0, + OnStart = () => true, + OnHasExited = () => true + }; + }; + + using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters)) + { + await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false); + } + + Assert.IsTrue(commandExecuted); + } + + [Test] + public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallationOnX64Architecture() + { + SetupDefaultMockBehavior(Architecture.X64);
+ ProcessStartInfo expectedInfo = new ProcessStartInfo(); List expectedCommands = new List { @@ -261,8 +374,57 @@ public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallati } [Test] - public async Task SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecutionAfterTheFirstRun() + public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallationOnArm64Architecture() { + SetupDefaultMockBehavior(Architecture.Arm64); + + ProcessStartInfo expectedInfo = new ProcessStartInfo(); + string expectedPath = this.mockFixture.PlatformSpecifics.Combine(this.mockFixture.PlatformSpecifics.PackagesDirectory, "superbenchmark"); + List expectedCommands = new List + { + $"sudo chmod -R 2777 \"{this.mockFixture.PlatformSpecifics.CurrentDirectory}\"", + $"sudo docker pull testContainer", + $"sudo docker run -itd --name=sb-dev --privileged --net=host --ipc=host --gpus=all -w /root -v {expectedPath}:/mnt testContainer", + $"sudo docker exec sb-dev sb run --no-docker -l localhost -c /mnt/Test.yaml --output-dir /mnt/outputs/" // Mirrors the Arm64 workload command built by SuperBenchmarkExecutor (results are written under the /mnt bind mount). + }; + + int processCount = 0; + this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) => + { + Assert.AreEqual(expectedCommands.ElementAt(processCount), $"{exe} {arguments}"); + processCount++; + + return new InMemoryProcess + { + StartInfo = new ProcessStartInfo + { + FileName = exe, + Arguments = arguments + }, + ExitCode = 0, + OnStart = () => true, + OnHasExited = () => true + }; + }; + + this.mockFixture.StateManager.OnGetState().ReturnsAsync(JObject.FromObject(new SuperBenchmarkExecutor.SuperBenchmarkState() + { + SuperBenchmarkInitialized = false + })); + + using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters)) + { + await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false); + } + + Assert.IsTrue(processCount == 4); + } + + [Test] + public async Task
SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecutionAfterTheFirstRunOnX64Architecture() + { + this.SetupDefaultMockBehavior(Architecture.X64); + ProcessStartInfo expectedInfo = new ProcessStartInfo(); List expectedCommands = new List { @@ -301,6 +463,76 @@ public async Task SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecu Assert.IsTrue(processCount == 1); } + [Test] + public async Task SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecutionAfterTheFirstRunOnArm64Architecture() + { + SetupDefaultMockBehavior(Architecture.Arm64); + + ProcessStartInfo expectedInfo = new ProcessStartInfo(); + List expectedCommands = new List + { + $"sudo docker exec sb-dev sb run --no-docker -l localhost -c /mnt/Test.yaml --output-dir /mnt/outputs/" // On Arm64 the workload runs inside the sb-dev container via 'sudo docker exec', not via the host 'sb' CLI. + }; + + int processCount = 0; + this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) => + { + Assert.AreEqual(expectedCommands.ElementAt(processCount), $"{exe} {arguments}"); + processCount++; + + return new InMemoryProcess + { + StartInfo = new ProcessStartInfo + { + FileName = exe, + Arguments = arguments + }, + ExitCode = 0, + OnStart = () => true, + OnHasExited = () => true + }; + }; + + this.mockFixture.StateManager.OnGetState().ReturnsAsync(JObject.FromObject(new SuperBenchmarkExecutor.SuperBenchmarkState() + { + SuperBenchmarkInitialized = true + })); + + using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters)) + { + await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false); + } + + Assert.IsTrue(processCount == 1); + } + + public void SetupDefaultMockBehavior(Architecture architecture) + { + this.mockFixture = new MockFixture(); + this.mockFixture.Setup(PlatformID.Unix, architecture); + this.mockPackage = new DependencyPath("SuperBenchmark", this.mockFixture.PlatformSpecifics.GetPackagePath("superbenchmark")); + + this.mockFixture.PackageManager.OnGetPackage().ReturnsAsync(this.mockPackage); + +
this.mockFixture.File.Reset(); + this.mockFixture.File.Setup(f => f.Exists(It.IsAny())) + .Returns(true); + this.mockFixture.Directory.Setup(f => f.Exists(It.IsAny())) + .Returns(true); + this.mockFixture.Directory.Setup(f => f.Exists(It.IsRegex("superbenchmark"))) + .Returns(false); + + this.mockFixture.FileSystem.SetupGet(fs => fs.File).Returns(this.mockFixture.File.Object); + + this.mockFixture.Parameters = new Dictionary() + { + { nameof(SuperBenchmarkExecutor.Version), "0.0.1" }, + { nameof(SuperBenchmarkExecutor.ContainerVersion), "testContainer" }, + { nameof(SuperBenchmarkExecutor.ConfigurationFile), "Test.yaml" }, + { nameof(SuperBenchmarkExecutor.Username), "testuser" } + }; + } + private class TestSuperBenchmarkExecutor : SuperBenchmarkExecutor { public TestSuperBenchmarkExecutor(IServiceCollection dependencies, IDictionary parameters) diff --git a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs index afd8c2ea60..e8eb4c915f 100644 --- a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs +++ b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs @@ -128,9 +128,10 @@ protected override async Task ExecuteAsync(EventContext telemetryContext, Cancel { using (BackgroundOperations profiling = BackgroundOperations.BeginProfiling(this, cancellationToken)) { + string command = this.CpuArchitecture == Architecture.Arm64 ? 
"sudo" : "sb"; string commandArguments = this.GetCommandLineArguments(); - using (IProcessProxy process = await this.ExecuteCommandAsync("sb", commandArguments, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, runElevated: false)) + using (IProcessProxy process = await this.ExecuteCommandAsync(command, commandArguments, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, runElevated: false)) { if (!cancellationToken.IsCancellationRequested) { @@ -156,13 +157,18 @@ protected override async Task InitializeAsync(EventContext telemetryContext, Can if (!state.SuperBenchmarkInitialized) { - // This is to grant directory folders for await this.systemManager.MakeFilesExecutableAsync(this.PlatformSpecifics.CurrentDirectory, this.Platform, cancellationToken); - string cloneDir = this.PlatformSpecifics.Combine(this.PlatformSpecifics.PackagesDirectory, "superbenchmark"); - if (!this.fileSystem.Directory.Exists(cloneDir)) + if (!this.fileSystem.Directory.Exists(this.SuperBenchmarkDirectory)) { - await this.ExecuteSbCommandAsync("git", $"clone -b v{this.Version} https://github.com/microsoft/superbenchmark", this.PlatformSpecifics.PackagesDirectory, telemetryContext, cancellationToken, true); + if (this.CpuArchitecture == Architecture.Arm64) + { + this.fileSystem.Directory.CreateDirectory(this.SuperBenchmarkDirectory); + } + else + { + await this.ExecuteSbCommandAsync("git", $"clone -b v{this.Version} https://github.com/microsoft/superbenchmark", this.PlatformSpecifics.PackagesDirectory, telemetryContext, cancellationToken, true); + } } foreach (string file in this.fileSystem.Directory.GetFiles(this.PlatformSpecifics.GetScriptPath("superbenchmark"))) @@ -173,12 +179,20 @@ protected override async Task InitializeAsync(EventContext telemetryContext, Can true); } - await this.ExecuteSbCommandAsync("bash", $"initialize.sh {this.Username}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true); - await 
this.ExecuteSbCommandAsync("sb", $"deploy --host-list localhost -i {this.ContainerVersion}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, false); + if (this.CpuArchitecture == Architecture.Arm64) + { + await this.ExecuteSbCommandAsync("sudo", $"docker pull {this.ContainerVersion}", this.PlatformSpecifics.CurrentDirectory, telemetryContext, cancellationToken, true); + await this.ExecuteSbCommandAsync("sudo", $"docker run -itd --name=sb-dev --privileged --net=host --ipc=host --gpus=all -w /root -v {this.SuperBenchmarkDirectory}:/mnt {this.ContainerVersion}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true); + } + else + { + await this.ExecuteSbCommandAsync("bash", $"initialize.sh {this.Username}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true); + await this.ExecuteSbCommandAsync("sb", $"deploy --host-list localhost -i {this.ContainerVersion}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, false); + } state.SuperBenchmarkInitialized = true; } - + await this.stateManager.SaveStateAsync($"{nameof(SuperBenchmarkState)}", state, cancellationToken); } @@ -232,7 +246,14 @@ private async Task CaptureMetricsAsync(IProcessProxy process, string commandArgu private string GetCommandLineArguments() { - return @$"run --host-list localhost -c {this.ConfigurationFile}"; + switch (this.CpuArchitecture) + { + case Architecture.Arm64: + return @$"docker exec sb-dev sb run --no-docker -l localhost -c /mnt/{this.ConfigurationFile} --output-dir /mnt/outputs/"; // /mnt is the SuperBenchmark directory bind-mounted by the 'docker run -v' issued during initialization. TODO(review): confirm metrics capture reads results from the host side of this output directory. + + default: + return @$"run --host-list localhost -c {this.ConfigurationFile}"; + } } internal class SuperBenchmarkState : State diff --git a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/gb200_dev_config.yaml b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/gb200_dev_config.yaml new file mode 100644 index 0000000000..20d1c2a21e --- /dev/null
+++ b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/gb200_dev_config.yaml @@ -0,0 +1,384 @@ +version: v0.11 +superbench: + enable: + # microbenchmark - computation + - kernel-launch # 00:53 + - gemm-flops # 12:10 + - cudnn-function # 03:58 + - cublaslt-gemm # 10:27 + # - cublaslt-gemm:bmm # 01:09:21 + - cublas-function # 02:42 + - matmul # 00:14 + - gpu-burn # 15:07 + # microbenchmark - communication + - cpu-memory-bw-latency # 03:30 + - mem-bw # 00:50 + - gpu-copy-bw:perf # 01:21 + - gpu-copy-bw:correctness # 00:14 + # - gpu-stream:perf + # - gpu-stream:correctness + # NCCL + - nccl-bw:nvlink # 01:03 + - nccl-bw:nvlink-allgather # 01:03 + - nccl-bw:nvlink-broadcast # 01:03 + - nccl-bw:nvlink-reduce # 01:03 + - nccl-bw:nvlink-reducescatter # 01:03 + - nccl-bw:nvlink-alltoall # 01:03 + # microbenchmark - compute-comm. overlap + - computation-communication-overlap # 04:50 + - sharding-matmul # 00:25 + # microbenchmark - storage + # - disk-benchmark # 18:47 + # model benchmark - inference + # - ort-inference # 02:24 + # ##tensorrt-inference # 02:03:33 + - dist-inference # 00:35 + # model benchmark - training + - model-benchmarks:gpt # 10:37 + - model-benchmarks:bert # 15:02 + - model-benchmarks:lstm # 02:23 + - model-benchmarks:resnet # 25:40 + - model-benchmarks:densenet # 07:51 + - model-benchmarks:vgg # 11:51 + - model-benchmarks:llama2-7b + - model-benchmarks:llama2-7b-fp32 + - model-benchmarks:stress # 01:00:43 + monitor: + enable: false + sample_duration: 1 + sample_interval: 10 + var: + default_timeout: &default_timeout 600 + default_local_mode: &default_local_mode + modes: + - name: local + proc_num: 4 + prefix: CUDA_VISIBLE_DEVICES={proc_rank} + parallel: yes + default_pytorch_mode: &default_pytorch_mode + modes: + - name: torch.distributed + proc_num: 4 + node_num: 1 + env: + NCCL_DEBUG: WARN + TORCH_NCCL_ASYNC_ERROR_HANDLING: '0' + frameworks: [pytorch] + model_ddp_parameter: &model_ddp_param + duration: 0 + num_warmup: 128 + num_steps: 512 +
sample_count: 8192 + batch_size: 128 + precision: [float32, float16] + model_action: [train] + pin_memory: yes + num_workers: 0 + nccl_parameter: &nccl_param + minbytes: 1K + maxbytes: 16G + stepfactor: 2 + check: 1 + warmup_iters: 20 + iters: 100 + benchmarks: + # microbenchmark - computation + kernel-launch: + <<: *default_local_mode + timeout: *default_timeout + gemm-flops: + <<: *default_local_mode + timeout: 1500 + cudnn-function: + <<: *default_local_mode + timeout: *default_timeout + cublaslt-gemm: + <<: *default_local_mode + timeout: 1200 + parameters: + in_types: ['fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2'] + shapes: + - 4096,4096,4096 + - 8192,8192,8192 + - 16384,16384,16384 + - 16:2048,4608,12288 + - 16:2048,12288,1536 + cublaslt-gemm:bmm: + <<: *default_local_mode + timeout: 7200 + parameters: + in_types: ['fp64', 'fp32', 'fp16', 'bf16'] + batch: 96:12288 + shapes: + - 1,1:2048,128 + - 1,128,1:64 + - 1,128,256:2048 + cublas-function: + <<: *default_local_mode + timeout: *default_timeout + matmul: + <<: *default_local_mode + timeout: *default_timeout + frameworks: [pytorch] + gpu-burn: + timeout: 1800 + modes: + - name: local + parallel: no + parameters: + time: 90 + doubles: true + tensor_core: true + # microbenchmark - communication + cpu-memory-bw-latency: + timeout: *default_timeout + modes: + - name: local + parallel: no + parameters: + tests: + - bandwidth_matrix + - latency_matrix + - max_bandwidth + mem-bw: + timeout: *default_timeout + modes: + - name: local + proc_num: 4 + prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) + parallel: no + gpu-copy-bw:perf: + timeout: *default_timeout + modes: + - name: local + parallel: no + parameters: + mem_type: [htod, dtoh, dtod] + copy_type: [sm, dma] + gpu-copy-bw:correctness: + timeout: *default_timeout + modes: + - name: local + parallel: no + parameters: + mem_type: [htod, dtoh, dtod] + copy_type: [sm, dma] + size: 4096 + num_warm_up: 0 + num_loops: 1 + check_data: true + 
gpu-stream:perf: + timeout: 600 + modes: + - name: local + parallel: false + parameters: + num_warm_up: 10 + num_loops: 40 + size: 4294967296 + gpu-stream:correctness: + timeout: 600 + modes: + - name: local + parallel: false + parameters: + num_warm_up: 0 + num_loops: 1 + size: 1048576 + check_data: true + nccl-bw:nvlink: + timeout: *default_timeout + modes: + - name: mpi + proc_num: 4 + node_num: 1 + parameters: + <<: *nccl_param + nccl-bw:nvlink-allgather: + timeout: *default_timeout + modes: + - name: mpi + proc_num: 4 + node_num: 1 + parameters: + <<: *nccl_param + operation: allgather + nccl-bw:nvlink-broadcast: + timeout: *default_timeout + modes: + - name: mpi + proc_num: 4 + node_num: 1 + parameters: + <<: *nccl_param + operation: broadcast + nccl-bw:nvlink-reduce: + timeout: *default_timeout + modes: + - name: mpi + proc_num: 4 + node_num: 1 + parameters: + <<: *nccl_param + operation: reduce + nccl-bw:nvlink-reducescatter: + timeout: *default_timeout + modes: + - name: mpi + proc_num: 4 + node_num: 1 + parameters: + <<: *nccl_param + operation: reducescatter + nccl-bw:nvlink-alltoall: + timeout: *default_timeout + modes: + - name: mpi + proc_num: 4 + node_num: 1 + parameters: + <<: *nccl_param + operation: alltoall + # microbenchmark - comput-comm. 
overlap + computation-communication-overlap: + <<: *default_pytorch_mode + timeout: *default_timeout + sharding-matmul: + <<: *default_pytorch_mode + timeout: *default_timeout + # microbenchmark - storage + disk-benchmark: + timeout: 2400 + modes: + - name: local + parallel: no + parameters: + block_devices: + - /dev/nvme0n1 + - /dev/nvme1n1 + - /dev/nvme2n1 + - /dev/nvme3n1 + - /dev/nvme4n1 + - /dev/nvme5n1 + - /dev/nvme6n1 + - /dev/nvme7n1 + seq_read_runtime: 60 + rand_read_runtime: 60 + # model benchmark - inference + ort-inference: + <<: *default_local_mode + timeout: *default_timeout + tensorrt-inference: + <<: *default_local_mode + timeout: 14400 + parameters: + pytorch_models: + - resnet50 + - resnet101 + - resnet152 + - densenet169 + - densenet201 + - bert-base + - bert-large + seq_length: 224 + batch_size: 32 + precision: fp16 + dist-inference: + <<: *default_pytorch_mode + timeout: *default_timeout + parameters: + batch_size: 80 + input_size: 128 + hidden_size: 128 + num_layers: 50 + num_steps: 10000 + precision: float16 + # model benchmark - training + model-benchmarks:gpt: + <<: *default_pytorch_mode + timeout: 1200 + models: + - gpt2-small + - gpt2-large + parameters: + <<: *model_ddp_param + precision: [float32, float16, fp8_hybrid] + batch_size: 32 + seq_len: 224 + model-benchmarks:bert: + <<: *default_pytorch_mode + timeout: 1800 + models: + - bert-base + - bert-large + parameters: + <<: *model_ddp_param + precision: [float32, float16, fp8_hybrid] + seq_len: 224 + model-benchmarks:lstm: + <<: *default_pytorch_mode + timeout: *default_timeout + models: + - lstm + parameters: + <<: *model_ddp_param + batch_size: 1024 + input_size: 224 + hidden_size: 1000 + seq_len: 32 + model-benchmarks:resnet: + <<: *default_pytorch_mode + timeout: 3000 + models: + - resnet50 + - resnet101 + - resnet152 + parameters: + <<: *model_ddp_param + batch_size: 384 + model-benchmarks:densenet: + <<: *default_pytorch_mode + timeout: 1000 + models: + - densenet169 + - 
densenet201 + parameters: + <<: *model_ddp_param + model-benchmarks:vgg: + <<: *default_pytorch_mode + timeout: 1500 + models: + - vgg11 + - vgg13 + - vgg16 + - vgg19 + parameters: + <<: *model_ddp_param + model-benchmarks:stress: + <<: *default_pytorch_mode + timeout: 7200 + models: + - gpt2-large + parameters: + <<: *model_ddp_param + batch_size: 32 + model-benchmarks:llama2-7b: + <<: *default_pytorch_mode + timeout: 1500 + models: + - llama2-7b + parameters: + <<: *model_ddp_param + batch_size: 72 + seq_len: 256 + precision: [float16, fp8_hybrid] + model-benchmarks:llama2-7b-fp32: + <<: *default_pytorch_mode + timeout: 1500 + models: + - llama2-7b + parameters: + <<: *model_ddp_param + batch_size: 24 + seq_len: 256 + precision: [float32] diff --git a/website/docs/workloads/superbenchmark/superbenchmark-profiles.md b/website/docs/workloads/superbenchmark/superbenchmark-profiles.md index e0b08fde41..e762430be2 100644 --- a/website/docs/workloads/superbenchmark/superbenchmark-profiles.md +++ b/website/docs/workloads/superbenchmark/superbenchmark-profiles.md @@ -8,13 +8,14 @@ The following profiles run customer-representative or benchmarking scenarios usi ::: ## PERF-GPU-SUPERBENCH.json -Runs the SuperBenchmark benchmark workload to test GPU performance. This workload is supported ONLY for systems that contain nVidia GPU +Runs the SuperBenchmark benchmark workload to test GPU performance. This workload is supported ONLY for systems that contain Nvidia GPU hardware components. Work is underway with partner teams in Azure to support additional GPU manufacturers. * [Workload Profile](https://github.com/microsoft/VirtualClient/blob/main/src/VirtualClient/VirtualClient.Main/profiles/PERF-GPU-SUPERBENCH.json) * **Supported Platform/Architectures** * linux-x64 + * linux-arm64 * **Supports Disconnected Scenarios** * No. Internet connection required.