diff --git a/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs b/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs
index ed727f84f8..5875fdb4ab 100644
--- a/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs
+++ b/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs
@@ -7,8 +7,10 @@ namespace VirtualClient.Actions
     using System.Collections.Generic;
     using System.Diagnostics;
     using System.Linq;
+    using System.Runtime.InteropServices;
     using System.Threading;
     using System.Threading.Tasks;
+    using Microsoft.CodeAnalysis.CSharp.Syntax;
     using Microsoft.Extensions.DependencyInjection;
     using Moq;
     using Newtonsoft.Json.Linq;
@@ -26,36 +28,18 @@ public class SuperBenchmarkExecutorTests
         private DependencyPath mockPackage;
 
         [SetUp]
-        public void SetupDefaultBehavior()
+        public void SetupTests()
         {
             this.mockFixture = new MockFixture();
-            this.mockFixture.Setup(PlatformID.Unix);
-            this.mockPackage = new DependencyPath("SuperBenchmark", this.mockFixture.PlatformSpecifics.GetPackagePath("superbenchmark"));
-
-            this.mockFixture.PackageManager.OnGetPackage().ReturnsAsync(this.mockPackage);
-
-            this.mockFixture.File.Reset();
-            this.mockFixture.File.Setup(f => f.Exists(It.IsAny<string>()))
-                .Returns(true);
-            this.mockFixture.Directory.Setup(f => f.Exists(It.IsAny<string>()))
-                .Returns(true);
-            this.mockFixture.Directory.Setup(f => f.Exists(It.IsRegex("superbenchmark")))
-                .Returns(false);
-
-            this.mockFixture.FileSystem.SetupGet(fs => fs.File).Returns(this.mockFixture.File.Object);
-
-            this.mockFixture.Parameters = new Dictionary<string, IConvertible>()
-            {
-                { nameof(SuperBenchmarkExecutor.Version), "0.0.1" },
-                { nameof(SuperBenchmarkExecutor.ContainerVersion), "testContainer" },
-                { nameof(SuperBenchmarkExecutor.ConfigurationFile), "Test.yaml" },
-                { nameof(SuperBenchmarkExecutor.Username), "testuser" }
-            };
         }
 
         [Test]
-        public void SuperBenchmarkStateIsSerializeable()
+        [TestCase(Architecture.X64)]
+        [TestCase(Architecture.Arm64)]
+        public void SuperBenchmarkStateIsSerializeable(Architecture architecture)
         {
+            SetupDefaultMockBehavior(architecture);
+
             State state = new State(new Dictionary<string, IConvertible>
             {
                 ["SuperBenchmarkInitialized"] = true
@@ -69,8 +53,10 @@ public void SuperBenchmarkStateIsSerializeable()
         }
 
         [Test]
-        public async Task SuperBenchmarkExecutorClonesTheExpectedRepoContents()
+        public async Task SuperBenchmarkExecutorClonesTheExpectedRepoContentsOnX64Architecture()
         {
+            SetupDefaultMockBehavior(Architecture.X64);
+
             ProcessStartInfo expectedInfo = new ProcessStartInfo();
             this.mockFixture.Parameters = new Dictionary<string, IConvertible>()
             {
@@ -111,8 +97,54 @@ public async Task SuperBenchmarkExecutorClonesTheExpectedRepoContents()
         }
 
         [Test]
-        public async Task SuperBenchmarkExecutorUsesTheExpectedScriptFilesOnExecution()
+        public async Task SuperBenchmarkExecutorPullsTheExpectedDockerImageContentsOnArm64Architecture()
+        {
+            this.SetupDefaultMockBehavior(Architecture.Arm64);
+
+            // On Arm64 the executor is expected to issue a docker pull for the configured container image.
+            this.mockFixture.Parameters = new Dictionary<string, IConvertible>()
+            {
+                { nameof(SuperBenchmarkExecutor.Version), "1.2.3" },
+                { nameof(SuperBenchmarkExecutor.ContainerVersion), "testContainer" },
+                { nameof(SuperBenchmarkExecutor.ConfigurationFile), "Test.yaml" },
+                { nameof(SuperBenchmarkExecutor.Username), "testuser" }
+            };
+            string expectedCommand = $"sudo docker pull {this.mockFixture.Parameters[nameof(SuperBenchmarkExecutor.ContainerVersion)]}";
+
+            bool commandExecuted = false;
+            this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) =>
+            {
+                if (expectedCommand == $"{exe} {arguments}")
+                {
+                    commandExecuted = true;
+                }
+
+                return new InMemoryProcess
+                {
+                    StartInfo = new ProcessStartInfo
+                    {
+                        FileName = exe,
+                        Arguments = arguments
+                    },
+                    ExitCode = 0,
+                    OnStart = () => true,
+                    OnHasExited = () => true
+                };
+            };
+
+            using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters))
+            {
+                await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);
+            }
+
+            Assert.IsTrue(commandExecuted);
+        }
+
+        [Test]
+        public async Task SuperBenchmarkExecutorUsesTheExpectedScriptFilesOnExecutionOnX64Architecture()
         {
+            SetupDefaultMockBehavior(Architecture.X64);
+
             ProcessStartInfo expectedInfo = new ProcessStartInfo();
             string expectedCommand = $"sudo bash initialize.sh testuser";
 
@@ -146,8 +178,10 @@ public async Task SuperBenchmarkExecutorUsesTheExpectedScriptFilesOnExecution()
         }
 
         [Test]
-        public async Task SuperBenchmarkExecutorDeploySuperBenchContainer()
+        public async Task SuperBenchmarkExecutorDeploySuperBenchContainerOnX64Architecture()
         {
+            SetupDefaultMockBehavior(Architecture.X64);
+
             ProcessStartInfo expectedInfo = new ProcessStartInfo();
             string expectedCommand = $"sb deploy --host-list localhost -i testContainer";
 
@@ -181,8 +215,48 @@ public async Task SuperBenchmarkExecutorDeploySuperBenchContainer()
         }
 
         [Test]
-        public async Task SuperBenchmarkExecutorRunsTheExpectedWorkloadCommand()
+        public async Task SuperBenchmarkExecutorRunsDockerContainerInDetachedModeForSetupOnArm64Architecture()
         {
+            this.SetupDefaultMockBehavior(Architecture.Arm64);
+
+            // On Arm64 the executor is expected to start the 'sb-dev' container with the package directory mounted at /mnt.
+            string expectedPath = this.mockFixture.PlatformSpecifics.Combine(this.mockFixture.PlatformSpecifics.PackagesDirectory, "superbenchmark");
+            string expectedCommand = $"sudo docker run -itd --name=sb-dev --privileged --net=host --ipc=host --gpus=all -w /root -v {expectedPath}:/mnt testContainer";
+
+            bool commandExecuted = false;
+            this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) =>
+            {
+                if (expectedCommand == $"{exe} {arguments}")
+                {
+                    commandExecuted = true;
+                }
+
+                return new InMemoryProcess
+                {
+                    StartInfo = new ProcessStartInfo
+                    {
+                        FileName = exe,
+                        Arguments = arguments
+                    },
+                    ExitCode = 0,
+                    OnStart = () => true,
+                    OnHasExited = () => true
+                };
+            };
+
+            using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters))
+            {
+                await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);
+            }
+
+            Assert.IsTrue(commandExecuted);
+        }
+
+        [Test]
+        public async Task SuperBenchmarkExecutorRunsTheExpectedWorkloadCommandOnX64Architecture()
+        {
+            SetupDefaultMockBehavior(Architecture.X64);
+
             ProcessStartInfo expectedInfo = new ProcessStartInfo();
             string expectedCommand = $"sb run --host-list localhost -c Test.yaml";
 
@@ -216,8 +290,47 @@ public async Task SuperBenchmarkExecutorRunsTheExpectedWorkloadCommand()
         }
 
         [Test]
-        public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallation()
+        public async Task SuperBenchmarkExecutorRunsTheExpectedWorkloadCommandOnArm64Architecture()
         {
+            this.SetupDefaultMockBehavior(Architecture.Arm64);
+
+            // Must match what the executor emits on Arm64: results are written under the /mnt mount of the container.
+            string expectedCommand = $"sudo docker exec sb-dev sb run --no-docker -l localhost -c /mnt/Test.yaml --output-dir /mnt/outputs/";
+
+            bool commandExecuted = false;
+            this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) =>
+            {
+                if (expectedCommand == $"{exe} {arguments}")
+                {
+                    commandExecuted = true;
+                }
+
+                return new InMemoryProcess
+                {
+                    StartInfo = new ProcessStartInfo
+                    {
+                        FileName = exe,
+                        Arguments = arguments
+                    },
+                    ExitCode = 0,
+                    OnStart = () => true,
+                    OnHasExited = () => true
+                };
+            };
+
+            using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters))
+            {
+                await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);
+            }
+
+            Assert.IsTrue(commandExecuted);
+        }
+
+        [Test]
+        public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallationOnX64Architecture()
+        {
+            SetupDefaultMockBehavior(Architecture.X64);
+
             ProcessStartInfo expectedInfo = new ProcessStartInfo();
             List<string> expectedCommands = new List<string>
             {
@@ -261,8 +374,57 @@ public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallati
         }
 
         [Test]
-        public async Task SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecutionAfterTheFirstRun()
+        public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallationOnArm64Architecture()
         {
+            this.SetupDefaultMockBehavior(Architecture.Arm64);
+
+            // Expected order on a first run: grant permissions, pull the image, start the container, run the workload.
+            string expectedPath = this.mockFixture.PlatformSpecifics.Combine(this.mockFixture.PlatformSpecifics.PackagesDirectory, "superbenchmark");
+            List<string> expectedCommands = new List<string>
+            {
+                $"sudo chmod -R 2777 \"{this.mockFixture.PlatformSpecifics.CurrentDirectory}\"",
+                $"sudo docker pull testContainer",
+                $"sudo docker run -itd --name=sb-dev --privileged --net=host --ipc=host --gpus=all -w /root -v {expectedPath}:/mnt testContainer",
+                $"sudo docker exec sb-dev sb run --no-docker -l localhost -c /mnt/Test.yaml --output-dir /mnt/outputs/"
+            };
+
+            int processCount = 0;
+            this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) =>
+            {
+                Assert.AreEqual(expectedCommands.ElementAt(processCount), $"{exe} {arguments}");
+                processCount++;
+
+                return new InMemoryProcess
+                {
+                    StartInfo = new ProcessStartInfo
+                    {
+                        FileName = exe,
+                        Arguments = arguments
+                    },
+                    ExitCode = 0,
+                    OnStart = () => true,
+                    OnHasExited = () => true
+                };
+            };
+
+            this.mockFixture.StateManager.OnGetState().ReturnsAsync(JObject.FromObject(new SuperBenchmarkExecutor.SuperBenchmarkState()
+            {
+                SuperBenchmarkInitialized = false
+            }));
+
+            using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters))
+            {
+                await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);
+            }
+
+            Assert.AreEqual(4, processCount);
+        }
+
+        [Test]
+        public async Task SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecutionAfterTheFirstRunOnX64Architecture()
+        {
+            this.SetupDefaultMockBehavior(Architecture.X64);
+
             ProcessStartInfo expectedInfo = new ProcessStartInfo();
             List<string> expectedCommands = new List<string>
             {
@@ -301,6 +463,76 @@ public async Task SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecu
             Assert.IsTrue(processCount == 1);
         }
 
+        [Test]
+        public async Task SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecutionAfterTheFirstRunOnArm64Architecture()
+        {
+            this.SetupDefaultMockBehavior(Architecture.Arm64);
+
+            // When state reports initialization as already done, only the Arm64 (docker exec) workload command should run.
+            List<string> expectedCommands = new List<string>
+            {
+                $"sudo docker exec sb-dev sb run --no-docker -l localhost -c /mnt/Test.yaml --output-dir /mnt/outputs/"
+            };
+
+            int processCount = 0;
+            this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) =>
+            {
+                Assert.AreEqual(expectedCommands.ElementAt(processCount), $"{exe} {arguments}");
+                processCount++;
+
+                return new InMemoryProcess
+                {
+                    StartInfo = new ProcessStartInfo
+                    {
+                        FileName = exe,
+                        Arguments = arguments
+                    },
+                    ExitCode = 0,
+                    OnStart = () => true,
+                    OnHasExited = () => true
+                };
+            };
+
+            this.mockFixture.StateManager.OnGetState().ReturnsAsync(JObject.FromObject(new SuperBenchmarkExecutor.SuperBenchmarkState()
+            {
+                SuperBenchmarkInitialized = true
+            }));
+
+            using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters))
+            {
+                await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);
+            }
+
+            Assert.AreEqual(1, processCount);
+        }
+
+        public void SetupDefaultMockBehavior(Architecture architecture)
+        {
+            // Start from a fresh Unix fixture for the requested CPU architecture.
+            this.mockFixture = new MockFixture();
+            this.mockFixture.Setup(PlatformID.Unix, architecture);
+
+            string packagePath = this.mockFixture.PlatformSpecifics.GetPackagePath("superbenchmark");
+            this.mockPackage = new DependencyPath("SuperBenchmark", packagePath);
+            this.mockFixture.PackageManager.OnGetPackage().ReturnsAsync(this.mockPackage);
+
+            this.mockFixture.File.Reset();
+            this.mockFixture.File.Setup(f => f.Exists(It.IsAny<string>())).Returns(true);
+            this.mockFixture.FileSystem.SetupGet(fs => fs.File).Returns(this.mockFixture.File.Object);
+
+            // Directories are reported as existing, except paths matching "superbenchmark" (the later setup wins).
+            this.mockFixture.Directory.Setup(d => d.Exists(It.IsAny<string>())).Returns(true);
+            this.mockFixture.Directory.Setup(d => d.Exists(It.IsRegex("superbenchmark"))).Returns(false);
+
+            this.mockFixture.Parameters = new Dictionary<string, IConvertible>()
+            {
+                { nameof(SuperBenchmarkExecutor.Version), "0.0.1" },
+                { nameof(SuperBenchmarkExecutor.ContainerVersion), "testContainer" },
+                { nameof(SuperBenchmarkExecutor.ConfigurationFile), "Test.yaml" },
+                { nameof(SuperBenchmarkExecutor.Username), "testuser" }
+            };
+        }
+
         private class TestSuperBenchmarkExecutor : SuperBenchmarkExecutor
         {
             public TestSuperBenchmarkExecutor(IServiceCollection dependencies, IDictionary<string, IConvertible> parameters)
diff --git a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs
index afd8c2ea60..e8eb4c915f 100644
--- a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs
+++ b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs
@@ -128,9 +128,10 @@ protected override async Task ExecuteAsync(EventContext telemetryContext, Cancel
         {
             using (BackgroundOperations profiling = BackgroundOperations.BeginProfiling(this, cancellationToken))
             {
+                string command = this.CpuArchitecture == Architecture.Arm64 ? "sudo" : "sb";
                 string commandArguments = this.GetCommandLineArguments();
 
-                using (IProcessProxy process = await this.ExecuteCommandAsync("sb", commandArguments, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, runElevated: false))
+                using (IProcessProxy process = await this.ExecuteCommandAsync(command, commandArguments, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, runElevated: false))
                 {
                     if (!cancellationToken.IsCancellationRequested)
                     {
@@ -156,13 +157,18 @@ protected override async Task InitializeAsync(EventContext telemetryContext, Can
 
             if (!state.SuperBenchmarkInitialized)
             {
-                // This is to grant directory folders for 
                 await this.systemManager.MakeFilesExecutableAsync(this.PlatformSpecifics.CurrentDirectory, this.Platform, cancellationToken);
 
-                string cloneDir = this.PlatformSpecifics.Combine(this.PlatformSpecifics.PackagesDirectory, "superbenchmark");
-                if (!this.fileSystem.Directory.Exists(cloneDir))
+                if (!this.fileSystem.Directory.Exists(this.SuperBenchmarkDirectory))
                 {
-                    await this.ExecuteSbCommandAsync("git", $"clone -b v{this.Version} https://github.com/microsoft/superbenchmark", this.PlatformSpecifics.PackagesDirectory, telemetryContext, cancellationToken, true);
+                    if (this.CpuArchitecture == Architecture.Arm64)
+                    {
+                        this.fileSystem.Directory.CreateDirectory(this.SuperBenchmarkDirectory);
+                    }
+                    else
+                    {
+                        await this.ExecuteSbCommandAsync("git", $"clone -b v{this.Version} https://github.com/microsoft/superbenchmark", this.PlatformSpecifics.PackagesDirectory, telemetryContext, cancellationToken, true);
+                    }
                 }
 
                 foreach (string file in this.fileSystem.Directory.GetFiles(this.PlatformSpecifics.GetScriptPath("superbenchmark")))
@@ -173,12 +179,20 @@ protected override async Task InitializeAsync(EventContext telemetryContext, Can
                         true);
                 }
 
-                await this.ExecuteSbCommandAsync("bash", $"initialize.sh {this.Username}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true);
-                await this.ExecuteSbCommandAsync("sb", $"deploy --host-list localhost -i {this.ContainerVersion}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, false);
+                if (this.CpuArchitecture == Architecture.Arm64)
+                {
+                    await this.ExecuteSbCommandAsync("sudo", $"docker pull {this.ContainerVersion}", this.PlatformSpecifics.CurrentDirectory, telemetryContext, cancellationToken, true);
+                    await this.ExecuteSbCommandAsync("sudo", $"docker run -itd --name=sb-dev --privileged --net=host --ipc=host --gpus=all -w /root -v {this.SuperBenchmarkDirectory}:/mnt {this.ContainerVersion}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true);
+                }
+                else
+                {
+                    await this.ExecuteSbCommandAsync("bash", $"initialize.sh {this.Username}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true);
+                    await this.ExecuteSbCommandAsync("sb", $"deploy --host-list localhost -i {this.ContainerVersion}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, false);
+                }
 
                 state.SuperBenchmarkInitialized = true;
             }
-
+            
             await this.stateManager.SaveStateAsync<SuperBenchmarkState>($"{nameof(SuperBenchmarkState)}", state, cancellationToken);
         }
 
@@ -232,7 +246,14 @@ private async Task CaptureMetricsAsync(IProcessProxy process, string commandArgu
 
         private string GetCommandLineArguments()
         {
-            return @$"run --host-list localhost -c {this.ConfigurationFile}";
+            switch (this.CpuArchitecture)
+            {
+                case Architecture.Arm64:
+                    // Runs 'sb' inside the 'sb-dev' container; the config file and outputs are addressed via its /mnt mount.
+                    return @$"docker exec sb-dev sb run --no-docker -l localhost -c /mnt/{this.ConfigurationFile} --output-dir /mnt/outputs/";
+                default:
+                    return @$"run --host-list localhost -c {this.ConfigurationFile}";
+            }
         }
 
         internal class SuperBenchmarkState : State
diff --git a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/gb200_dev_config.yaml b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/gb200_dev_config.yaml
new file mode 100644
index 0000000000..20d1c2a21e
--- /dev/null
+++ b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/gb200_dev_config.yaml
@@ -0,0 +1,384 @@
+version: v0.11
+superbench:
+  enable:
+  # microbenchmark - computation
+  - kernel-launch                         # 00:53
+  - gemm-flops                            # 12:10
+  - cudnn-function                        # 03:58
+  - cublaslt-gemm                         # 10:27
+  # - cublaslt-gemm:bmm                     # 01:09:21
+  - cublas-function                       # 02:42
+  - matmul                                # 00:14
+  - gpu-burn                              # 15:07
+  # microbenchmark - communication
+  - cpu-memory-bw-latency                 # 03:30
+  - mem-bw                                # 00:50
+  - gpu-copy-bw:perf                      # 01:21
+  - gpu-copy-bw:correctness               # 00:14
+  # - gpu-stream:perf
+  #- gpu-stream:correctness
+  # NCCL
+  - nccl-bw:nvlink                        # 01:03
+  - nccl-bw:nvlink-allgather              # 01:03
+  - nccl-bw:nvlink-broadcast              # 01:03
+  - nccl-bw:nvlink-reduce                 # 01:03
+  - nccl-bw:nvlink-reducescatter          # 01:03
+  - nccl-bw:nvlink-alltoall               # 01:03
+  # microbenchmark - comput-comm. overlap
+  - computation-communication-overlap     # 04:50
+  - sharding-matmul                       # 00:25
+  # microbenchmark - storage
+  # - disk-benchmark                       # 18:47
+  # model benchmark - inference
+  # - ort-inference                         # 02:24
+  # ##tensorrt-inference                    # 02:03:33
+  - dist-inference                        # 00:35
+  # model benchmark - training
+  - model-benchmarks:gpt                  # 10:37
+  - model-benchmarks:bert                 # 15:02
+  - model-benchmarks:lstm                 # 02:23
+  - model-benchmarks:resnet               # 25:40
+  - model-benchmarks:densenet             # 07:51
+  - model-benchmarks:vgg                  # 11:51
+  - model-benchmarks:llama2-7b
+  - model-benchmarks:llama2-7b-fp32
+  - model-benchmarks:stress               # 01:00:43
+  monitor:
+    enable: false
+    sample_duration: 1
+    sample_interval: 10
+  var:
+    default_timeout: &default_timeout 600
+    default_local_mode: &default_local_mode
+      modes:
+      - name: local
+        proc_num: 4
+        prefix: CUDA_VISIBLE_DEVICES={proc_rank}
+        parallel: yes
+    default_pytorch_mode: &default_pytorch_mode
+      modes:
+      - name: torch.distributed
+        proc_num: 4
+        node_num: 1
+        env:
+          NCCL_DEBUG: WARN
+          TORCH_NCCL_ASYNC_ERROR_HANDLING: '0'
+      frameworks: [pytorch]
+    model_ddp_parameter: &model_ddp_param
+      duration: 0
+      num_warmup: 128
+      num_steps: 512
+      sample_count: 8192
+      batch_size: 128
+      precision: [float32, float16]
+      model_action: [train]
+      pin_memory: yes
+      num_workers: 0
+    nccl_parameter: &nccl_param
+      minbytes: 1K
+      maxbytes: 16G
+      stepfactor: 2
+      check: 1
+      warmup_iters: 20
+      iters: 100
+  benchmarks:
+    # microbenchmark - computation
+    kernel-launch:
+      <<: *default_local_mode
+      timeout: *default_timeout
+    gemm-flops:
+      <<: *default_local_mode
+      timeout: 1500
+    cudnn-function:
+      <<: *default_local_mode
+      timeout: *default_timeout
+    cublaslt-gemm:
+      <<: *default_local_mode
+      timeout: 1200
+      parameters:
+        in_types: ['fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2']
+        shapes:
+        - 4096,4096,4096
+        - 8192,8192,8192
+        - 16384,16384,16384
+        - 16:2048,4608,12288
+        - 16:2048,12288,1536
+    cublaslt-gemm:bmm:
+      <<: *default_local_mode
+      timeout: 7200
+      parameters:
+        in_types: ['fp64', 'fp32', 'fp16', 'bf16']
+        batch: 96:12288
+        shapes:
+        - 1,1:2048,128
+        - 1,128,1:64
+        - 1,128,256:2048
+    cublas-function:
+      <<: *default_local_mode
+      timeout: *default_timeout
+    matmul:
+      <<: *default_local_mode
+      timeout: *default_timeout
+      frameworks: [pytorch]
+    gpu-burn:
+      timeout: 1800
+      modes:
+      - name: local
+        parallel: no
+      parameters:
+        time: 90
+        doubles: true
+        tensor_core: true
+    # microbenchmark - communication
+    cpu-memory-bw-latency:
+      timeout: *default_timeout
+      modes:
+      - name: local
+        parallel: no
+      parameters:
+        tests:
+        - bandwidth_matrix
+        - latency_matrix
+        - max_bandwidth
+    mem-bw:
+      timeout: *default_timeout
+      modes:
+      - name: local
+        proc_num: 4
+        prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
+        parallel: no
+    gpu-copy-bw:perf:
+      timeout: *default_timeout
+      modes:
+      - name: local
+        parallel: no
+      parameters:
+        mem_type: [htod, dtoh, dtod]
+        copy_type: [sm, dma]
+    gpu-copy-bw:correctness:
+      timeout: *default_timeout
+      modes:
+      - name: local
+        parallel: no
+      parameters:
+        mem_type: [htod, dtoh, dtod]
+        copy_type: [sm, dma]
+        size: 4096
+        num_warm_up: 0
+        num_loops: 1
+        check_data: true
+    gpu-stream:perf:
+      timeout: 600
+      modes:
+      - name: local
+        parallel: false
+      parameters:
+        num_warm_up: 10
+        num_loops: 40
+        size: 4294967296
+    gpu-stream:correctness:
+      timeout: 600
+      modes:
+      - name: local
+        parallel: false
+      parameters:
+        num_warm_up: 0
+        num_loops: 1
+        size: 1048576
+        check_data: true
+    nccl-bw:nvlink:
+      timeout: *default_timeout
+      modes:
+      - name: mpi
+        proc_num: 4
+        node_num: 1
+      parameters:
+        <<: *nccl_param
+    nccl-bw:nvlink-allgather:
+      timeout: *default_timeout
+      modes:
+      - name: mpi
+        proc_num: 4
+        node_num: 1
+      parameters:
+        <<: *nccl_param
+        operation: allgather
+    nccl-bw:nvlink-broadcast:
+      timeout: *default_timeout
+      modes:
+      - name: mpi
+        proc_num: 4
+        node_num: 1
+      parameters:
+        <<: *nccl_param
+        operation: broadcast
+    nccl-bw:nvlink-reduce:
+      timeout: *default_timeout
+      modes:
+      - name: mpi
+        proc_num: 4
+        node_num: 1
+      parameters:
+        <<: *nccl_param
+        operation: reduce
+    nccl-bw:nvlink-reducescatter:
+      timeout: *default_timeout
+      modes:
+      - name: mpi
+        proc_num: 4
+        node_num: 1
+      parameters:
+        <<: *nccl_param
+        operation: reducescatter
+    nccl-bw:nvlink-alltoall:
+      timeout: *default_timeout
+      modes:
+      - name: mpi
+        proc_num: 4
+        node_num: 1
+      parameters:
+        <<: *nccl_param
+        operation: alltoall
+    # microbenchmark - comput-comm. overlap
+    computation-communication-overlap:
+      <<: *default_pytorch_mode
+      timeout: *default_timeout
+    sharding-matmul:
+      <<: *default_pytorch_mode
+      timeout: *default_timeout
+    # microbenchmark - storage
+    disk-benchmark:
+      timeout: 2400
+      modes:
+      - name: local
+        parallel: no
+      parameters:
+        block_devices:
+        - /dev/nvme0n1
+        - /dev/nvme1n1
+        - /dev/nvme2n1
+        - /dev/nvme3n1
+        - /dev/nvme4n1
+        - /dev/nvme5n1
+        - /dev/nvme6n1
+        - /dev/nvme7n1
+        seq_read_runtime: 60
+        rand_read_runtime: 60
+    # model benchmark - inference
+    ort-inference:
+      <<: *default_local_mode
+      timeout: *default_timeout
+    tensorrt-inference:
+      <<: *default_local_mode
+      timeout: 14400
+      parameters:
+        pytorch_models:
+        - resnet50
+        - resnet101
+        - resnet152
+        - densenet169
+        - densenet201
+        - bert-base
+        - bert-large
+        seq_length: 224
+        batch_size: 32
+        precision: fp16
+    dist-inference:
+      <<: *default_pytorch_mode
+      timeout: *default_timeout
+      parameters:
+        batch_size: 80
+        input_size: 128
+        hidden_size: 128
+        num_layers: 50
+        num_steps: 10000
+        precision: float16
+    # model benchmark - training
+    model-benchmarks:gpt:
+      <<: *default_pytorch_mode
+      timeout: 1200
+      models:
+      - gpt2-small
+      - gpt2-large
+      parameters:
+        <<: *model_ddp_param
+        precision: [float32, float16, fp8_hybrid]
+        batch_size: 32
+        seq_len: 224
+    model-benchmarks:bert:
+      <<: *default_pytorch_mode
+      timeout: 1800
+      models:
+      - bert-base
+      - bert-large
+      parameters:
+        <<: *model_ddp_param
+        precision: [float32, float16, fp8_hybrid]
+        seq_len: 224
+    model-benchmarks:lstm:
+      <<: *default_pytorch_mode
+      timeout: *default_timeout
+      models:
+      - lstm
+      parameters:
+        <<: *model_ddp_param
+        batch_size: 1024
+        input_size: 224
+        hidden_size: 1000
+        seq_len: 32
+    model-benchmarks:resnet:
+      <<: *default_pytorch_mode
+      timeout: 3000
+      models:
+      - resnet50
+      - resnet101
+      - resnet152
+      parameters:
+        <<: *model_ddp_param
+        batch_size: 384
+    model-benchmarks:densenet:
+      <<: *default_pytorch_mode
+      timeout: 1000
+      models:
+      - densenet169
+      - densenet201
+      parameters:
+        <<: *model_ddp_param
+    model-benchmarks:vgg:
+      <<: *default_pytorch_mode
+      timeout: 1500
+      models:
+      - vgg11
+      - vgg13
+      - vgg16
+      - vgg19
+      parameters:
+        <<: *model_ddp_param
+    model-benchmarks:stress:
+      <<: *default_pytorch_mode
+      timeout: 7200
+      models:
+      - gpt2-large
+      parameters:
+        <<: *model_ddp_param
+        batch_size: 32
+    model-benchmarks:llama2-7b:
+      <<: *default_pytorch_mode
+      timeout: 1500
+      models:
+      - llama2-7b
+      parameters:
+        <<: *model_ddp_param
+        batch_size: 72
+        seq_len: 256
+        precision: [float16, fp8_hybrid]
+    model-benchmarks:llama2-7b-fp32:
+      <<: *default_pytorch_mode
+      timeout: 1500
+      models:
+      - llama2-7b
+      parameters:
+        <<: *model_ddp_param
+        batch_size: 24
+        seq_len: 256
+        precision: [float32]
diff --git a/website/docs/workloads/superbenchmark/superbenchmark-profiles.md b/website/docs/workloads/superbenchmark/superbenchmark-profiles.md
index e0b08fde41..e762430be2 100644
--- a/website/docs/workloads/superbenchmark/superbenchmark-profiles.md
+++ b/website/docs/workloads/superbenchmark/superbenchmark-profiles.md
@@ -8,13 +8,14 @@ The following profiles run customer-representative or benchmarking scenarios usi
 :::
 
 ## PERF-GPU-SUPERBENCH.json
-Runs the SuperBenchmark benchmark workload to test GPU performance. <mark>This workload is <b>supported ONLY for systems that contain nVidia GPU
+Runs the SuperBenchmark benchmark workload to test GPU performance. <mark>This workload is <b>supported ONLY for systems that contain NVIDIA GPU
 hardware components</b>. Work is underway with partner teams in Azure to support additional GPU manufacturers.</mark>
 
 * [Workload Profile](https://github.com/microsoft/VirtualClient/blob/main/src/VirtualClient/VirtualClient.Main/profiles/PERF-GPU-SUPERBENCH.json) 
 
 * **Supported Platform/Architectures**
   * linux-x64
+  * linux-arm64
 
 * **Supports Disconnected Scenarios**  
   * No. Internet connection required.