From 53bd3ae9244e8b43fee5557c6740a237a92e7dd7 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni
Date: Mon, 24 Nov 2025 11:37:33 -0800
Subject: [PATCH 1/9] Build pt2.9 arm64 image

---
 dlc_developer_config.toml                     |  12 +-
 pytorch/training/buildspec-arm64-2-9-ec2.yml  |  57 ++++
 pytorch/training/buildspec-arm64.yml          |   2 +-
 .../docker/2.9/py3/cu130/Dockerfile.arm64.gpu | 307 ++++++++++++++++++
 4 files changed, 371 insertions(+), 7 deletions(-)
 create mode 100644 pytorch/training/buildspec-arm64-2-9-ec2.yml
 create mode 100644 pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 2ddfe8ccb932..3ccb3a199b08 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -15,7 +15,7 @@ neuronx_mode = false
 graviton_mode = false
 # Please only set it to true if you are preparing a ARM64 related PR
 # Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR)
-arm64_mode = false
+arm64_mode = true
 # Please only set it to True if you are preparing a HABANA related PR
 # Do remember to revert it back to False before merging any PR (including HABANA dedicated PR)
 habana_mode = false
@@ -37,12 +37,12 @@ deep_canary_mode = false

 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
+build_frameworks = ["pytorch"]

 # By default we build both training and inference containers. Set true/false values to determine which to build.
 build_training = true
-build_inference = true
+build_inference = false

 # Set do_build to "false" to skip builds and test the latest image built by this PR
 # Note: at least one build is required to set do_build to "false"
@@ -69,7 +69,7 @@ ecs_tests = true
 eks_tests = true
 ec2_tests = true
 # Set it to true if you are preparing a Benchmark related PR
-ec2_benchmark_tests = false
+ec2_benchmark_tests = true
 ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by
 ### default. If false, these types of tests will be skipped while other tests will run as usual.
@@ -78,7 +78,7 @@ ec2_benchmark_tests = false
 ec2_tests_on_heavy_instances = false
 ### SM specific tests
 ### On by default
-sagemaker_local_tests = true
+sagemaker_local_tests = false
 ### Set enable_ipv6 = true to run tests with IPv6-enabled resources
 ### Off by default (set to false)
 enable_ipv6 = false
@@ -96,7 +96,7 @@ enable_ipv6 = false
 ipv6_vpc_name = ""

 # run standard sagemaker remote tests from test/sagemaker_tests
-sagemaker_remote_tests = true
+sagemaker_remote_tests = false
 # run efa sagemaker tests
 sagemaker_efa_tests = false
 # run release_candidate_integration tests
diff --git a/pytorch/training/buildspec-arm64-2-9-ec2.yml b/pytorch/training/buildspec-arm64-2-9-ec2.yml
new file mode 100644
index 000000000000..251b258ab53f
--- /dev/null
+++ b/pytorch/training/buildspec-arm64-2-9-ec2.yml
@@ -0,0 +1,57 @@
+account_id: &ACCOUNT_ID
+prod_account_id: &PROD_ACCOUNT_ID 763104351884
+region: &REGION
+framework: &FRAMEWORK pytorch
+version: &VERSION 2.9.0
+short_version: &SHORT_VERSION "2.9"
+arch_type: arm64
+autopatch_build: "True"
+
+repository_info:
+  training_repository: &TRAINING_REPOSITORY
+    image_type: &TRAINING_IMAGE_TYPE training
+    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
+    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", arm64 ]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
+    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", arm64 ]
+    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
+
+context:
+  training_context: &TRAINING_CONTEXT
+    start_cuda_compat:
+      source: docker/build_artifacts/start_cuda_compat.sh
+      target: start_cuda_compat.sh
+    dockerd_entrypoint:
+      source: docker/build_artifacts/dockerd_entrypoint.sh
+      target: dockerd_entrypoint.sh
+    changehostname:
+      source: docker/build_artifacts/changehostname.c
+      target: changehostname.c
+    start_with_right_hostname:
+      source: docker/build_artifacts/start_with_right_hostname.sh
+      target: start_with_right_hostname.sh
+    example_mnist_file:
+      source: docker/build_artifacts/mnist.py
+      target: mnist.py
+    deep_learning_container:
+      source: ../../src/deep_learning_container.py
+      target: deep_learning_container.py
+
+images:
+  BuildEC2Arm64GPUPTTrainPy3cu130DockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: 19700
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py312
+    cuda_version: &CUDA_VERSION cu130
+    os_version: &OS_VERSION ubuntu22.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
+    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
+    # skip_build: "False"
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.arm64.,
+                         *DEVICE_TYPE ]
+    target: ec2
+    context:
+      <<: *TRAINING_CONTEXT
diff --git a/pytorch/training/buildspec-arm64.yml b/pytorch/training/buildspec-arm64.yml
index 9d949d737ea0..a5ada04a54c6 100644
--- a/pytorch/training/buildspec-arm64.yml
+++ b/pytorch/training/buildspec-arm64.yml
@@ -1 +1 @@
-buildspec_pointer: buildspec-arm64-2-7-ec2.yml
+buildspec_pointer: buildspec-arm64-2-9-ec2.yml
diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
new file mode 100644
index 000000000000..24fc8fa1e544
--- /dev/null
+++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
@@ -0,0 +1,307 @@
+ARG PYTHON=python3
+ARG PYTHON_VERSION=3.12.10
+ARG PYTHON_SHORT_VERSION=3.12
+
+ARG CUDA_VERSION=13.0.0
+ARG CUDNN_VERSION=9.13.0.50
+ARG NCCL_VERSION=2.27.7
+ARG EFA_VERSION=1.43.3
+ARG GDRCOPY_VERSION=2.5
+ARG TE_VERSION=2.9
+ARG FLASH_ATTN_VERSION=2.8.3
+
+# PyTorch Binaries
+ARG TORCH_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.9.0/arm64/cu130/torch-2.9.0%2Bcu130-cp312-cp312-manylinux_2_28_aarch64.whl
+ARG TORCHVISION_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.9.0/arm64/cu130/torchvision-0.24.0%2Bcu130-cp312-cp312-linux_aarch64.whl
+ARG TORCHAUDIO_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.9.0/arm64/cu130/torchaudio-2.9.0%2Bcu130-cp312-cp312-linux_aarch64.whl
+ARG TORCHTEXT_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.9.0/arm64/cu130/torchtext-0.18.0%2Bcu130-cp312-cp312-linux_aarch64.whl
+ARG TORCHDATA_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.9.0/arm64/cu130/torchdata-0.11.0%2Bcu130-py3-none-any.whl
+
+########################################################
+#  _____ ____ ____    ___
+# | ____/ ___|___ \  |_ _|_ __ ___   __ _  __ _  ___
+# |  _|| |     __) |  | || '_ ` _ \ / _` |/ _` |/ _ \
+# | |__| |___ / __/   | || | | | | | (_| | (_| |  __/
+# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___|
+#                                          |___/
+#  ____           _
+# |  _ \ ___  ___(_)_ __   ___
+# | |_) / _ \/ __| | '_ \ / _ \
+# |  _ <  __/ (__| | |_) |  __/
+# |_| \_\___|\___|_| .__/ \___|
+#                  |_|
+########################################################
+FROM --platform=linux/arm64 nvidia/cuda:13.0.0-base-ubuntu22.04 AS ec2
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+ARG PYTHON
+ARG PYTHON_VERSION
+ARG PYTHON_SHORT_VERSION
+
+ARG CUDA_VERSION
+ARG CUDNN_VERSION
+ARG NCCL_VERSION
+ARG EFA_VERSION
+ARG GDRCOPY_VERSION
+ARG TE_VERSION
+ARG FLASH_ATTN_VERSION
+
+ARG TORCH_URL
+ARG TORCHVISION_URL
+ARG TORCHAUDIO_URL
+ARG TORCHTEXT_URL
+ARG TORCHDATA_URL
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+
+ENV CUDA_HOME="/usr/local/cuda"
+ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}"
+ENV LD_LIBRARY_PATH="/lib/aarch64-linux-gnu:${LD_LIBRARY_PATH}"
+ENV PATH="${CUDA_HOME}/bin:${PATH}"
+ENV EFA_PATH="/opt/amazon/efa"
+ENV OPEN_MPI_PATH="/opt/amazon/openmpi"
+ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
+
+# Graviton Optimization
+ENV LRU_CACHE_CAPACITY=1024 \
+    THP_MEM_ALLOC_ENABLE=1 \
+    DNNL_DEFAULT_FPMATH_MODE=BF16
+
+ENV DLC_CONTAINER_TYPE=training
+WORKDIR /
+
+RUN apt-get update \
+ && apt-get -y upgrade --only-upgrade systemd \
+ && apt-get install -y --allow-change-held-packages --no-install-recommends \
+    automake \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    wget \
+    scons \
+    unzip \
+    emacs \
+    vim \
+    git \
+    jq \
+    cuda-toolkit-13=${CUDA_VERSION}-1 \
+    libcudnn9-cuda-13=${CUDNN_VERSION}-1 \
+    libcudnn9-dev-cuda-13=${CUDNN_VERSION}-1 \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    openjdk-17-jdk \
+    openssl \
+    libssl-dev \
+    libbz2-dev \
+    libreadline-dev \
+    libsqlite3-dev \
+    llvm \
+    libncurses5-dev \
+    libncursesw5-dev \
+    xz-utils \
+    tk-dev \
+    liblzma-dev \
+    zlib1g-dev \
+    libjpeg-dev \
+    libpng-dev \
+    libffi-dev \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+# Install EFA
+RUN mkdir /tmp/efa \
+&& cd /tmp/efa \
+&& curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
+&& tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
+&& cd aws-efa-installer \
+&& apt-get update \
+&& ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify \
+&& rm -rf /tmp/efa \
+&& rm -rf /var/lib/apt/lists/* \
+&& apt-get clean
+
+ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}"
+ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${EFA_PATH}/lib:${LD_LIBRARY_PATH}"
+
+# Configure Open MPI and NCCL parameters
+RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \
+ && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \
+ && echo "${OPEN_MPI_PATH}/bin/mpirun.real --allow-run-as-root \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \
+ && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun \
+ && echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \
+ && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \
+ && echo NCCL_DEBUG=INFO >> /etc/nccl.conf
+
+# Install OpenSSH for MPI communication between containers, and allow OpenSSH
+# to talk to containers without asking for confirmation
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends openssh-client openssh-server \
+ && mkdir -p /var/run/sshd \
+ && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \
+ && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
+ && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+# Configure OpenSSH so that nodes can communicate with each other
+RUN mkdir -p /var/run/sshd \
+ && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+RUN rm -rf /root/.ssh/ \
+ && mkdir -p /root/.ssh/ \
+ && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+ && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+# Install Python
+RUN cd /tmp/ \
+&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
+&& tar xzf Python-${PYTHON_VERSION}.tgz \
+&& cd Python-${PYTHON_VERSION} \
+&& ./configure --enable-optimizations --with-lto --with-computed-gotos --with-system-ffi \
+&& make -j "$(nproc)" \
+&& make altinstall \
+&& cd .. \
+&& rm -rf Python-${PYTHON_VERSION} \
+&& rm Python-${PYTHON_VERSION}.tgz \
+&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python \
+&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python3 \
+# This installation generates a .python_history file in the root directory, which causes the sanity check to fail
+&& rm -f /root/.python_history
+
+# Python Path
+ENV PATH="/usr/local/bin:${PATH}"
+
+# This adds a system-wide pip symlink for pip${PYTHON_SHORT_VERSION}
+RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org
+
+# Install pip packages
+RUN pip install --no-cache-dir \
+    cython \
+    boto3 \
+    scipy \
+    opencv-python \
+    numpy \
+    pyopenssl \
+    cryptography \
+    ipython \
+    parso \
+    awscli \
+    urllib3 \
+    idna \
+    tqdm \
+    requests \
+    mpi4py \
+    packaging \
+    ninja \
+    pybind11
+
+# Install PyTorch
+RUN pip install --no-cache-dir -U \
+    ${TORCH_URL} \
+    ${TORCHVISION_URL} \
+    ${TORCHAUDIO_URL} \
+    ${TORCHTEXT_URL} \
+    ${TORCHDATA_URL} \
+    torchtnt \
+    s3torchconnector \
+    accelerate
+
+# Install GDRCopy
+RUN cd /tmp \
+&& git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
+&& cd gdrcopy \
+&& sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
+&& CUDA=${CUDA_HOME} make install \
+&& rm -rf /tmp/gdrcopy
+
+# Install NCCL
+RUN cd /tmp \
+ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
+ && cd nccl \
+ && make -j64 src.build BUILDDIR=/usr/local \
+ && rm -rf /tmp/nccl
+# Preload the system NCCL for PyTorch to use if it dynamically links NCCL
+ENV LD_PRELOAD="/usr/local/lib/libnccl.so"
+
+# Install flash-attn and NVIDIA Transformer Engine.
+# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install
+ENV NVTE_FRAMEWORK=pytorch
+# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
+# Set MAX_JOBS=4 to avoid OOM issues in installation process
+RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation --verbose
+# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
+RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation
+
+# OSS compliance
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && cp ${HOME_DIR}/oss_compliance/build_from_source_packages/BUILD_FROM_SOURCE_PACKAGES_LICENCES_AARCH64_IMAGES ${HOME_DIR} \
+ && rm -rf ${HOME_DIR}/oss_compliance*
+
+# Add license
+RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt
+
+# Add telemetry
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+RUN chmod +x /usr/local/bin/deep_learning_container.py
+# COPY sitecustomize.py /usr/local/lib/python${PYTHON_SHORT_VERSION}/sitecustomize.py
+
+COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh
+RUN chmod +x /usr/local/bin/start_cuda_compat.sh
+
+COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
+RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh
+
+# Cleanup
+RUN pip cache purge \
+ && rm -rf /tmp/tmp* \
+ && rm -rf /root/.cache
+
+ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"]
+CMD ["/bin/bash"]
+
+#################################################################
+#  ____                   __  __       _
+# / ___|  __ _  __ _  ___|  \/  | __ _| | _____ _ __
+# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__|
+#  ___) | (_| | (_| |  __/ |  | | (_| |   <  __/ |
+# |____/ \__,_|\__, |\___|_|  |_|\__,_|_|\_\___|_|
+#              |___/
+#  ___                              ____           _
+# |_ _|_ __ ___   __ _  __ _  ___  |  _ \ ___  ___(_)_ __   ___
+#  | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \
+#  | || | | | | | (_| | (_| |  __/ |  _ <  __/ (__| | |_) |  __/
+# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___|
+#                      |___/                        |_|
+#
+#################################################################
+
+# FROM ec2 AS sagemaker
+
+# LABEL maintainer="Amazon AI"
+# LABEL dlc_major_version="1"
+
+# ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
+
+# ARG PYTHON
+
+# # Cleanup
+# RUN pip cache purge \
+# && rm -rf /tmp/tmp* \
+# && rm -rf /root/.cache
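A note for reviewers: before the EC2 suite added in PATCH 2 runs, the image produced by PATCH 1 can be smoke-tested locally. A minimal sketch in shell; the image tag below is hypothetical (substitute whatever tag your build produced), and the expected versions come from the ARGs and wheel URLs pinned in the Dockerfile above:

    # Hypothetical local tag -- substitute the tag your build produced.
    IMAGE=pytorch-training-arm64:2.9.0-gpu-py312-cu130-ubuntu22.04-ec2
    # Interpreter and pinned torch/CUDA versions baked into the image.
    docker run --rm ${IMAGE} python -c "import sys; print(sys.version)"   # expect 3.12.10
    docker run --rm ${IMAGE} python -c "import torch; print(torch.__version__, torch.version.cuda)"  # expect 2.9.0+cu130 13.0
    # On a GPU host, confirm the NCCL preloaded via LD_PRELOAD is picked up.
    docker run --rm --gpus all ${IMAGE} python -c "import torch; print(torch.cuda.nccl.version())"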
From 9ac33e0b8e820e39ca1685479bd830d5ea9f4b10 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni
Date: Tue, 25 Nov 2025 01:58:01 -0800
Subject: [PATCH 2/9] add ec2 test

---
 .../test_pytorch_training_arm64_2_9.py        | 97 +++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_9.py

diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_9.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_9.py
new file mode 100644
index 000000000000..0060a90a73e8
--- /dev/null
+++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_9.py
@@ -0,0 +1,97 @@
+import pytest
+
+import test.test_utils as test_utils
+
+from test.test_utils import ec2
+
+from test.dlc_tests.ec2.pytorch.training import common_cases
+from test.dlc_tests.ec2 import smclarify_cases
+
+
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.integration("pytorch_gpu_tests")
+@pytest.mark.model("N/A")
+@pytest.mark.team("conda")
+@pytest.mark.parametrize(
+    "ec2_instance_type", common_cases.PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True
+)
+@pytest.mark.parametrize(
+    "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True
+)
+def test_pytorch_2_9_gpu(
+    pytorch_training_arm64___2__9, ec2_connection, region, gpu_only, ec2_instance_type
+):
+    pytorch_training = pytorch_training_arm64___2__9
+
+    test_cases = [
+        (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
+        (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_framework_gpu, (pytorch_training, ec2_connection)),
+    ]
+
+    if "sagemaker" in pytorch_training:
+        test_cases.append(
+            (smclarify_cases.smclarify_metrics_gpu, (pytorch_training, ec2_connection)),
+        )
+
+    # AMP must be run on multi_gpu
+    if ec2.is_instance_multi_gpu(ec2_instance_type):
+        test_cases.append((common_cases.pytorch_amp, (pytorch_training, ec2_connection)))
+
+    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.9 GPU")
+
+
+# @pytest.mark.usefixtures("sagemaker")
+# @pytest.mark.integration("pytorch_gpu_heavy_tests")
+# @pytest.mark.model("N/A")
+# @pytest.mark.team("conda")
+# @pytest.mark.parametrize(
+#     "ec2_instance_type", common_cases.PT_EC2_HEAVY_GPU_ARM64_INSTANCE_TYPE, indirect=True
+# )
+# @pytest.mark.parametrize(
+#     "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True
+# )
+# @pytest.mark.skipif(
+#     test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(),
+#     reason="Skip GPU Heavy tests in PR context unless explicitly enabled",
+# )
+# def test_pytorch_2_9_gpu_heavy(
+#     pytorch_training_arm64___2__9, ec2_connection, region, gpu_only, ec2_instance_type
+# ):
+#     pytorch_training = pytorch_training_arm64___2__9
+
+#     test_cases = [
+#         (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)),
+#         (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)),
+#     ]
+
+#     test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.9 GPU Heavy")
+
+
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.integration("inductor")
+@pytest.mark.model("N/A")
+@pytest.mark.team("training-compiler")
+@pytest.mark.parametrize(
+    "ec2_instance_type", common_cases.PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True
+)
+@pytest.mark.parametrize(
+    "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True
+)
+def test_pytorch_2_9_gpu_inductor(
+    pytorch_training_arm64___2__9, ec2_connection, region, gpu_only, ec2_instance_type
+):
+    pytorch_training = pytorch_training_arm64___2__9
+
+    test_cases = [
+        (common_cases.pytorch_nccl_inductor, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)),
+    ]
+
+    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.9 GPU Inductor")

From c5de1530e6d9df516f336ca78baf0a66c060b351 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni
Date: Mon, 1 Dec 2025 22:06:06 -0800
Subject: [PATCH 3/9] update toml file to build image

---
 dlc_developer_config.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 3ccb3a199b08..a3fe65917391 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -129,7 +129,7 @@ dlc-pr-tensorflow-2-training = ""
 dlc-pr-autogluon-training = ""

 # ARM64 Training
-dlc-pr-pytorch-arm64-training = ""
+dlc-pr-pytorch-arm64-training = "pytorch/training/buildspec-arm64-2-9-ec2.yml"

 # HuggingFace Training
 dlc-pr-huggingface-tensorflow-training = ""

From 36086559c92edf5b574ed3c236243b7b10ab8e68 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni
Date: Mon, 1 Dec 2025 22:33:51 -0800
Subject: [PATCH 4/9] disable autopatch and try build image

---
 pytorch/training/buildspec-arm64-2-9-ec2.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch/training/buildspec-arm64-2-9-ec2.yml b/pytorch/training/buildspec-arm64-2-9-ec2.yml
index 251b258ab53f..e97b5b59d6d7 100644
--- a/pytorch/training/buildspec-arm64-2-9-ec2.yml
+++ b/pytorch/training/buildspec-arm64-2-9-ec2.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.9.0
 short_version: &SHORT_VERSION "2.9"
 arch_type: arm64
-autopatch_build: "True"
+#autopatch_build: "True"

 repository_info:
   training_repository: &TRAINING_REPOSITORY
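On PATCH 5 below: the cuDNN pin moves from 9.13.0.50 to 9.16.0.29, presumably because the older build has rotated out of NVIDIA's apt repository for CUDA 13. A quick way to check which pins are actually installable, sketched here under the assumption that it runs inside the nvidia/cuda:13.0.0-base-ubuntu22.04 base image with the same package names as the Dockerfile's apt lines:

    # List candidate versions for the cuDNN packages the Dockerfile pins.
    apt-get update
    apt-cache madison libcudnn9-cuda-13 libcudnn9-dev-cuda-13
    # Pick an advertised <version> and pin it the way the Dockerfile does:
    #   apt-get install -y libcudnn9-cuda-13=<version>-1 libcudnn9-dev-cuda-13=<version>-1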
From 7eb9eacfcf49790e57e4328551d5fc5b4aba3fa4 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni
Date: Mon, 1 Dec 2025 23:26:45 -0800
Subject: [PATCH 5/9] update cudnn version and try build image

---
 pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
index 24fc8fa1e544..b89382593ab2 100644
--- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
+++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
@@ -3,7 +3,7 @@ ARG PYTHON_VERSION=3.12.10
 ARG PYTHON_SHORT_VERSION=3.12

 ARG CUDA_VERSION=13.0.0
-ARG CUDNN_VERSION=9.13.0.50
+ARG CUDNN_VERSION=9.16.0.29
 ARG NCCL_VERSION=2.27.7
 ARG EFA_VERSION=1.43.3
 ARG GDRCOPY_VERSION=2.5

From a38a138eafd6eef54a7168c7e1fae774135d3162 Mon Sep 17 00:00:00 2001
From: DevakiBolleneni
Date: Tue, 2 Dec 2025 00:15:58 -0800
Subject: [PATCH 6/9] Fix GDRCopy compilation and try build image

---
 pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
index b89382593ab2..193a9440502c 100644
--- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
+++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
@@ -218,10 +218,12 @@ RUN pip install --no-cache-dir -U \
     accelerate

 # Install GDRCopy
+# Note: CUDA 13.0 on ARM64 only supports compute_90 (Hopper architecture).
 RUN cd /tmp \
 && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
 && cd gdrcopy \
 && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
+&& sed -i 's/GENCODE_FLAGS.*/GENCODE_FLAGS := -gencode arch=compute_90,code=compute_90 -gencode arch=compute_90,code=sm_90/g' tests/Makefile \
 && CUDA=${CUDA_HOME} make install \
 && rm -rf /tmp/gdrcopy

From 90bca0bd2c3673fec7a31451082b63f310d7493a Mon Sep 17 00:00:00 2001
From: DevakiBolleneni
Date: Tue, 2 Dec 2025 09:14:20 -0800
Subject: [PATCH 7/9] Fix GDRCopy compilation and try build image

---
 pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
index 193a9440502c..a829d4ca85f7 100644
--- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
+++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
@@ -223,8 +223,7 @@ RUN cd /tmp \
 && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
 && cd gdrcopy \
 && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
-&& sed -i 's/GENCODE_FLAGS.*/GENCODE_FLAGS := -gencode arch=compute_90,code=compute_90 -gencode arch=compute_90,code=sm_90/g' tests/Makefile \
-&& CUDA=${CUDA_HOME} make install \
+&& CUDA=${CUDA_HOME} NVCCFLAGS="-gencode arch=compute_90,code=compute_90 -gencode arch=compute_90,code=sm_90" make install \
 && rm -rf /tmp/gdrcopy

 # Install NCCL
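PATCHES 6 and 7 attack the same failure from two angles: GDRCopy's tests/Makefile ships a gencode list that includes architectures CUDA 13's nvcc no longer accepts, so the test build fails before `make install` completes. PATCH 7 drops the sed rewrite in favor of passing the flags through the Makefile's NVCCFLAGS variable, which is less brittle against upstream Makefile changes. A sketch for sanity-checking the flags against the toolchain before baking them in (assumes the cuda-toolkit-13 nvcc is on PATH):

    # List the compute capabilities this nvcc build accepts.
    nvcc --list-gpu-arch
    # Compile a trivial kernel with the exact flags PATCH 7 passes via NVCCFLAGS.
    echo '__global__ void k(){} int main(){return 0;}' > /tmp/probe.cu
    nvcc -gencode arch=compute_90,code=compute_90 -gencode arch=compute_90,code=sm_90 /tmp/probe.cu -o /tmp/probe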
From 79da55513bef22799d5574ebacc8a2ec76568c8e Mon Sep 17 00:00:00 2001
From: DevakiBolleneni
Date: Tue, 2 Dec 2025 11:40:28 -0800
Subject: [PATCH 8/9] increase MAX_JOBS

---
 pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
index a829d4ca85f7..1936734ee1ae 100644
--- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
+++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
@@ -240,7 +240,7 @@ ENV LD_PRELOAD="/usr/local/lib/libnccl.so"
 ENV NVTE_FRAMEWORK=pytorch
 # Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
 # Set MAX_JOBS=4 to avoid OOM issues in installation process
-RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation --verbose
+RUN MAX_JOBS=8 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation --verbose
 # Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
 RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation

From 3f9d049267fc09d24f09d7c56ba30400161a41ec Mon Sep 17 00:00:00 2001
From: DevakiBolleneni
Date: Tue, 2 Dec 2025 14:53:41 -0800
Subject: [PATCH 9/9] try build flashattention

---
 pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
index 1936734ee1ae..e3d8b9b1f39c 100644
--- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
+++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.arm64.gpu
@@ -240,7 +240,7 @@ ENV LD_PRELOAD="/usr/local/lib/libnccl.so"
 ENV NVTE_FRAMEWORK=pytorch
 # Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
 # Set MAX_JOBS=4 to avoid OOM issues in installation process
-RUN MAX_JOBS=8 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation --verbose
+RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --verbose
 # Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
 RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation
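On the last two patches: MAX_JOBS caps the number of parallel compile jobs ninja spawns during the flash-attn source build, trading build time against peak memory; PATCH 8 raised it and PATCH 9 walked it back to 4. The more consequential change in PATCH 9 is dropping --no-build-isolation: with isolation enabled (pip's default), flash-attn's build runs in a clean environment whose build-time torch is resolved from PyPI rather than the cu130 arm64 wheel already installed in the image, which is exactly the mismatch the upstream flash-attention install instructions warn about. If the isolated build succeeds, it is still worth confirming that the compiled extension loads against the image's own torch. A sketch, assuming flash_attn_2_cuda is the extension module name as in flash-attn 2.x:

    # Inside the built image: confirm flash-attn imports against this torch.
    python -c "import flash_attn; print(flash_attn.__version__)"
    # Import the compiled CUDA extension directly; an ABI/CUDA mismatch fails here.
    python -c "import torch, flash_attn_2_cuda"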