Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions dlc_developer_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ neuronx_mode = false
graviton_mode = false
# Please only set it to true if you are preparing an ARM64-related PR
# Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR)
arm64_mode = false
arm64_mode = true
# Please only set it to True if you are preparing a HABANA related PR
# Do remember to revert it back to False before merging any PR (including HABANA dedicated PR)
habana_mode = false
Expand All @@ -37,12 +37,12 @@ deep_canary_mode = false
[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
build_frameworks = ["pytorch"]


# By default we build both training and inference containers. Set true/false values to determine which to build.
build_training = true
build_inference = true
build_inference = false

# Set do_build to "false" to skip builds and test the latest image built by this PR
# Note: at least one build is required to set do_build to "false"
Expand All @@ -69,7 +69,7 @@ ecs_tests = true
eks_tests = true
ec2_tests = true
# Set it to true if you are preparing a Benchmark related PR
ec2_benchmark_tests = false
ec2_benchmark_tests = true

### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by
### default. If false, these types of tests will be skipped while other tests will run as usual.
Expand All @@ -78,7 +78,7 @@ ec2_benchmark_tests = false
ec2_tests_on_heavy_instances = false
### SM specific tests
### On by default
sagemaker_local_tests = true
sagemaker_local_tests = false
### Set enable_ipv6 = true to run tests with IPv6-enabled resources
### Off by default (set to false)
enable_ipv6 = false
Expand All @@ -96,7 +96,7 @@ enable_ipv6 = false
ipv6_vpc_name = ""

# run standard sagemaker remote tests from test/sagemaker_tests
sagemaker_remote_tests = true
sagemaker_remote_tests = false
# run efa sagemaker tests
sagemaker_efa_tests = false
# run release_candidate_integration tests
Expand Down Expand Up @@ -129,7 +129,7 @@ dlc-pr-tensorflow-2-training = ""
dlc-pr-autogluon-training = ""

# ARM64 Training
dlc-pr-pytorch-arm64-training = ""
dlc-pr-pytorch-arm64-training = "pytorch/training/buildspec-arm64-2-9-ec2.yml"

# HuggingFace Training
dlc-pr-huggingface-tensorflow-training = ""
Expand Down
57 changes: 57 additions & 0 deletions pytorch/training/buildspec-arm64-2-9-ec2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Buildspec for the PyTorch 2.9.0 ARM64 training image (EC2 target).
# Top-level scalars are anchored (&NAME) so later sections can reuse them through aliases (*NAME).
# account_id and region hold placeholder strings; per their own text they are meant to be
# supplied from the environment.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK pytorch
version: &VERSION 2.9.0
short_version: &SHORT_VERSION "2.9"
arch_type: arm64
#autopatch_build: "True"

# Repository names and URIs, assembled from the anchors above with the custom !join tag.
# NOTE(review): !join is not a standard YAML tag; it presumably concatenates its list items
# into one string -- confirm against the loader that defines it.
repository_info:
training_repository: &TRAINING_REPOSITORY
image_type: &TRAINING_IMAGE_TYPE training
root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
# PR repository (prefixed with "pr") under account_id.
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", arm64 ]
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
# Release repository (no "pr" prefix) under prod_account_id.
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", arm64 ]
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

# Build-context artifacts: each entry maps a source path to the target file name used in the context.
context:
training_context: &TRAINING_CONTEXT
start_cuda_compat:
source: docker/build_artifacts/start_cuda_compat.sh
target: start_cuda_compat.sh
dockerd_entrypoint:
source: docker/build_artifacts/dockerd_entrypoint.sh
target: dockerd_entrypoint.sh
changehostname:
source: docker/build_artifacts/changehostname.c
target: changehostname.c
start_with_right_hostname:
source: docker/build_artifacts/start_with_right_hostname.sh
target: start_with_right_hostname.sh
example_mnist_file:
source: docker/build_artifacts/mnist.py
target: mnist.py
deep_learning_container:
source: ../../src/deep_learning_container.py
target: deep_learning_container.py

# Image definitions. The single entry below merges in the training repository settings (<<:)
# and, under its own context key, the training context artifacts.
images:
# NOTE(review): the key name says "cu128" but cuda_version below is cu130 (which also feeds the
# tag and the docker_file path) -- confirm whether the key should be renamed to match.
BuildEC2Arm64GPUPTTrainPy3cu128DockerImage:
<<: *TRAINING_REPOSITORY
build: &PYTORCH_GPU_TRAINING_PY3 false
image_size_baseline: 19700
device_type: &DEVICE_TYPE gpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py312
cuda_version: &CUDA_VERSION cu130
os_version: &OS_VERSION ubuntu22.04
# tag and latest_release_tag are built from the same parts: version, device type,
# tag python version, CUDA version and OS version, followed by the "-ec2" suffix.
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
# skip_build: "False"
# Dockerfile path is derived from short_version, python_version, cuda_version and device_type.
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.arm64.,
*DEVICE_TYPE ]
target: ec2
context:
<<: *TRAINING_CONTEXT
2 changes: 1 addition & 1 deletion pytorch/training/buildspec-arm64.yml
Original file line number Diff line number Diff line change
@@ -1 +1 @@
buildspec_pointer: buildspec-arm64-2-7-ec2.yml
buildspec_pointer: buildspec-arm64-2-9-ec2.yml
Loading