diff --git a/.coveragerc_py37 b/.coveragerc_py37
new file mode 100644
index 00000000..96bb72bf
--- /dev/null
+++ b/.coveragerc_py37
@@ -0,0 +1,20 @@
+[run]
+branch = True
+timid = True
+
+[report]
+exclude_lines =
+ pragma: no cover
+ pragma: py3 no cover
+ if six.PY2
+ elif six.PY2
+
+partial_branches =
+ pragma: no cover
+ pragma: py3 no cover
+ if six.PY3
+ elif six.PY3
+
+show_missing = True
+
+fail_under = 90
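
The exclusion patterns above keep Python 2-only code paths from dragging a Python 3.7 test run below the 90% threshold: lines matching exclude_lines are dropped from the report entirely, while branches matching partial_branches are allowed to be only partially taken. A minimal sketch of how that plays out in six-based code (the helper below is hypothetical, not from this repository)::

    import six


    def read_text(path):
        if six.PY2:  # matches "if six.PY2" in exclude_lines, so this whole
            # Python 2 clause is omitted from the coverage report
            with open(path, "rb") as f:
                return f.read().decode("utf-8")
        if six.PY3:  # matches "if six.PY3" in partial_branches, so taking
            # only the Python 3 side is not flagged as a partial branch
            with open(path) as f:
                return f.read()
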
diff --git a/.flake8 b/.flake8
index a87e2f9f..83270830 100644
--- a/.flake8
+++ b/.flake8
@@ -1,3 +1,3 @@
[flake8]
-application_import_names = sagemaker_tensorflow_container, test, timeout, utils
+application_import_names = image_utils, integration, sagemaker_tensorflow_container, test, timeout, utils
import-order-style = google
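
With import-order-style = google, flake8-import-order expects imports grouped as standard library, then third party, then application code, with the names listed in application_import_names treated as first party. A sketch of an ordering that passes under the updated configuration (the exact modules are illustrative)::

    # Standard library imports first.
    import os

    # Third-party imports next.
    import pytest

    # First-party imports last; these packages match application_import_names.
    from integration import image_utils
    from sagemaker_tensorflow_container import training
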
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 00000000..978cf8cf
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,31 @@
+---
+name: Bug report
+about: File a report to help us reproduce and fix the problem
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To reproduce**
+A clear, step-by-step set of instructions to reproduce the bug.
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots or logs**
+If applicable, add screenshots or logs to help explain your problem.
+
+**System information**
+A description of your system. Please provide:
+- **Toolkit version**:
+- **Framework version**:
+- **Python version**:
+- **CPU or GPU**:
+- **Custom Docker image (Y/N)**:
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 00000000..9df79c90
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,5 @@
+blank_issues_enabled: false
+contact_links:
+ - name: Ask a question
+ url: https://stackoverflow.com/questions/tagged/amazon-sagemaker
+ about: Use Stack Overflow to ask and answer questions
diff --git a/.github/ISSUE_TEMPLATE/documentation-request.md b/.github/ISSUE_TEMPLATE/documentation-request.md
new file mode 100644
index 00000000..b64cd478
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/documentation-request.md
@@ -0,0 +1,17 @@
+---
+name: Documentation request
+about: Request improved documentation
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**What did you find confusing? Please describe.**
+A clear and concise description of what you found confusing. Ex. I tried to [...] but I didn't understand how to [...]
+
+**Describe how documentation can be improved**
+A clear and concise description of where documentation was lacking and how it can be improved.
+
+**Additional context**
+Add any other context or screenshots about the documentation request here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 00000000..bff1cb4e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest new functionality for this toolkit
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Describe the feature you'd like**
+A clear and concise description of the functionality you want.
+
+**How would this feature be used? Please describe.**
+A clear and concise description of the use case for this feature. Please provide an example, if possible.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index af391c6a..58039444 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,359 @@
# Changelog
+## v10.1.8 (2020-12-08)
+
+### Bug Fixes and Other Changes
+
+ * workaround to print stderr when capture_error is True
+
+## v10.1.7 (2020-11-06)
+
+### Bug Fixes and Other Changes
+
+ * propagate log level
+
+## v10.1.6 (2020-10-15)
+
+### Bug Fixes and Other Changes
+
+ * add condition to avoid error when 'model_dir' is None
+
+## v10.1.5 (2020-08-23)
+
+### Bug Fixes and Other Changes
+
+ * call entry_point.run with capture_error=True
+
+## v10.1.4.post4 (2020-07-01)
+
+### Testing and Release Infrastructure
+
+ * add integration test for MPI env vars propagation
+
+## v10.1.4.post3 (2020-06-29)
+
+### Testing and Release Infrastructure
+
+ * add issue templates
+
+## v10.1.4.post2 (2020-06-18)
+
+### Documentation Changes
+
+ * remove confusing information from the Readme.
+
+### Testing and Release Infrastructure
+
+ * add single-instance, multi-process Horovod test for local GPU
+
+## v10.1.4.post1 (2020-06-11)
+
+### Testing and Release Infrastructure
+
+ * Rename buildspec files.
+
+## v10.1.4.post0 (2020-06-10)
+
+### Documentation Changes
+
+ * remove functional test info from branch
+ * Update README.rst
+
+### Testing and Release Infrastructure
+
+ * Make docker folder read only, remove unused tests.
+
+## v10.1.4 (2020-06-10)
+
+### Bug Fixes and Other Changes
+
+ * bump version of sagemaker-training for script entry point fix.
+
+## v10.1.3 (2020-05-12)
+
+### Bug Fixes and Other Changes
+
+ * Bump version of sagemaker-training for typing fix
+
+### Testing and Release Infrastructure
+
+ * remove unused build scripts.
+
+## v10.1.2 (2020-05-05)
+
+### Bug Fixes and Other Changes
+
+ * Add py37 to sm tests
+
+## v10.1.1 (2020-05-04)
+
+### Bug Fixes and Other Changes
+
+ * remove sagemaker pysdk, keras_applications and keras_preprocessing in docker files
+ * Fix sm integration issues
+ * add dockerfiles for tf 1.15.2 py37 containers
+
+## v10.1.0 (2020-04-29)
+
+### Features
+
+ * Python 3.7 support
+
+### Testing and Release Infrastructure
+
+ * Fix buildspecs
+
+## v10.0.0 (2020-04-27)
+
+### Breaking Changes
+
+ * Replace sagemaker-containers with sagemaker-training
+
+### Testing and Release Infrastructure
+
+ * remove CHANGELOG entries from failed builds
+ * bump version to prepare for new version scheme
+ * add training script to benchmark directory
+ * skip image push in PR build if no changes
+
+## v2.3.2 (2020-04-07)
+
+### Bug Fixes and Other Changes
+
+ * Bump smdebug version
+
+## v2.3.1 (2020-04-06)
+
+### Bug Fixes and Other Changes
+
+ * updating pillow version of tf1.15
+
+## v2.3.0 (2020-04-02)
+
+### Features
+
+ * install sagemaker-tensorflow-toolkit from PyPI.
+
+## v2.2.8 (2020-04-01)
+
+### Bug Fixes and Other Changes
+
+ * Allowing arguments for deep_learning_container.py for tf1.15
+
+## v2.2.7.post0 (2020-03-31)
+
+### Testing and Release Infrastructure
+
+ * refactor toolkit tests.
+
+## v2.2.7 (2020-03-26)
+
+### Bug Fixes and Other Changes
+
+ * Adding of deep_learning_container.py in Tf1.15
+
+## v2.2.6 (2020-03-16)
+
+### Bug Fixes and Other Changes
+
+ * smdebug 0.7.1
+ * Added marker to skip on pipeline
+
+## v2.2.5 (2020-03-12)
+
+### Bug Fixes and Other Changes
+
+ * install smexperiments when python >= 3.6
+ * SM integration test for TF 1.x
+ * upgrade to latest sagemaker-experiments
+ * Added pytest fixture
+
+## v2.2.4 (2020-03-11)
+
+### Bug Fixes and Other Changes
+
+ * update smdebug wheel
+ * Revert "Update smdebug to 0.7.0 - TF 1.15.2 (#298)"
+
+## v2.2.3 (2020-03-10)
+
+### Bug Fixes and Other Changes
+
+ * update smdebug wheel
+ * Update smdebug to 0.7.0 - TF 1.15.2
+ * install SageMaker Python SDK into Python 3 images
+
+## v2.2.2.post0 (2020-03-05)
+
+### Testing and Release Infrastructure
+
+ * fix PR build
+
+## v2.2.2 (2020-02-20)
+
+### Bug Fixes and Other Changes
+
+ * copy all tests to test-toolkit folder.
+
+## v2.2.1 (2020-02-17)
+
+### Bug Fixes and Other Changes
+
+ * update: update r1.15.2 dockerfiles
+
+## v2.2.0 (2020-02-13)
+
+### Features
+
+ * Add release to PyPI. Change package name to sagemaker-tensorflow-training.
+
+### Bug Fixes and Other Changes
+
+ * pin awscli to latest version
+ * Pin awscli to latest
+ * bump smdebug version to 0.5.0.post0
+ * update: Update awscli version and remove related pins
+ * update: Update buildspec for TF 1.15.0
+ * update copyright year in license header
+
+### Documentation Changes
+
+ * update README.rst
+ * Add link to TF 2.0 branch
+
+### Testing and Release Infrastructure
+
+ * Add twine check during PR.
+ * properly fail build if has-matching-changes fails
+ * properly fail build if has-matching-changes fails
+
+## v0.1.0 (2020-02-12)
+
+### Features
+
+ * Add release to PyPI. Change package name to sagemaker-tensorflow-training.
+
+### Bug Fixes and Other Changes
+
+ * pin awscli to latest version
+ * Pin awscli to latest
+ * bump smdebug version to 0.5.0.post0
+ * update: Update awscli version and remove related pins
+ * update: Update buildspec for TF 1.15.0
+ * update copyright year in license header
+ * update: Release TF 1.15.0 dockerfiles
+ * use regional endpoint for STS in builds
+ * update documentation link in warning message
+ * update instance type region availability.
+ * license file was missing from root of repo.
+ * install tensorflow<2.0
+ * merge dockerfiles
+ * move script mode branch to master
+ * use last legacy mode version for --framework-version test arg default
+ * Pin pytest and pluggy to work around configparser error
+ * Use multiprocessing.Process to launch parameter server
+ * increase grpc message size limit to 2gb
+ * Fix typo in serving method name
+ * restore python-dev package in image
+ * Add default tag to functional tests
+ * update link to correct docs
+ * Add EI Dockerfile for 1.11
+ * Add EI documentation within README
+ * add Dockerfile for EI
+ * Use get_closest_marker instead of get_marker
+ * Add docker files of TF 1.12
+ * Default GRPC timeout for EI & Allow timeout to be configurable
+ * remove requests from test dependencies
+ * catch RpcError due to change in GRPC
+ * Icywang86rui gpu fix
+ * Read port range from container support for TFS port
+ * Unfreeze requests version
+ * export_saved_model: copy asset files
+ * add port to dockerfile
+ * Updated TF Pipe Mode Version
+ * Fix MKL setting
+ * Set MKL vars plus tests
+ * increase test timeout
+ * Add back https to S3
+ * Add 1.11.0 CPU and GPU Dockerfile
+ * pin requests version
+ * fix memory leak in serving
+ * Update region in s3 boto client in serve
+ * Update readme with instructions for 1.9.0 and above
+ * Fix deserialization of dicts for json predict requests
+ * Add dockerfile and update test for tensorflow 1.10.0
+ * Support tensorflow 1.9.0
+ * Add integ tests to verify that tensorflow in gpu-image can access gpu-devices.
+ * train on 3 epochs for pipe mode test
+ * Change error classes used by _default_input_fn() and _default_output_fn()
+ * Changing assertion to check only existence
+ * Install sagemaker-tensorflow from pypi. Add MKL environment variables for TF 1.8
+ * get most recent saved model to export
+ * pip install tensorflow 1.8 in 1.8 cpu image
+ * install tensorflow extensions
+ * upgrade cpu binaries in docker build
+ * Force upgrade of the framework binaries to make sure the right binaries are installed.
+ * Add Pillow to pip install list
+ * Increase train steps for cifar distributed test to mitigate race condition
+ * Add TensorFlow 1.8 dockerfiles
+ * Add TensorFlow 1.7 dockerfiles
+ * Explain how to download tf binaries from PyPI
+ * Allow training without S3
+ * Fix hyperparameter name for detecting a tuning job
+ * Checkout v1.4.1 tag instead of r1.4 branch
+ * Move processing of requirements file in.
+ * Generate checkpoint path using TRAINING_JOB_NAME environment variable if needed
+ * Wrap user-provided model_fn to pass arguments positionally (maintains compatibility with existing behavior)
+ * Add more unit tests for trainer, fix __all__ and rename train.py to avoid import conflict
+ * Use regional endpoint for S3 client
+ * Update README.rst
+ * Pass input_channels to eval_input_fn if defined
+ * Fix setup.py to refer to renamed README
+ * Add test and build instructions
+ * Fix year in license headers
+ * Add TensorFlow 1.6
+ * Add test instructions in README
+ * Add container support to install_requires
+ * Add Apache license headers
+ * Use wget to install tensorflow-model-server
+ * Fix file path for integ test
+ * Fix s3_prefix path in integ test
+ * Fix typo in path for integ test
+ * Add input_channels to train_input_fn interface.
+ * Update logging and make serving_input_fn optional.
+ * remove pip install in tensorflow training
+ * Modify integration tests to run nvidia-docker for gpu
+ * add h5py for keras models
+ * Add local integ tests & resources
+ * Restructure repo to use a directory per TF version for dockerfiles
+ * Rename "feature_map" variables to "feature_dict" to avoid overloading it with the ML term "feature map"
+ * Copying in changes from internal repo:
+ * Add functional test
+ * Fix FROM image names for final build dockerfiles
+ * Add dockerfiles for building our production images (TF 1.4)
+ * GPU Dockerfile and setup.py fixes
+ * Add base image Dockerfiles for 1.4
+ * Merge pull request #1 from aws/mvs-first-commit
+ * first commit
+ * Updating initial README.md from template
+ * Creating initial file from template
+ * Creating initial file from template
+ * Creating initial file from template
+ * Creating initial file from template
+ * Creating initial file from template
+ * Initial commit
+
+### Documentation Changes
+
+ * update README.rst
+ * Add link to TF 2.0 branch
+
+### Testing and Release Infrastructure
+
+ * Add twine check during PR.
+ * properly fail build if has-matching-changes fails
+ * properly fail build if has-matching-changes fails
+
## v0.1.0 (2019-05-22)
### Bug fixes and other changes
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 00000000..5cc14234
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,61 @@
+# Contributing Guidelines
+
+Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
+documentation, we greatly value feedback and contributions from our community.
+
+Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
+information to effectively respond to your bug report or contribution.
+
+
+## Reporting Bugs/Feature Requests
+
+We welcome you to use the GitHub issue tracker to report bugs or suggest features.
+
+When filing an issue, please check [existing open](https://github.com/aws/sagemaker-tensorflow-training-toolkit/issues), or [recently closed](https://github.com/aws/sagemaker-tensorflow-training-toolkit/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already
+reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
+
+* A reproducible test case or series of steps
+* The version of our code being used
+* Any modifications you've made relevant to the bug
+* Anything unusual about your environment or deployment
+
+
+## Contributing via Pull Requests
+Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
+
+1. You are working against the latest source on the *master* branch.
+2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
+3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
+
+To send us a pull request, please:
+
+1. Fork the repository.
+2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
+3. Ensure local tests pass.
+4. Commit to your fork using clear commit messages.
+5. Send us a pull request, answering any default questions in the pull request interface.
+6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
+
+GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
+[creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
+
+
+## Finding contributions to work on
+Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/sagemaker-tensorflow-training-toolkit/labels/help%20wanted) issues is a great place to start.
+
+
+## Code of Conduct
+This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
+For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
+opensource-codeofconduct@amazon.com with any additional questions or comments.
+
+
+## Security issue notifications
+If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
+
+
+## Licensing
+
+See the [LICENSE](https://github.com/aws/sagemaker-tensorflow-training-toolkit/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
+
+We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
diff --git a/README.rst b/README.rst
index 6d031bf9..92aed6e2 100644
--- a/README.rst
+++ b/README.rst
@@ -1,290 +1,29 @@
-===============================
-SageMaker TensorFlow Containers
-===============================
+=====================================
+SageMaker TensorFlow Training Toolkit
+=====================================
-SageMaker TensorFlow Containers is an open source library for making the
-TensorFlow framework run on `Amazon SageMaker `__.
+SageMaker TensorFlow Training Toolkit is an open-source library for using TensorFlow to train models on Amazon SageMaker.
-This repository also contains Dockerfiles which install this library, TensorFlow, and dependencies
-for building SageMaker TensorFlow images.
+For inference, see `SageMaker TensorFlow Inference Toolkit `__.
-For information on running TensorFlow jobs on SageMaker: `Python
-SDK `__.
+For the Dockerfiles used for building SageMaker TensorFlow Containers, see `AWS Deep Learning Containers `__.
+
+For information on running TensorFlow jobs on Amazon SageMaker, please refer to the `SageMaker Python SDK documentation `__.
For notebook examples: `SageMaker Notebook
Examples `__.
-Table of Contents
------------------
-
-#. `Getting Started <#getting-started>`__
-#. `Building your Image <#building-your-image>`__
-#. `Running the tests <#running-the-tests>`__
-
-Getting Started
----------------
-
-Prerequisites
-~~~~~~~~~~~~~
-
-Make sure you have installed all of the following prerequisites on your
-development machine:
-
-- `Docker `__
-
-For Testing on GPU
-^^^^^^^^^^^^^^^^^^
-
-- `Nvidia-Docker `__
-
-Recommended
-^^^^^^^^^^^
-
-- A Python environment management tool. (e.g.
- `PyEnv `__,
- `VirtualEnv `__)
-
-Building your Image
--------------------
-
-`Amazon SageMaker `__
-utilizes Docker containers to run all training jobs & inference endpoints.
-
-The Docker images are built from the Dockerfiles specified in
-`Docker/ `__.
-
-The Docker files are grouped based on TensorFlow version and separated
-based on Python version and processor type.
-
-The Docker files for TensorFlow 2.0 are available in the
-`tf-2 `__ branch, in
-`docker/2.0.0/ `__.
-
-The Docker images, used to run training & inference jobs, are built from
-both corresponding "base" and "final" Dockerfiles.
-
-Base Images
-~~~~~~~~~~~
-
-The "base" Dockerfile encompass the installation of the framework and all of the dependencies
-needed. It is needed before building image for TensorFlow 1.8.0 and before.
-Building a base image is not required for images for TensorFlow 1.9.0 and onwards.
-
-Tagging scheme is based on --. (e.g. 1.4
-.1-cpu-py2)
-
-All "final" Dockerfiles build images using base images that use the tagging scheme
-above.
-
-If you want to build your "base" Docker image, then use:
-
-::
-
- # All build instructions assume you're building from the same directory as the Dockerfile.
-
- # CPU
- docker build -t tensorflow-base:-cpu- -f Dockerfile.cpu .
-
- # GPU
- docker build -t tensorflow-base:-gpu- -f Dockerfile.gpu .
-
-::
-
- # Example
-
- # CPU
- docker build -t tensorflow-base:1.4.1-cpu-py2 -f Dockerfile.cpu .
-
- # GPU
- docker build -t tensorflow-base:1.4.1-gpu-py2 -f Dockerfile.gpu .
-
-Final Images
-~~~~~~~~~~~~
-
-The "final" Dockerfiles encompass the installation of the SageMaker specific support code.
-
-For images of TensorFlow 1.8.0 and before, all "final" Dockerfiles use `base images for building `__.
-
-These "base" images are specified with the naming convention of
-tensorflow-base:--.
-
-Before building "final" images:
-
-Build your "base" image. Make sure it is named and tagged in accordance with your "final"
-Dockerfile. Skip this step if you want to build image of Tensorflow Version 1.9.0 and above.
-
-Then prepare the SageMaker TensorFlow Container python package in the image folder like below:
-
-::
-
- # Create the SageMaker TensorFlow Container Python package.
- cd sagemaker-tensorflow-containers
- python setup.py sdist
-
- #. Copy your Python package to "final" Dockerfile directory that you are building.
- cp dist/sagemaker_tensorflow_container-.tar.gz docker//final/py2
-
-If you want to build "final" Docker images, for versions 1.6 and above, you will first need to download the appropriate tensorflow pip wheel, then pass in its location as a build argument. These can be obtained from pypi. For example, the files for 1.6.0 are here:
-
-https://pypi.org/project/tensorflow/1.6.0/#files
-https://pypi.org/project/tensorflow-gpu/1.6.0/#files
-
-Note that you need to use the tensorflow-gpu wheel when building the GPU image.
-
-Then run:
-
-::
-
- # All build instructions assumes you're building from the same directory as the Dockerfile.
-
- # CPU
- docker build -t : --build-arg py_version= --build-arg framework_installable= -f Dockerfile.cpu .
-
- # GPU
- docker build -t : --build-arg py_version= --build-arg framework_installable= -f Dockerfile.gpu .
-
-::
-
- # Example
- docker build -t preprod-tensorflow:1.6.0-cpu-py2 --build-arg py_version=2
- --build-arg framework_installable=tensorflow-1.6.0-cp27-cp27mu-manylinux1_x86_64.whl -f Dockerfile.cpu .
-
-The dockerfiles for 1.4 and 1.5 build from source instead, so when building those, you don't need to download the wheel beforehand:
-
-::
-
- # All build instructions assumes you're building from the same directory as the Dockerfile.
-
- # CPU
- docker build -t : -f Dockerfile.cpu .
-
- # GPU
- docker build -t : -f Dockerfile.gpu .
-
-::
-
- # Example
-
- # CPU
- docker build -t preprod-tensorflow:1.4.1-cpu-py2 -f Dockerfile.cpu .
-
- # GPU
- docker build -t preprod-tensorflow:1.4.1-gpu-py2 -f Dockerfile.gpu .
-
-
-Running the tests
------------------
-
-Running the tests requires installation of the SageMaker TensorFlow Container code and its test
-dependencies.
-
-::
-
- git clone https://github.com/aws/sagemaker-tensorflow-containers.git
- cd sagemaker-tensorflow-containers
- pip install -e .[test]
-
-Tests are defined in
-`test/ `__
-and include unit, integration and functional tests.
-
-Unit Tests
-~~~~~~~~~~
-
-If you want to run unit tests, then use:
-
-::
-
- # All test instructions should be run from the top level directory
-
- pytest test/unit
-
-Integration Tests
-~~~~~~~~~~~~~~~~~
-
-Running integration tests require `Docker `__ and `AWS
-credentials `__,
-as the integration tests make calls to a couple AWS services. The integration and functional
-tests require configurations specified within their respective
-`conftest.py `__.Make sure to update the account-id and region at a minimum.
-
-Integration tests on GPU require `Nvidia-Docker `__.
-
-Before running integration tests:
-
-#. Build your Docker image.
-#. Pass in the correct pytest arguments to run tests against your Docker image.
-
-If you want to run local integration tests, then use:
-
-::
-
- # Required arguments for integration tests are found in test/integ/conftest.py
-
- pytest test/integration --docker-base-name \
- --tag \
- --framework-version \
- --processor
-
-::
-
- # Example
- pytest test/integration --docker-base-name preprod-tensorflow \
- --tag 1.0 \
- --framework-version 1.4.1 \
- --processor cpu
-
-Functional Tests
-~~~~~~~~~~~~~~~~
-
-Functional tests require your Docker image to be within an `Amazon ECR repository `__.
-
-The Docker-base-name is your `ECR repository namespace `__.
-
-The instance-type is your specified `Amazon SageMaker Instance Type
-`__ that the functional test will run on.
-
-
-Before running functional tests:
-
-#. Build your Docker image.
-#. Push the image to your ECR repository.
-#. Pass in the correct pytest arguments to run tests on SageMaker against the image within your ECR repository.
-
-If you want to run a functional end to end test on `Amazon
-SageMaker `__, then use:
-
-::
-
- # Required arguments for integration tests are found in test/functional/conftest.py
-
- pytest test/functional --aws-id \
- --docker-base-name \
- --instance-type \
- --tag \
-
-::
-
- # Example
- pytest test/functional --aws-id 12345678910 \
- --docker-base-name preprod-tensorflow \
- --instance-type ml.m4.xlarge \
- --tag 1.0
-
Contributing
------------
Please read
-`CONTRIBUTING.md `__
+`CONTRIBUTING.md `__
for details on our code of conduct, and the process for submitting pull
requests to us.
License
-------
-SageMaker TensorFlow Containers is licensed under the Apache 2.0 License. It is copyright 2018
+SageMaker TensorFlow Training Toolkit is licensed under the Apache 2.0 License. It is copyright 2018
Amazon.com, Inc. or its affiliates. All Rights Reserved. The license is available at:
http://aws.amazon.com/apache2.0/
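
The trimmed README defers to the SageMaker Python SDK for launching TensorFlow training jobs. As a minimal sketch of what that looks like with the SDK 1.x interface also used by the benchmark script in this diff (the entry point, role, versions, and instance settings are illustrative placeholders)::

    from sagemaker.tensorflow import TensorFlow

    # Illustrative values only; substitute your own script, role, and instances.
    estimator = TensorFlow(
        entry_point="train.py",
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type="ml.p3.2xlarge",
        framework_version="1.15.2",
        py_version="py3",
        script_mode=True,
    )
    estimator.fit("s3://my-bucket/training-data")
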
diff --git a/VERSION b/VERSION
index eb5fc1c6..50106b6d 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.0.8.dev0
+10.1.9.dev0
diff --git a/benchmarks/horovod-resnet/execute_horovod_training.py b/benchmarks/horovod-resnet/execute_horovod_training.py
index e6ac7609..4b0b9b23 100755
--- a/benchmarks/horovod-resnet/execute_horovod_training.py
+++ b/benchmarks/horovod-resnet/execute_horovod_training.py
@@ -26,7 +26,7 @@
from sagemaker.tensorflow import TensorFlow
dir_path = os.path.dirname(os.path.realpath(__file__))
-benchmark_results_dir = os.path.join('s3://', Session().default_bucket(), 'hvd-benchmarking')
+benchmark_results_dir = os.path.join("s3://", Session().default_bucket(), "hvd-benchmarking")
@click.group()
@@ -35,93 +35,98 @@ def cli():
def generate_report():
- results_dir = os.path.join(dir_path, 'results')
+ results_dir = os.path.join(dir_path, "results")
if os.path.exists(results_dir):
shutil.rmtree(results_dir)
- subprocess.call(['aws', 's3', 'cp', '--recursive', benchmark_results_dir, results_dir])
+ subprocess.call(["aws", "s3", "cp", "--recursive", benchmark_results_dir, results_dir])
jobs = {}
for job_name in os.listdir(results_dir):
jobs[job_name] = {}
- _, instance_type, instance_count, device, py_version, _, _, _, _, _, _, _ = job_name.split('-')
+ _, instance_type, instance_count, device, py_version, _, _, _, _, _, _, _ = job_name.split(
+ "-"
+ )
current_dir = os.path.join(results_dir, job_name)
- model_dir = os.path.join(current_dir, 'output', 'model.tar.gz')
- subprocess.call(['tar', '-xvzf', model_dir], cwd=current_dir)
+ model_dir = os.path.join(current_dir, "output", "model.tar.gz")
+ subprocess.call(["tar", "-xvzf", model_dir], cwd=current_dir)
- jobs[job_name]['instance_type'] = instance_type
- jobs[job_name]['instance_count'] = instance_count
- jobs[job_name]['device'] = device
- jobs[job_name]['py_version'] = py_version
+ jobs[job_name]["instance_type"] = instance_type
+ jobs[job_name]["instance_count"] = instance_count
+ jobs[job_name]["device"] = device
+ jobs[job_name]["py_version"] = py_version
- benchmark_log = os.path.join(current_dir, 'benchmark_run.log')
+ benchmark_log = os.path.join(current_dir, "benchmark_run.log")
if os.path.exists(benchmark_log):
with open(benchmark_log) as f:
data = json.load(f)
-
- jobs[job_name]['dataset'] = data['dataset']['name']
- jobs[job_name]['num_cores'] = data['machine_config']['cpu_info']['num_cores']
- jobs[job_name]['cpu_info'] = data['machine_config']['cpu_info']['cpu_info']
- jobs[job_name]['mhz_per_cpu'] = data['machine_config']['cpu_info']['mhz_per_cpu']
- jobs[job_name]['gpu_count'] = data['machine_config']['gpu_info']['count']
- jobs[job_name]['gpu_model'] = data['machine_config']['gpu_info']['model']
+ jobs[job_name]["dataset"] = data["dataset"]["name"]
+ jobs[job_name]["num_cores"] = data["machine_config"]["cpu_info"]["num_cores"]
+ jobs[job_name]["cpu_info"] = data["machine_config"]["cpu_info"]["cpu_info"]
+ jobs[job_name]["mhz_per_cpu"] = data["machine_config"]["cpu_info"]["mhz_per_cpu"]
+ jobs[job_name]["gpu_count"] = data["machine_config"]["gpu_info"]["count"]
+ jobs[job_name]["gpu_model"] = data["machine_config"]["gpu_info"]["model"]
def find_value(parameter):
- other_key = [k for k in parameter if k != 'name'][0]
+ other_key = [k for k in parameter if k != "name"][0]
return parameter[other_key]
- for parameter in data['run_parameters']:
- jobs[job_name][parameter['name']] = find_value(parameter)
+ for parameter in data["run_parameters"]:
+ jobs[job_name][parameter["name"]] = find_value(parameter)
- jobs[job_name]['model_name'] = data['model_name']
- jobs[job_name]['run_date'] = data['run_date']
- jobs[job_name]['tensorflow_version'] = data['tensorflow_version']['version']
- jobs[job_name]['tensorflow_version_git_hash'] = data['tensorflow_version']['git_hash']
+ jobs[job_name]["model_name"] = data["model_name"]
+ jobs[job_name]["run_date"] = data["run_date"]
+ jobs[job_name]["tensorflow_version"] = data["tensorflow_version"]["version"]
+ jobs[job_name]["tensorflow_version_git_hash"] = data["tensorflow_version"][
+ "git_hash"
+ ]
return pd.DataFrame(jobs)
-@cli.command('train')
-@click.option('--framework-version', required=True, type=click.Choice(['1.11', '1.12']))
-@click.option('--device', required=True, type=click.Choice(['cpu', 'gpu']))
-@click.option('--py-versions', multiple=True, type=str)
-@click.option('--training-input-mode', default='File', type=click.Choice(['File', 'Pipe']))
-@click.option('--networking-isolation/--no-networking-isolation', default=False)
-@click.option('--wait/--no-wait', default=False)
-@click.option('--security-groups', multiple=True, type=str)
-@click.option('--subnets', multiple=True, type=str)
-@click.option('--role', default='SageMakerRole', type=str)
-@click.option('--instance-counts', multiple=True, type=int)
-@click.option('--instance-types', multiple=True, type=str)
-@click.argument('script_args', nargs=-1, type=str)
-def train(framework_version,
- device,
- py_versions,
- training_input_mode,
- networking_isolation,
- wait,
- security_groups,
- subnets,
- role,
- instance_counts,
- instance_types,
- script_args):
+@cli.command("train")
+@click.option("--framework-version", required=True, type=click.Choice(["1.11", "1.12"]))
+@click.option("--device", required=True, type=click.Choice(["cpu", "gpu"]))
+@click.option("--py-versions", multiple=True, type=str)
+@click.option("--training-input-mode", default="File", type=click.Choice(["File", "Pipe"]))
+@click.option("--networking-isolation/--no-networking-isolation", default=False)
+@click.option("--wait/--no-wait", default=False)
+@click.option("--security-groups", multiple=True, type=str)
+@click.option("--subnets", multiple=True, type=str)
+@click.option("--role", default="SageMakerRole", type=str)
+@click.option("--instance-counts", multiple=True, type=int)
+@click.option("--instance-types", multiple=True, type=str)
+@click.argument("script_args", nargs=-1, type=str)
+def train(
+ framework_version,
+ device,
+ py_versions,
+ training_input_mode,
+ networking_isolation,
+ wait,
+ security_groups,
+ subnets,
+ role,
+ instance_counts,
+ instance_types,
+ script_args,
+):
iterator = itertools.product(instance_types, py_versions, instance_counts)
for instance_type, py_version, instance_count in iterator:
base_name = job_name(instance_type, instance_count, device, py_version)
- mpi_options = '-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 -x TF_CPP_MIN_LOG_LEVEL=0 -x HOROVOD_TIMELINE --output-filename /opt/ml/model/hlog'
+ mpi_options = "-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 -x TF_CPP_MIN_LOG_LEVEL=0 -x HOROVOD_TIMELINE --output-filename /opt/ml/model/hlog"
estimator = TensorFlow(
- entry_point=os.path.join(dir_path, 'train.sh'),
+ entry_point=os.path.join(dir_path, "train.sh"),
role=role,
- dependencies=[os.path.join(dir_path, 'train_imagenet_resnet_hvd.py')],
+ dependencies=[os.path.join(dir_path, "train_imagenet_resnet_hvd.py")],
base_job_name=base_name,
train_instance_count=instance_count,
train_instance_type=instance_type,
@@ -129,36 +134,34 @@ def train(framework_version,
py_version=py_version,
script_mode=True,
hyperparameters={
- 'sagemaker_mpi_enabled': True,
- 'sagemaker_mpi_num_of_processes_per_host': 8,
- 'sagemaker_mpi_custom_mpi_options': mpi_options
+ "sagemaker_mpi_enabled": True,
+ "sagemaker_mpi_num_of_processes_per_host": 8,
+ "sagemaker_mpi_custom_mpi_options": mpi_options,
},
output_path=benchmark_results_dir,
security_group_ids=security_groups,
- subnets=subnets
+ subnets=subnets,
)
estimator.fit(wait=wait)
if wait:
- artifacts_path = os.path.join(dir_path, 'results',
- estimator.latest_training_job.job_name)
- model_path = os.path.join(artifacts_path, 'model.tar.gz')
+ artifacts_path = os.path.join(
+ dir_path, "results", estimator.latest_training_job.job_name
+ )
+ model_path = os.path.join(artifacts_path, "model.tar.gz")
os.makedirs(artifacts_path)
- subprocess.call(['aws', 's3', 'cp', estimator.model_data, model_path])
- subprocess.call(['tar', '-xvzf', model_path], cwd=artifacts_path)
+ subprocess.call(["aws", "s3", "cp", estimator.model_data, model_path])
+ subprocess.call(["tar", "-xvzf", model_path], cwd=artifacts_path)
+
+ print("Model downloaded at %s" % model_path)
- print('Model downloaded at %s' % model_path)
+def job_name(instance_type, instance_count, device, python_version):
+ instance_typename = instance_type.replace(".", "").replace("ml", "")
-def job_name(instance_type,
- instance_count,
- device,
- python_version):
- instance_typename = instance_type.replace('.', '').replace('ml', '')
+ return "hvd-%s-%s-%s-%s" % (instance_typename, instance_count, device, python_version)
- return 'hvd-%s-%s-%s-%s' % (
- instance_typename, instance_count, device, python_version)
-if __name__ == '__main__':
+if __name__ == "__main__":
cli()
diff --git a/benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py b/benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py
index d415c62d..cf0e2486 100644
--- a/benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py
+++ b/benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py
@@ -51,18 +51,26 @@
from operator import itemgetter
from tensorflow.python.util import nest
+
def rank0log(logger, *args, **kwargs):
if hvd.rank() == 0:
if logger:
- logger.info(''.join([str(x) for x in list(args)]))
+ logger.info("".join([str(x) for x in list(args)]))
else:
print(*args, **kwargs)
class LayerBuilder(object):
- def __init__(self, activation=None, data_format='channels_last',
- training=False, use_batch_norm=False, batch_norm_config=None,
- conv_initializer=None, adv_bn_init=False):
+ def __init__(
+ self,
+ activation=None,
+ data_format="channels_last",
+ training=False,
+ use_batch_norm=False,
+ batch_norm_config=None,
+ conv_initializer=None,
+ adv_bn_init=False,
+ ):
self.activation = activation
self.data_format = data_format
self.training = training
@@ -72,19 +80,22 @@ def __init__(self, activation=None, data_format='channels_last',
self.adv_bn_init = adv_bn_init
if self.batch_norm_config is None:
self.batch_norm_config = {
- 'decay': 0.9,
- 'epsilon': 1e-4,
- 'scale': True,
- 'zero_debias_moving_mean': False,
+ "decay": 0.9,
+ "epsilon": 1e-4,
+ "scale": True,
+ "zero_debias_moving_mean": False,
}
def _conv2d(self, inputs, activation, *args, **kwargs):
x = tf.layers.conv2d(
- inputs, data_format=self.data_format,
+ inputs,
+ data_format=self.data_format,
use_bias=not self.use_batch_norm,
kernel_initializer=self.conv_initializer,
activation=None if self.use_batch_norm else activation,
- *args, **kwargs)
+ *args,
+ **kwargs
+ )
if self.use_batch_norm:
x = self.batch_norm(x)
x = activation(x) if activation is not None else x
@@ -92,19 +103,23 @@ def _conv2d(self, inputs, activation, *args, **kwargs):
def conv2d_linear_last_bn(self, inputs, *args, **kwargs):
x = tf.layers.conv2d(
- inputs, data_format=self.data_format,
+ inputs,
+ data_format=self.data_format,
use_bias=False,
kernel_initializer=self.conv_initializer,
- activation=None, *args, **kwargs)
+ activation=None,
+ *args,
+ **kwargs
+ )
param_initializers = {
- 'moving_mean': tf.zeros_initializer(),
- 'moving_variance': tf.ones_initializer(),
- 'beta': tf.zeros_initializer(),
+ "moving_mean": tf.zeros_initializer(),
+ "moving_variance": tf.ones_initializer(),
+ "beta": tf.zeros_initializer(),
}
if self.adv_bn_init:
- param_initializers['gamma'] = tf.zeros_initializer()
+ param_initializers["gamma"] = tf.zeros_initializer()
else:
- param_initializers['gamma'] = tf.ones_initializer()
+ param_initializers["gamma"] = tf.ones_initializer()
x = self.batch_norm(x, param_initializers=param_initializers)
return x
@@ -125,19 +140,17 @@ def pad2d(self, inputs, begin, end=None):
_ = end[1]
except TypeError:
end = [end, end]
- if self.data_format == 'channels_last':
+ if self.data_format == "channels_last":
padding = [[0, 0], [begin[0], end[0]], [begin[1], end[1]], [0, 0]]
else:
padding = [[0, 0], [0, 0], [begin[0], end[0]], [begin[1], end[1]]]
return tf.pad(inputs, padding)
def max_pooling2d(self, inputs, *args, **kwargs):
- return tf.layers.max_pooling2d(
- inputs, data_format=self.data_format, *args, **kwargs)
+ return tf.layers.max_pooling2d(inputs, data_format=self.data_format, *args, **kwargs)
def average_pooling2d(self, inputs, *args, **kwargs):
- return tf.layers.average_pooling2d(
- inputs, data_format=self.data_format, *args, **kwargs)
+ return tf.layers.average_pooling2d(inputs, data_format=self.data_format, *args, **kwargs)
def dense_linear(self, inputs, units, **kwargs):
return tf.layers.dense(inputs, units, activation=None)
@@ -152,72 +165,72 @@ def activate(self, inputs, activation=None):
def batch_norm(self, inputs, **kwargs):
all_kwargs = dict(self.batch_norm_config)
all_kwargs.update(kwargs)
- data_format = 'NHWC' if self.data_format == 'channels_last' else 'NCHW'
+ data_format = "NHWC" if self.data_format == "channels_last" else "NCHW"
return tf.contrib.layers.batch_norm(
- inputs, is_training=self.training, data_format=data_format,
- fused=True, **all_kwargs)
+ inputs, is_training=self.training, data_format=data_format, fused=True, **all_kwargs
+ )
def spatial_average2d(self, inputs):
shape = inputs.get_shape().as_list()
- if self.data_format == 'channels_last':
+ if self.data_format == "channels_last":
n, h, w, c = shape
else:
n, c, h, w = shape
n = -1 if n is None else n
- x = tf.layers.average_pooling2d(inputs, (h, w), (1, 1),
- data_format=self.data_format)
+ x = tf.layers.average_pooling2d(inputs, (h, w), (1, 1), data_format=self.data_format)
return tf.reshape(x, [n, c])
def flatten2d(self, inputs):
x = inputs
- if self.data_format != 'channel_last':
+ if self.data_format != "channel_last":
# Note: This ensures the output order matches that of NHWC networks
x = tf.transpose(x, [0, 2, 3, 1])
input_shape = x.get_shape().as_list()
num_inputs = 1
for dim in input_shape[1:]:
num_inputs *= dim
- return tf.reshape(x, [-1, num_inputs], name='flatten')
+ return tf.reshape(x, [-1, num_inputs], name="flatten")
def residual2d(self, inputs, network, units=None, scale=1.0, activate=False):
outputs = network(inputs)
- c_axis = -1 if self.data_format == 'channels_last' else 1
- h_axis = 1 if self.data_format == 'channels_last' else 2
+ c_axis = -1 if self.data_format == "channels_last" else 1
+ h_axis = 1 if self.data_format == "channels_last" else 2
w_axis = h_axis + 1
ishape, oshape = [y.get_shape().as_list() for y in [inputs, outputs]]
ichans, ochans = ishape[c_axis], oshape[c_axis]
- strides = ((ishape[h_axis] - 1) // oshape[h_axis] + 1,
- (ishape[w_axis] - 1) // oshape[w_axis] + 1)
- with tf.name_scope('residual'):
- if (ochans != ichans or strides[0] != 1 or strides[1] != 1):
- inputs = self.conv2d_linear(inputs, units, 1, strides, 'SAME')
+ strides = (
+ (ishape[h_axis] - 1) // oshape[h_axis] + 1,
+ (ishape[w_axis] - 1) // oshape[w_axis] + 1,
+ )
+ with tf.name_scope("residual"):
+ if ochans != ichans or strides[0] != 1 or strides[1] != 1:
+ inputs = self.conv2d_linear(inputs, units, 1, strides, "SAME")
x = inputs + scale * outputs
if activate:
x = self.activate(x)
return x
-def resnet_bottleneck_v1(builder, inputs, depth, depth_bottleneck, stride,
- basic=False):
+def resnet_bottleneck_v1(builder, inputs, depth, depth_bottleneck, stride, basic=False):
num_inputs = inputs.get_shape().as_list()[1]
x = inputs
- with tf.name_scope('resnet_v1'):
+ with tf.name_scope("resnet_v1"):
if depth == num_inputs:
if stride == 1:
shortcut = x
else:
shortcut = builder.max_pooling2d(x, 1, stride)
else:
- shortcut = builder.conv2d_linear(x, depth, 1, stride, 'SAME')
+ shortcut = builder.conv2d_linear(x, depth, 1, stride, "SAME")
if basic:
x = builder.pad2d(x, 1)
- x = builder.conv2d(x, depth_bottleneck, 3, stride, 'VALID')
- x = builder.conv2d_linear(x, depth, 3, 1, 'SAME')
+ x = builder.conv2d(x, depth_bottleneck, 3, stride, "VALID")
+ x = builder.conv2d_linear(x, depth, 3, 1, "SAME")
else:
- x = builder.conv2d(x, depth_bottleneck, 1, 1, 'SAME')
- x = builder.conv2d(x, depth_bottleneck, 3, stride, 'SAME')
+ x = builder.conv2d(x, depth_bottleneck, 1, 1, "SAME")
+ x = builder.conv2d(x, depth_bottleneck, 3, stride, "SAME")
# x = builder.conv2d_linear(x, depth, 1, 1, 'SAME')
- x = builder.conv2d_linear_last_bn(x, depth, 1, 1, 'SAME')
+ x = builder.conv2d_linear_last_bn(x, depth, 1, 1, "SAME")
x = tf.nn.relu(x + shortcut)
return x
@@ -225,8 +238,8 @@ def resnet_bottleneck_v1(builder, inputs, depth, depth_bottleneck, stride,
def inference_resnet_v1_impl(builder, inputs, layer_counts, basic=False):
x = inputs
x = builder.pad2d(x, 3)
- x = builder.conv2d(x, 64, 7, 2, 'VALID')
- x = builder.max_pooling2d(x, 3, 2, 'SAME')
+ x = builder.conv2d(x, 64, 7, 2, "VALID")
+ x = builder.max_pooling2d(x, 3, 2, "SAME")
for i in range(layer_counts[0]):
x = resnet_bottleneck_v1(builder, x, 256, 64, 1, basic)
for i in range(layer_counts[1]):
@@ -238,13 +251,25 @@ def inference_resnet_v1_impl(builder, inputs, layer_counts, basic=False):
return builder.spatial_average2d(x)
-def inference_resnet_v1(inputs, nlayer, data_format='channels_last',
- training=False, conv_initializer=None, adv_bn_init=False):
+def inference_resnet_v1(
+ inputs,
+ nlayer,
+ data_format="channels_last",
+ training=False,
+ conv_initializer=None,
+ adv_bn_init=False,
+):
"""Deep Residual Networks family of models
https://arxiv.org/abs/1512.03385
"""
- builder = LayerBuilder(tf.nn.relu, data_format, training, use_batch_norm=True,
- conv_initializer=conv_initializer, adv_bn_init=adv_bn_init)
+ builder = LayerBuilder(
+ tf.nn.relu,
+ data_format,
+ training,
+ use_batch_norm=True,
+ conv_initializer=conv_initializer,
+ adv_bn_init=adv_bn_init,
+ )
if nlayer == 18:
return inference_resnet_v1_impl(builder, inputs, [2, 2, 2, 2], basic=True)
elif nlayer == 34:
@@ -256,83 +281,95 @@ def inference_resnet_v1(inputs, nlayer, data_format='channels_last',
elif nlayer == 152:
return inference_resnet_v1_impl(builder, inputs, [3, 8, 36, 3])
else:
- raise ValueError("Invalid nlayer (%i); must be one of: 18,34,50,101,152" %
- nlayer)
+ raise ValueError("Invalid nlayer (%i); must be one of: 18,34,50,101,152" % nlayer)
def get_model_func(model_name):
- if model_name.startswith('resnet'):
- nlayer = int(model_name[len('resnet'):])
- return lambda images, *args, **kwargs: \
- inference_resnet_v1(images, nlayer, *args, **kwargs)
+ if model_name.startswith("resnet"):
+ nlayer = int(model_name[len("resnet") :])
+ return lambda images, *args, **kwargs: inference_resnet_v1(images, nlayer, *args, **kwargs)
else:
raise ValueError("Invalid model type: %s" % model_name)
def deserialize_image_record(record):
feature_map = {
- 'image/encoded': tf.FixedLenFeature([], tf.string, ''),
- 'image/class/label': tf.FixedLenFeature([1], tf.int64, -1),
- 'image/class/text': tf.FixedLenFeature([], tf.string, ''),
- 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
- 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
- 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
- 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32)
+ "image/encoded": tf.FixedLenFeature([], tf.string, ""),
+ "image/class/label": tf.FixedLenFeature([1], tf.int64, -1),
+ "image/class/text": tf.FixedLenFeature([], tf.string, ""),
+ "image/object/bbox/xmin": tf.VarLenFeature(dtype=tf.float32),
+ "image/object/bbox/ymin": tf.VarLenFeature(dtype=tf.float32),
+ "image/object/bbox/xmax": tf.VarLenFeature(dtype=tf.float32),
+ "image/object/bbox/ymax": tf.VarLenFeature(dtype=tf.float32),
}
- with tf.name_scope('deserialize_image_record'):
+ with tf.name_scope("deserialize_image_record"):
obj = tf.parse_single_example(record, feature_map)
- imgdata = obj['image/encoded']
- label = tf.cast(obj['image/class/label'], tf.int32)
- bbox = tf.stack([obj['image/object/bbox/%s' % x].values
- for x in ['ymin', 'xmin', 'ymax', 'xmax']])
+ imgdata = obj["image/encoded"]
+ label = tf.cast(obj["image/class/label"], tf.int32)
+ bbox = tf.stack(
+ [obj["image/object/bbox/%s" % x].values for x in ["ymin", "xmin", "ymax", "xmax"]]
+ )
bbox = tf.transpose(tf.expand_dims(bbox, 0), [0, 2, 1])
- text = obj['image/class/text']
+ text = obj["image/class/text"]
return imgdata, label, bbox, text
def decode_jpeg(imgdata, channels=3):
- return tf.image.decode_jpeg(imgdata, channels=channels,
- fancy_upscaling=False,
- dct_method='INTEGER_FAST')
+ return tf.image.decode_jpeg(
+ imgdata, channels=channels, fancy_upscaling=False, dct_method="INTEGER_FAST"
+ )
-def crop_and_resize_image(image, original_bbox, height, width,
- distort=False, nsummary=10):
- with tf.name_scope('crop_and_resize'):
+def crop_and_resize_image(image, original_bbox, height, width, distort=False, nsummary=10):
+ with tf.name_scope("crop_and_resize"):
# Evaluation is done on a center-crop of this ratio
eval_crop_ratio = 0.8
if distort:
- initial_shape = [int(round(height / eval_crop_ratio)),
- int(round(width / eval_crop_ratio)),
- 3]
- bbox_begin, bbox_size, bbox = \
- tf.image.sample_distorted_bounding_box(
- initial_shape,
- bounding_boxes=tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
- # tf.zeros(shape=[1,0,4]), # No bounding boxes
- min_object_covered=0.1,
- aspect_ratio_range=[3. / 4., 4. / 3.],
- area_range=[0.08, 1.0],
- max_attempts=100,
- seed=11 * hvd.rank(), # Need to set for deterministic results
- use_image_if_no_bounding_boxes=True)
+ initial_shape = [
+ int(round(height / eval_crop_ratio)),
+ int(round(width / eval_crop_ratio)),
+ 3,
+ ]
+ bbox_begin, bbox_size, bbox = tf.image.sample_distorted_bounding_box(
+ initial_shape,
+ bounding_boxes=tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
+ # tf.zeros(shape=[1,0,4]), # No bounding boxes
+ min_object_covered=0.1,
+ aspect_ratio_range=[3.0 / 4.0, 4.0 / 3.0],
+ area_range=[0.08, 1.0],
+ max_attempts=100,
+ seed=11 * hvd.rank(), # Need to set for deterministic results
+ use_image_if_no_bounding_boxes=True,
+ )
bbox = bbox[0, 0] # Remove batch, box_idx dims
else:
# Central crop
ratio_y = ratio_x = eval_crop_ratio
- bbox = tf.constant([0.5 * (1 - ratio_y), 0.5 * (1 - ratio_x),
- 0.5 * (1 + ratio_y), 0.5 * (1 + ratio_x)])
- image = tf.image.crop_and_resize(
- image[None, :, :, :], bbox[None, :], [0], [height, width])[0]
+ bbox = tf.constant(
+ [0.5 * (1 - ratio_y), 0.5 * (1 - ratio_x), 0.5 * (1 + ratio_y), 0.5 * (1 + ratio_x)]
+ )
+ image = tf.image.crop_and_resize(image[None, :, :, :], bbox[None, :], [0], [height, width])[
+ 0
+ ]
return image
-def parse_and_preprocess_image_record(record, counter, height, width,
- brightness, contrast, saturation, hue,
- distort=False, nsummary=10, increased_aug=False):
+def parse_and_preprocess_image_record(
+ record,
+ counter,
+ height,
+ width,
+ brightness,
+ contrast,
+ saturation,
+ hue,
+ distort=False,
+ nsummary=10,
+ increased_aug=False,
+):
imgdata, label, bbox, text = deserialize_image_record(record)
label -= 1 # Change to 0-based (don't use background class)
- with tf.name_scope('preprocess_train'):
+ with tf.name_scope("preprocess_train"):
try:
image = decode_jpeg(imgdata, channels=3)
except:
@@ -342,24 +379,44 @@ def parse_and_preprocess_image_record(record, counter, height, width,
image = tf.image.random_flip_left_right(image)
if increased_aug:
image = tf.image.random_brightness(image, max_delta=brightness)
- image = distort_image_ops.random_hsv_in_yiq(image,
- lower_saturation=saturation,
- upper_saturation=2.0 - saturation,
- max_delta_hue=hue * math.pi)
+ image = distort_image_ops.random_hsv_in_yiq(
+ image,
+ lower_saturation=saturation,
+ upper_saturation=2.0 - saturation,
+ max_delta_hue=hue * math.pi,
+ )
image = tf.image.random_contrast(image, lower=contrast, upper=2.0 - contrast)
- tf.summary.image('distorted_color_image', tf.expand_dims(image, 0))
- image = tf.clip_by_value(image, 0., 255.)
+ tf.summary.image("distorted_color_image", tf.expand_dims(image, 0))
+ image = tf.clip_by_value(image, 0.0, 255.0)
image = tf.cast(image, tf.uint8)
return image, label
-def make_dataset(filenames, take_count, batch_size, height, width,
- brightness, contrast, saturation, hue,
- training=False, num_threads=10, nsummary=10, shard=False, synthetic=False,
- increased_aug=False):
+
+def make_dataset(
+ filenames,
+ take_count,
+ batch_size,
+ height,
+ width,
+ brightness,
+ contrast,
+ saturation,
+ hue,
+ training=False,
+ num_threads=10,
+ nsummary=10,
+ shard=False,
+ synthetic=False,
+ increased_aug=False,
+):
if synthetic and training:
input_shape = [height, width, 3]
- input_element = nest.map_structure(lambda s: tf.constant(0.5, tf.float32, s), tf.TensorShape(input_shape))
- label_element = nest.map_structure(lambda s: tf.constant(1, tf.int32, s), tf.TensorShape([1]))
+ input_element = nest.map_structure(
+ lambda s: tf.constant(0.5, tf.float32, s), tf.TensorShape(input_shape)
+ )
+ label_element = nest.map_structure(
+ lambda s: tf.constant(1, tf.int32, s), tf.TensorShape([1])
+ )
element = (input_element, label_element)
ds = tf.data.Dataset.from_tensors(element).repeat()
else:
@@ -380,16 +437,29 @@ def make_dataset(filenames, take_count, batch_size, height, width,
if training:
ds = ds.shuffle(1000, seed=7 * (1 + hvd.rank()))
- ds = ds.interleave(
- tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1)
+ ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1)
counter = tf.data.Dataset.range(sys.maxsize)
ds = tf.data.Dataset.zip((ds, counter))
preproc_func = lambda record, counter_: parse_and_preprocess_image_record(
- record, counter_, height, width, brightness, contrast, saturation, hue,
- distort=training, nsummary=nsummary if training else 0, increased_aug=increased_aug)
+ record,
+ counter_,
+ height,
+ width,
+ brightness,
+ contrast,
+ saturation,
+ hue,
+ distort=training,
+ nsummary=nsummary if training else 0,
+ increased_aug=increased_aug,
+ )
ds = ds.map(preproc_func, num_parallel_calls=num_threads)
if training:
- ds = ds.apply(tf.data.experimental.shuffle_and_repeat(shuffle_buffer_size, seed=5*(1+hvd.rank())))
+ ds = ds.apply(
+ tf.data.experimental.shuffle_and_repeat(
+ shuffle_buffer_size, seed=5 * (1 + hvd.rank())
+ )
+ )
ds = ds.batch(batch_size)
return ds
@@ -399,18 +469,19 @@ def stage(tensors):
"""
stage_area = data_flow_ops.StagingArea(
dtypes=[tensor.dtype for tensor in tensors],
- shapes=[tensor.get_shape() for tensor in tensors])
+ shapes=[tensor.get_shape() for tensor in tensors],
+ )
put_op = stage_area.put(tensors)
get_tensors = stage_area.get()
- tf.add_to_collection('STAGING_AREA_PUTS', put_op)
+ tf.add_to_collection("STAGING_AREA_PUTS", put_op)
return put_op, get_tensors
class PrefillStagingAreasHook(tf.train.SessionRunHook):
def after_create_session(self, session, coord):
- enqueue_ops = tf.get_collection('STAGING_AREA_PUTS')
+ enqueue_ops = tf.get_collection("STAGING_AREA_PUTS")
for i in range(len(enqueue_ops)):
- session.run(enqueue_ops[:i + 1])
+ session.run(enqueue_ops[: i + 1])
class LogSessionRunHook(tf.train.SessionRunHook):
@@ -421,15 +492,15 @@ def __init__(self, global_batch_size, num_records, display_every=10, logger=None
self.logger = logger
def after_create_session(self, session, coord):
- rank0log(self.logger, ' Step Epoch Speed Loss FinLoss LR')
- self.elapsed_secs = 0.
+ rank0log(self.logger, " Step Epoch Speed Loss FinLoss LR")
+ self.elapsed_secs = 0.0
self.count = 0
def before_run(self, run_context):
self.t0 = time.time()
return tf.train.SessionRunArgs(
- fetches=[tf.train.get_global_step(),
- 'loss:0', 'total_loss:0', 'learning_rate:0'])
+ fetches=[tf.train.get_global_step(), "loss:0", "total_loss:0", "learning_rate:0"]
+ )
def after_run(self, run_context, run_values):
self.elapsed_secs += time.time() - self.t0
@@ -439,25 +510,37 @@ def after_run(self, run_context, run_values):
dt = self.elapsed_secs / self.count
img_per_sec = self.global_batch_size / dt
epoch = global_step * self.global_batch_size / self.num_records
- self.logger.info('%6i %5.1f %7.1f %6.3f %6.3f %7.5f' %
- (global_step, epoch, img_per_sec, loss, total_loss, lr))
- self.elapsed_secs = 0.
+ self.logger.info(
+ "%6i %5.1f %7.1f %6.3f %6.3f %7.5f"
+ % (global_step, epoch, img_per_sec, loss, total_loss, lr)
+ )
+ self.elapsed_secs = 0.0
self.count = 0
-def _fp32_trainvar_getter(getter, name, shape=None, dtype=None,
- trainable=True, regularizer=None,
- *args, **kwargs):
+def _fp32_trainvar_getter(
+ getter, name, shape=None, dtype=None, trainable=True, regularizer=None, *args, **kwargs
+):
storage_dtype = tf.float32 if trainable else dtype
- variable = getter(name, shape, dtype=storage_dtype,
- trainable=trainable,
- regularizer=regularizer if trainable and 'BatchNorm' not in name and 'batchnorm' not in name and 'batch_norm' not in name and 'Batch_Norm' not in name else None,
- *args, **kwargs)
+ variable = getter(
+ name,
+ shape,
+ dtype=storage_dtype,
+ trainable=trainable,
+ regularizer=regularizer
+ if trainable
+ and "BatchNorm" not in name
+ and "batchnorm" not in name
+ and "batch_norm" not in name
+ and "Batch_Norm" not in name
+ else None,
+ *args,
+ **kwargs
+ )
if trainable and dtype != tf.float32:
- cast_name = name + '/fp16_cast'
+ cast_name = name + "/fp16_cast"
try:
- cast_variable = tf.get_default_graph().get_tensor_by_name(
- cast_name + ':0')
+ cast_variable = tf.get_default_graph().get_tensor_by_name(cast_name + ":0")
except KeyError:
cast_variable = tf.cast(variable, dtype, name=cast_name)
cast_variable._ref = variable._ref
@@ -465,31 +548,26 @@ def _fp32_trainvar_getter(getter, name, shape=None, dtype=None,
return variable
-def fp32_trainable_vars(name='fp32_vars', *args, **kwargs):
+def fp32_trainable_vars(name="fp32_vars", *args, **kwargs):
"""A varible scope with custom variable getter to convert fp16 trainable
variables with fp32 storage followed by fp16 cast.
"""
- return tf.variable_scope(
- name, custom_getter=_fp32_trainvar_getter, *args, **kwargs)
+ return tf.variable_scope(name, custom_getter=_fp32_trainvar_getter, *args, **kwargs)
class MixedPrecisionOptimizer(tf.train.Optimizer):
"""An optimizer that updates trainable variables in fp32."""
- def __init__(self, optimizer,
- scale=None,
- name="MixedPrecisionOptimizer",
- use_locking=False):
- super(MixedPrecisionOptimizer, self).__init__(
- name=name, use_locking=use_locking)
+ def __init__(self, optimizer, scale=None, name="MixedPrecisionOptimizer", use_locking=False):
+ super(MixedPrecisionOptimizer, self).__init__(name=name, use_locking=use_locking)
self._optimizer = optimizer
self._scale = float(scale) if scale is not None else 1.0
def compute_gradients(self, loss, var_list=None, *args, **kwargs):
if var_list is None:
- var_list = (
- tf.trainable_variables() +
- tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+ var_list = tf.trainable_variables() + tf.get_collection(
+ tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES
+ )
replaced_list = var_list
@@ -503,7 +581,7 @@ def compute_gradients(self, loss, var_list=None, *args, **kwargs):
if var is not orig_var:
grad = tf.cast(grad, orig_var.dtype)
if self._scale != 1.0:
- grad = tf.scalar_mul(1. / self._scale, grad)
+ grad = tf.scalar_mul(1.0 / self._scale, grad)
final_gradvar.append((grad, orig_var))
return final_gradvar
@@ -511,6 +589,7 @@ def compute_gradients(self, loss, var_list=None, *args, **kwargs):
def apply_gradients(self, *args, **kwargs):
return self._optimizer.apply_gradients(*args, **kwargs)
+
class LarcOptimizer(tf.train.Optimizer):
""" LARC implementation
-------------------
@@ -524,10 +603,17 @@ class LarcOptimizer(tf.train.Optimizer):
- use_locking
"""
- def __init__(self, optimizer, learning_rate, eta, clip=True, epsilon=1.,
- name="LarcOptimizer", use_locking=False):
- super(LarcOptimizer, self).__init__(
- name=name, use_locking=use_locking)
+ def __init__(
+ self,
+ optimizer,
+ learning_rate,
+ eta,
+ clip=True,
+ epsilon=1.0,
+ name="LarcOptimizer",
+ use_locking=False,
+ ):
+ super(LarcOptimizer, self).__init__(name=name, use_locking=use_locking)
self._optimizer = optimizer
self._learning_rate = learning_rate
self._eta = float(eta)
@@ -539,16 +625,13 @@ def compute_gradients(self, *args, **kwargs):
def apply_gradients(self, gradvars, *args, **kwargs):
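+        # Sketch of the LARC scaling applied below:
+        #   local_lr_i = eta * ||w_i|| / ||g_i||   (epsilon where either norm is zero)
+        # Because the wrapped optimizer's learning rate is fixed, each gradient is scaled
+        # by min(local_lr_i / lr, 1) rather than adjusting the learning rate itself.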
v_list = [tf.norm(tensor=v, ord=2) for _, v in gradvars]
- g_list = [tf.norm(tensor=g, ord=2) if g is not None else 0.0
- for g, _ in gradvars]
+ g_list = [tf.norm(tensor=g, ord=2) if g is not None else 0.0 for g, _ in gradvars]
v_norms = tf.stack(v_list)
g_norms = tf.stack(g_list)
zeds = tf.zeros_like(v_norms)
# assign epsilon if weights or grads = 0, to avoid division by zero
        # also prevent biases from getting stuck at initialization (0.)
- cond = tf.logical_and(
- tf.not_equal(v_norms, zeds),
- tf.not_equal(g_norms, zeds))
+ cond = tf.logical_and(tf.not_equal(v_norms, zeds), tf.not_equal(g_norms, zeds))
true_vals = tf.scalar_mul(self._eta, tf.div(v_norms, g_norms))
# true_vals = tf.scalar_mul(tf.cast(self._eta, tf.float32), tf.div(tf.cast(v_norms, tf.float32), tf.cast(g_norms, tf.float32)))
false_vals = tf.fill(tf.shape(v_norms), self._epsilon)
@@ -561,9 +644,10 @@ def apply_gradients(self, gradvars, *args, **kwargs):
# for which learning rate is already fixed
# We then have to scale the gradients instead of the learning rate.
larc_local_lr = tf.minimum(tf.div(larc_local_lr, lr), ones)
- gradvars = [(tf.multiply(larc_local_lr[i], g), v)
- if g is not None else (None, v)
- for i, (g, v) in enumerate(gradvars)]
+ gradvars = [
+ (tf.multiply(larc_local_lr[i], g), v) if g is not None else (None, v)
+ for i, (g, v) in enumerate(gradvars)
+ ]
return self._optimizer.apply_gradients(gradvars, *args, **kwargs)
@@ -571,45 +655,64 @@ def get_with_default(obj, key, default_value):
return obj[key] if key in obj and obj[key] is not None else default_value
-def get_lr(lr, steps, lr_steps, warmup_it, decay_steps, global_step, lr_decay_mode,
- cdr_first_decay_ratio, cdr_t_mul, cdr_m_mul, cdr_alpha, lc_periods, lc_alpha, lc_beta):
- if lr_decay_mode == 'steps':
- learning_rate = tf.train.piecewise_constant(global_step,
- steps, lr_steps)
- elif lr_decay_mode == 'poly' or lr_decay_mode == 'poly_cycle':
- cycle = lr_decay_mode == 'poly_cycle'
- learning_rate = tf.train.polynomial_decay(lr,
- global_step - warmup_it,
- decay_steps=decay_steps - warmup_it,
- end_learning_rate=0.00001,
- power=2,
- cycle=cycle)
- elif lr_decay_mode == 'cosine_decay_restarts':
- learning_rate = tf.train.cosine_decay_restarts(lr,
- global_step - warmup_it,
- (decay_steps - warmup_it) * cdr_first_decay_ratio,
- t_mul=cdr_t_mul,
- m_mul=cdr_m_mul,
- alpha=cdr_alpha)
- elif lr_decay_mode == 'cosine':
- learning_rate = tf.train.cosine_decay(lr,
- global_step - warmup_it,
- decay_steps=decay_steps - warmup_it,
- alpha=0.0)
- elif lr_decay_mode == 'linear_cosine':
- learning_rate = tf.train.linear_cosine_decay(lr,
- global_step - warmup_it,
- decay_steps=decay_steps - warmup_it,
- num_periods=lc_periods,#0.47,
- alpha=lc_alpha,#0.0,
- beta=lc_beta)#0.00001)
+def get_lr(
+ lr,
+ steps,
+ lr_steps,
+ warmup_it,
+ decay_steps,
+ global_step,
+ lr_decay_mode,
+ cdr_first_decay_ratio,
+ cdr_t_mul,
+ cdr_m_mul,
+ cdr_alpha,
+ lc_periods,
+ lc_alpha,
+ lc_beta,
+):
+ if lr_decay_mode == "steps":
+ learning_rate = tf.train.piecewise_constant(global_step, steps, lr_steps)
+ elif lr_decay_mode == "poly" or lr_decay_mode == "poly_cycle":
+ cycle = lr_decay_mode == "poly_cycle"
+ learning_rate = tf.train.polynomial_decay(
+ lr,
+ global_step - warmup_it,
+ decay_steps=decay_steps - warmup_it,
+ end_learning_rate=0.00001,
+ power=2,
+ cycle=cycle,
+ )
+ elif lr_decay_mode == "cosine_decay_restarts":
+ learning_rate = tf.train.cosine_decay_restarts(
+ lr,
+ global_step - warmup_it,
+ (decay_steps - warmup_it) * cdr_first_decay_ratio,
+ t_mul=cdr_t_mul,
+ m_mul=cdr_m_mul,
+ alpha=cdr_alpha,
+ )
+ elif lr_decay_mode == "cosine":
+ learning_rate = tf.train.cosine_decay(
+ lr, global_step - warmup_it, decay_steps=decay_steps - warmup_it, alpha=0.0
+ )
+ elif lr_decay_mode == "linear_cosine":
+ learning_rate = tf.train.linear_cosine_decay(
+ lr,
+ global_step - warmup_it,
+ decay_steps=decay_steps - warmup_it,
+ num_periods=lc_periods, # 0.47,
+ alpha=lc_alpha, # 0.0,
+ beta=lc_beta,
+ ) # 0.00001)
else:
- raise ValueError('Invalid type of lr_decay_mode')
+ raise ValueError("Invalid type of lr_decay_mode")
return learning_rate
def warmup_decay(warmup_lr, global_step, warmup_steps, warmup_end_lr):
from tensorflow.python.ops import math_ops
+
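+    # Linear warmup: interpolate from warmup_lr to warmup_end_lr as global_step goes from 0 to warmup_steps.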
p = tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32)
diff = math_ops.subtract(warmup_end_lr, warmup_lr)
res = math_ops.add(warmup_lr, math_ops.multiply(diff, p))
@@ -618,40 +721,40 @@ def warmup_decay(warmup_lr, global_step, warmup_steps, warmup_end_lr):
def cnn_model_function(features, labels, mode, params):
labels = tf.reshape(labels, (-1,)) # Squash unnecessary unary dim
- lr = params['lr']
- lr_steps = params['lr_steps']
- steps = params['steps']
- use_larc = params['use_larc']
- leta = params['leta']
- lr_decay_mode = params['lr_decay_mode']
- decay_steps = params['decay_steps']
- cdr_first_decay_ratio = params['cdr_first_decay_ratio']
- cdr_t_mul = params['cdr_t_mul']
- cdr_m_mul = params['cdr_m_mul']
- cdr_alpha = params['cdr_alpha']
- lc_periods = params['lc_periods']
- lc_alpha = params['lc_alpha']
- lc_beta = params['lc_beta']
-
- model_name = params['model']
- num_classes = params['n_classes']
- model_dtype = get_with_default(params, 'dtype', tf.float32)
- model_format = get_with_default(params, 'format', 'channels_first')
- device = get_with_default(params, 'device', '/gpu:0')
+ lr = params["lr"]
+ lr_steps = params["lr_steps"]
+ steps = params["steps"]
+ use_larc = params["use_larc"]
+ leta = params["leta"]
+ lr_decay_mode = params["lr_decay_mode"]
+ decay_steps = params["decay_steps"]
+ cdr_first_decay_ratio = params["cdr_first_decay_ratio"]
+ cdr_t_mul = params["cdr_t_mul"]
+ cdr_m_mul = params["cdr_m_mul"]
+ cdr_alpha = params["cdr_alpha"]
+ lc_periods = params["lc_periods"]
+ lc_alpha = params["lc_alpha"]
+ lc_beta = params["lc_beta"]
+
+ model_name = params["model"]
+ num_classes = params["n_classes"]
+ model_dtype = get_with_default(params, "dtype", tf.float32)
+ model_format = get_with_default(params, "format", "channels_first")
+ device = get_with_default(params, "device", "/gpu:0")
model_func = get_model_func(model_name)
inputs = features # TODO: Should be using feature columns?
- is_training = (mode == tf.estimator.ModeKeys.TRAIN)
- momentum = params['mom']
- weight_decay = params['wdecay']
- warmup_lr = params['warmup_lr']
- warmup_it = params['warmup_it']
- loss_scale = params['loss_scale']
+ is_training = mode == tf.estimator.ModeKeys.TRAIN
+ momentum = params["mom"]
+ weight_decay = params["wdecay"]
+ warmup_lr = params["warmup_lr"]
+ warmup_it = params["warmup_it"]
+ loss_scale = params["loss_scale"]
- adv_bn_init = params['adv_bn_init']
- conv_init = params['conv_init']
+ adv_bn_init = params["adv_bn_init"]
+ conv_init = params["conv_init"]
if mode == tf.estimator.ModeKeys.TRAIN:
- with tf.device('/cpu:0'):
+ with tf.device("/cpu:0"):
preload_op, (inputs, labels) = stage([inputs, labels])
with tf.device(device):
@@ -661,73 +764,87 @@ def cnn_model_function(features, labels, mode, params):
imagenet_mean = np.array([121, 115, 100], dtype=np.float32)
imagenet_std = np.array([70, 68, 71], dtype=np.float32)
inputs = tf.subtract(inputs, imagenet_mean)
- inputs = tf.multiply(inputs, 1. / imagenet_std)
- if model_format == 'channels_first':
+ inputs = tf.multiply(inputs, 1.0 / imagenet_std)
+ if model_format == "channels_first":
inputs = tf.transpose(inputs, [0, 3, 1, 2])
- with fp32_trainable_vars(
- regularizer=tf.contrib.layers.l2_regularizer(weight_decay)):
+ with fp32_trainable_vars(regularizer=tf.contrib.layers.l2_regularizer(weight_decay)):
top_layer = model_func(
- inputs, data_format=model_format, training=is_training,
- conv_initializer=conv_init, adv_bn_init=adv_bn_init)
- logits = tf.layers.dense(top_layer, num_classes,
- kernel_initializer=tf.random_normal_initializer(stddev=0.01))
+ inputs,
+ data_format=model_format,
+ training=is_training,
+ conv_initializer=conv_init,
+ adv_bn_init=adv_bn_init,
+ )
+ logits = tf.layers.dense(
+ top_layer, num_classes, kernel_initializer=tf.random_normal_initializer(stddev=0.01)
+ )
predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32)
logits = tf.cast(logits, tf.float32)
if mode == tf.estimator.ModeKeys.PREDICT:
            probabilities = tf.nn.softmax(logits)
predictions = {
- 'class_ids': predicted_classes[:, None],
- 'probabilities': probabilities,
- 'logits': logits
+ "class_ids": predicted_classes[:, None],
+ "probabilities": probabilities,
+ "logits": logits,
}
return tf.estimator.EstimatorSpec(mode, predictions=predictions)
- loss = tf.losses.sparse_softmax_cross_entropy(
- logits=logits, labels=labels)
- loss = tf.identity(loss, name='loss') # For access by logger (TODO: Better way to access it?)
+ loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)
+ loss = tf.identity(
+ loss, name="loss"
+ ) # For access by logger (TODO: Better way to access it?)
if mode == tf.estimator.ModeKeys.EVAL:
with tf.device(None): # Allow fallback to CPU if no GPU support for these ops
- accuracy = tf.metrics.accuracy(
- labels=labels, predictions=predicted_classes)
- top5acc = tf.metrics.mean(
- tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32))
+ accuracy = tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
+ top5acc = tf.metrics.mean(tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32))
newaccuracy = (hvd.allreduce(accuracy[0]), accuracy[1])
newtop5acc = (hvd.allreduce(top5acc[0]), top5acc[1])
- metrics = {'val-top1acc': newaccuracy, 'val-top5acc': newtop5acc}
- return tf.estimator.EstimatorSpec(
- mode, loss=loss, eval_metric_ops=metrics)
+ metrics = {"val-top1acc": newaccuracy, "val-top5acc": newtop5acc}
+ return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
- assert (mode == tf.estimator.ModeKeys.TRAIN)
+ assert mode == tf.estimator.ModeKeys.TRAIN
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
- total_loss = tf.add_n([loss] + reg_losses, name='total_loss')
+ total_loss = tf.add_n([loss] + reg_losses, name="total_loss")
batch_size = tf.shape(inputs)[0]
global_step = tf.train.get_global_step()
- with tf.device('/cpu:0'): # Allow fallback to CPU if no GPU support for these ops
- learning_rate = tf.cond(global_step < warmup_it,
- lambda: warmup_decay(warmup_lr, global_step, warmup_it,
- lr),
- lambda: get_lr(lr, steps, lr_steps, warmup_it, decay_steps, global_step,
- lr_decay_mode,
- cdr_first_decay_ratio, cdr_t_mul, cdr_m_mul, cdr_alpha,
- lc_periods, lc_alpha, lc_beta))
- learning_rate = tf.identity(learning_rate, 'learning_rate')
- tf.summary.scalar('learning_rate', learning_rate)
-
- opt = tf.train.MomentumOptimizer(
- learning_rate, momentum, use_nesterov=True)
+ with tf.device("/cpu:0"): # Allow fallback to CPU if no GPU support for these ops
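+        # Warm up linearly for the first warmup_it steps, then follow the configured lr_decay_mode schedule.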
+ learning_rate = tf.cond(
+ global_step < warmup_it,
+ lambda: warmup_decay(warmup_lr, global_step, warmup_it, lr),
+ lambda: get_lr(
+ lr,
+ steps,
+ lr_steps,
+ warmup_it,
+ decay_steps,
+ global_step,
+ lr_decay_mode,
+ cdr_first_decay_ratio,
+ cdr_t_mul,
+ cdr_m_mul,
+ cdr_alpha,
+ lc_periods,
+ lc_alpha,
+ lc_beta,
+ ),
+ )
+ learning_rate = tf.identity(learning_rate, "learning_rate")
+ tf.summary.scalar("learning_rate", learning_rate)
+
+ opt = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
opt = hvd.DistributedOptimizer(opt)
if use_larc:
opt = LarcOptimizer(opt, learning_rate, leta, clip=True)
opt = MixedPrecisionOptimizer(opt, scale=loss_scale)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []
with tf.control_dependencies(update_ops):
- gate_gradients = (tf.train.Optimizer.GATE_NONE)
+ gate_gradients = tf.train.Optimizer.GATE_NONE
train_op = opt.minimize(
- total_loss, global_step=tf.train.get_global_step(),
- gate_gradients=gate_gradients)
+ total_loss, global_step=tf.train.get_global_step(), gate_gradients=gate_gradients
+ )
train_op = tf.group(preload_op, gpucopy_op, train_op) # , update_ops)
return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
@@ -741,158 +858,234 @@ def count_records(tf_record_filename):
return count
nfile = len(filenames)
- return (count_records(filenames[0]) * (nfile - 1) +
- count_records(filenames[-1]))
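+    # Assumes every shard except the last contains the same number of records as the first.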
+ return count_records(filenames[0]) * (nfile - 1) + count_records(filenames[-1])
def add_bool_argument(cmdline, shortname, longname=None, default=False, help=None):
if longname is None:
shortname, longname = None, shortname
elif default == True:
- raise ValueError("""Boolean arguments that are True by default should not have short names.""")
+ raise ValueError(
+ """Boolean arguments that are True by default should not have short names."""
+ )
name = longname[2:]
feature_parser = cmdline.add_mutually_exclusive_group(required=False)
if shortname is not None:
- feature_parser.add_argument(shortname, '--' + name, dest=name, action='store_true', help=help, default=default)
+ feature_parser.add_argument(
+ shortname, "--" + name, dest=name, action="store_true", help=help, default=default
+ )
else:
- feature_parser.add_argument('--' + name, dest=name, action='store_true', help=help, default=default)
- feature_parser.add_argument('--no' + name, dest=name, action='store_false')
+ feature_parser.add_argument(
+ "--" + name, dest=name, action="store_true", help=help, default=default
+ )
+ feature_parser.add_argument("--no" + name, dest=name, action="store_false")
return cmdline
def add_cli_args():
- cmdline = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ cmdline = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Basic options
- cmdline.add_argument('-m', '--model', default='resnet50',
- help="""Name of model to run: resnet[18,34,50,101,152]""")
- cmdline.add_argument('--data_dir',
- help="""Path to dataset in TFRecord format
+ cmdline.add_argument(
+ "-m",
+ "--model",
+ default="resnet50",
+ help="""Name of model to run: resnet[18,34,50,101,152]""",
+ )
+ cmdline.add_argument(
+ "--data_dir",
+ help="""Path to dataset in TFRecord format
(aka Example protobufs). Files should be
- named 'train-*' and 'validation-*'.""")
- add_bool_argument(cmdline, '--synthetic', help="""Whether to use synthetic data for training""")
- cmdline.add_argument('-b', '--batch_size', default=256, type=int,
- help="""Size of each minibatch per GPU""")
- cmdline.add_argument('--num_batches', type=int,
- help="""Number of batches to run.
- Ignored during eval or if num epochs given""")
- cmdline.add_argument('--num_epochs', type=int,
- help="""Number of epochs to run.
- Overrides --num_batches. Ignored during eval.""")
- cmdline.add_argument('--log_dir', default='imagenet_resnet',
- help="""Directory in which to write training
+ named 'train-*' and 'validation-*'.""",
+ )
+ add_bool_argument(cmdline, "--synthetic", help="""Whether to use synthetic data for training""")
+ cmdline.add_argument(
+ "-b", "--batch_size", default=256, type=int, help="""Size of each minibatch per GPU"""
+ )
+ cmdline.add_argument(
+ "--num_batches",
+ type=int,
+ help="""Number of batches to run.
+ Ignored during eval or if num epochs given""",
+ )
+ cmdline.add_argument(
+ "--num_epochs",
+ type=int,
+ help="""Number of epochs to run.
+ Overrides --num_batches. Ignored during eval.""",
+ )
+ cmdline.add_argument(
+ "--log_dir",
+ default="imagenet_resnet",
+ help="""Directory in which to write training
summaries and checkpoints. If the log directory already
contains some checkpoints, it tries to resume training
from the last saved checkpoint. Pass --clear_log if you
- want to clear all checkpoints and start a fresh run""")
- add_bool_argument(cmdline, '--clear_log', default=False,
- help="""Clear the log folder passed so a fresh run can be started""")
- cmdline.add_argument('--log_name', type=str, default='hvd_train.log')
- add_bool_argument(cmdline, '--local_ckpt',
- help="""Performs local checkpoints (i.e. one per node)""")
- cmdline.add_argument('--display_every', default=50, type=int,
- help="""How often (in iterations) to print out
- running information.""")
- add_bool_argument(cmdline, '--eval',
- help="""Evaluate the top-1 and top-5 accuracy of
+ want to clear all checkpoints and start a fresh run""",
+ )
+ add_bool_argument(
+ cmdline,
+ "--clear_log",
+ default=False,
+ help="""Clear the log folder passed so a fresh run can be started""",
+ )
+ cmdline.add_argument("--log_name", type=str, default="hvd_train.log")
+ add_bool_argument(
+ cmdline, "--local_ckpt", help="""Performs local checkpoints (i.e. one per node)"""
+ )
+ cmdline.add_argument(
+ "--display_every",
+ default=50,
+ type=int,
+ help="""How often (in iterations) to print out
+ running information.""",
+ )
+ add_bool_argument(
+ cmdline,
+ "--eval",
+ help="""Evaluate the top-1 and top-5 accuracy of
 the latest checkpointed model. If you want to evaluate using multiple GPUs, ensure that
 all processes have access to all checkpoints, either because the checkpoints
 were saved using --local_ckpt or because they were saved to a shared directory which all processes
- can access.""")
- cmdline.add_argument('--eval_interval', type=int,
- help="""Evaluate accuracy per eval_interval number of epochs""")
- add_bool_argument(cmdline, '--fp16', default=True,
- help="""Train using float16 (half) precision instead
- of float32.""")
- cmdline.add_argument('--num_gpus', default=1, type=int,
- help="""Specify total number of GPUS used to train a checkpointed model during eval.
- Used only to calculate epoch number to print during evaluation""")
-
- cmdline.add_argument('--save_checkpoints_steps', type=int, default=1000)
- cmdline.add_argument('--save_summary_steps', type=int, default=0)
- add_bool_argument(cmdline, '--adv_bn_init', default=True,
- help="""init gamme of the last BN of each ResMod at 0.""")
- add_bool_argument(cmdline, '--adv_conv_init', default=True,
- help="""init conv with MSRA initializer""")
-
- cmdline.add_argument('--lr', type=float,
- help="""Start learning rate""")
- cmdline.add_argument('--mom', default=0.90, type=float,
- help="""Momentum""")
- cmdline.add_argument('--wdecay', default=0.0001, type=float,
- help="""Weight decay""")
- cmdline.add_argument('--loss_scale', default=1024., type=float,
- help="""loss scale""")
- cmdline.add_argument('--warmup_lr', default=0.001, type=float,
- help="""Warmup starting from this learning rate""")
- cmdline.add_argument('--warmup_epochs', default=0, type=int,
- help="""Number of epochs in which to warmup to given lr""")
- cmdline.add_argument('--lr_decay_steps', default='30,60,80', type=str,
- help="""epoch numbers at which lr is decayed by lr_decay_lrs.
- Used when lr_decay_mode is steps""")
- cmdline.add_argument('--lr_decay_lrs', default='', type=str,
- help="""learning rates at specific epochs""")
- cmdline.add_argument('--lr_decay_mode', default='poly',
- help="""Takes either `steps` (decay by a factor at specified steps)
- or `poly`(polynomial_decay with degree 2)""")
-
- add_bool_argument(cmdline, '--use_larc', default=False,
- help="""Use Layer wise Adaptive Rate Control which helps convergence at really large batch sizes""")
- cmdline.add_argument('--leta', default=0.013, type=float,
- help="""The trust coefficient for LARC optimization, LARC Eta""")
-
- cmdline.add_argument('--cdr_first_decay_ratio', default=0.33, type=float,
- help="""Cosine Decay Restart First Deacy Steps ratio""")
- cmdline.add_argument('--cdr_t_mul', default=2.0, type=float,
- help="""Cosine Decay Restart t_mul""")
- cmdline.add_argument('--cdr_m_mul', default=0.1, type=float,
- help="""Cosine Decay Restart m_mul""")
- cmdline.add_argument('--cdr_alpha', default=0.0, type=float,
- help="""Cosine Decay Restart alpha""")
- cmdline.add_argument('--lc_periods', default=0.47, type=float,
- help="""Linear Cosine num of periods""")
- cmdline.add_argument('--lc_alpha', default=0.0, type=float,
- help="""linear Cosine alpha""")
- cmdline.add_argument('--lc_beta', default=0.00001, type=float,
- help="""Liner Cosine Beta""")
-
- add_bool_argument(cmdline, '--increased_aug', default=False,
- help="""Increase augmentations helpful when training with large number of GPUs such as 128 or 256""")
- cmdline.add_argument('--contrast', default=0.6, type=float,
- help="""contrast factor""")
- cmdline.add_argument('--saturation', default=0.6, type=float,
- help="""saturation factor""")
- cmdline.add_argument('--hue', default=0.13, type=float,
- help="""hue max delta factor, hue delta = hue * math.pi""")
- cmdline.add_argument('--brightness', default=0.3, type=float,
- help="""Brightness factor""")
+ can access.""",
+ )
+ cmdline.add_argument(
+        "--eval_interval", type=int, help="""Evaluate accuracy every eval_interval epochs"""
+ )
+ add_bool_argument(
+ cmdline,
+ "--fp16",
+ default=True,
+ help="""Train using float16 (half) precision instead
+ of float32.""",
+ )
+ cmdline.add_argument(
+ "--num_gpus",
+ default=1,
+ type=int,
+        help="""Specify the total number of GPUs used to train a checkpointed model during eval.
+ Used only to calculate epoch number to print during evaluation""",
+ )
+
+ cmdline.add_argument("--save_checkpoints_steps", type=int, default=1000)
+ cmdline.add_argument("--save_summary_steps", type=int, default=0)
+ add_bool_argument(
+ cmdline,
+ "--adv_bn_init",
+ default=True,
+        help="""Init gamma of the last BN of each ResMod at 0.""",
+ )
+ add_bool_argument(
+        cmdline, "--adv_conv_init", default=True, help="""Init conv layers with the MSRA initializer"""
+ )
+
+ cmdline.add_argument("--lr", type=float, help="""Start learning rate""")
+ cmdline.add_argument("--mom", default=0.90, type=float, help="""Momentum""")
+ cmdline.add_argument("--wdecay", default=0.0001, type=float, help="""Weight decay""")
+ cmdline.add_argument("--loss_scale", default=1024.0, type=float, help="""loss scale""")
+ cmdline.add_argument(
+ "--warmup_lr", default=0.001, type=float, help="""Warmup starting from this learning rate"""
+ )
+ cmdline.add_argument(
+ "--warmup_epochs",
+ default=0,
+ type=int,
+ help="""Number of epochs in which to warmup to given lr""",
+ )
+ cmdline.add_argument(
+ "--lr_decay_steps",
+ default="30,60,80",
+ type=str,
+ help="""epoch numbers at which lr is decayed by lr_decay_lrs.
+ Used when lr_decay_mode is steps""",
+ )
+ cmdline.add_argument(
+ "--lr_decay_lrs", default="", type=str, help="""learning rates at specific epochs"""
+ )
+ cmdline.add_argument(
+ "--lr_decay_mode",
+ default="poly",
+ help="""Takes either `steps` (decay by a factor at specified steps)
+        or `poly` (polynomial_decay with degree 2)""",
+ )
+
+ add_bool_argument(
+ cmdline,
+ "--use_larc",
+ default=False,
+        help="""Use Layer-wise Adaptive Rate Control, which helps convergence at very large batch sizes""",
+ )
+ cmdline.add_argument(
+ "--leta",
+ default=0.013,
+ type=float,
+ help="""The trust coefficient for LARC optimization, LARC Eta""",
+ )
+
+ cmdline.add_argument(
+ "--cdr_first_decay_ratio",
+ default=0.33,
+ type=float,
+        help="""Cosine Decay Restart First Decay Steps ratio""",
+ )
+ cmdline.add_argument(
+ "--cdr_t_mul", default=2.0, type=float, help="""Cosine Decay Restart t_mul"""
+ )
+ cmdline.add_argument(
+ "--cdr_m_mul", default=0.1, type=float, help="""Cosine Decay Restart m_mul"""
+ )
+ cmdline.add_argument(
+ "--cdr_alpha", default=0.0, type=float, help="""Cosine Decay Restart alpha"""
+ )
+ cmdline.add_argument(
+ "--lc_periods", default=0.47, type=float, help="""Linear Cosine num of periods"""
+ )
+    cmdline.add_argument("--lc_alpha", default=0.0, type=float, help="""Linear Cosine alpha""")
+    cmdline.add_argument("--lc_beta", default=0.00001, type=float, help="""Linear Cosine beta""")
+
+ add_bool_argument(
+ cmdline,
+ "--increased_aug",
+ default=False,
+        help="""Increase augmentation; helpful when training with a large number of GPUs, such as 128 or 256""",
+ )
+ cmdline.add_argument("--contrast", default=0.6, type=float, help="""contrast factor""")
+ cmdline.add_argument("--saturation", default=0.6, type=float, help="""saturation factor""")
+ cmdline.add_argument(
+ "--hue",
+ default=0.13,
+ type=float,
+ help="""hue max delta factor, hue delta = hue * math.pi""",
+ )
+ cmdline.add_argument("--brightness", default=0.3, type=float, help="""Brightness factor""")
return cmdline
def sort_and_load_ckpts(log_dir):
ckpts = []
for f in os.listdir(log_dir):
- m = re.match(r'model.ckpt-([0-9]+).index', f)
+ m = re.match(r"model.ckpt-([0-9]+).index", f)
if m is None:
continue
fullpath = os.path.join(log_dir, f)
- ckpts.append({'step': int(m.group(1)),
- 'path': os.path.splitext(fullpath)[0],
- 'mtime': os.stat(fullpath).st_mtime,
- })
- ckpts.sort(key=itemgetter('step'))
+ ckpts.append(
+ {
+ "step": int(m.group(1)),
+ "path": os.path.splitext(fullpath)[0],
+ "mtime": os.stat(fullpath).st_mtime,
+ }
+ )
+ ckpts.sort(key=itemgetter("step"))
return ckpts
def main():
gpu_thread_count = 2
- os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
- os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count)
- os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
- os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
+ os.environ["TF_GPU_THREAD_MODE"] = "gpu_private"
+ os.environ["TF_GPU_THREAD_COUNT"] = str(gpu_thread_count)
+ os.environ["TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT"] = "1"
+ os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1"
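+    # The settings above are TF performance knobs: dedicated GPU host threads,
+    # persistent cuDNN batch-norm kernels, and non-fused Winograd convolutions.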
hvd.init()
-
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())
config.gpu_options.force_gpu_compatible = True # Force pinned memory
@@ -914,7 +1107,7 @@ def main():
FLAGS.log_dir = None if FLAGS.log_dir == "" else FLAGS.log_dir
if FLAGS.eval:
- FLAGS.log_name = 'eval_' + FLAGS.log_name
+ FLAGS.log_name = "eval_" + FLAGS.log_name
if hvd.rank() != 0:
return
if FLAGS.local_ckpt:
@@ -930,7 +1123,7 @@ def main():
os.makedirs(FLAGS.log_dir)
barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32))
tf.Session(config=config).run(barrier)
-
+
logger = logging.getLogger(FLAGS.log_name)
logger.setLevel(logging.INFO) # INFO, ERROR
# file handler which logs debug messages
@@ -939,7 +1132,7 @@ def main():
ch.setLevel(logging.INFO)
# add formatter to the handlers
# formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- formatter = logging.Formatter('%(message)s')
+ formatter = logging.Formatter("%(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)
if not hvd.rank():
@@ -948,23 +1141,25 @@ def main():
fh.setFormatter(formatter)
# add handlers to logger
logger.addHandler(fh)
-
+
height, width = 224, 224
global_batch_size = FLAGS.batch_size * hvd.size()
- rank0log(logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__))
+    rank0log(logger, "PY " + str(sys.version) + " TF " + str(tf.__version__))
rank0log(logger, "Horovod size: ", hvd.size())
if FLAGS.data_dir:
- filename_pattern = os.path.join(FLAGS.data_dir, '%s-*')
- train_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train'))
- eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
+ filename_pattern = os.path.join(FLAGS.data_dir, "%s-*")
+ train_filenames = sorted(tf.gfile.Glob(filename_pattern % "train"))
+ eval_filenames = sorted(tf.gfile.Glob(filename_pattern % "validation"))
num_training_samples = get_num_records(train_filenames)
rank0log(logger, "Using data from: ", FLAGS.data_dir)
if not FLAGS.eval:
- rank0log(logger, 'Found ', num_training_samples, ' training samples')
+ rank0log(logger, "Found ", num_training_samples, " training samples")
else:
if not FLAGS.synthetic:
- raise ValueError('data_dir missing. Please pass --synthetic if you want to run on synthetic data. Else please pass --data_dir')
+ raise ValueError(
+            "data_dir is missing. Pass --synthetic to run on synthetic data; otherwise, pass --data_dir."
+ )
train_filenames = eval_filenames = []
num_training_samples = 1281167
training_samples_per_rank = num_training_samples // hvd.size()
@@ -979,9 +1174,9 @@ def main():
nstep_per_epoch = num_training_samples // global_batch_size
decay_steps = nstep
- if FLAGS.lr_decay_mode == 'steps':
- steps = [int(x) * nstep_per_epoch for x in FLAGS.lr_decay_steps.split(',')]
- lr_steps = [float(x) for x in FLAGS.lr_decay_lrs.split(',')]
+ if FLAGS.lr_decay_mode == "steps":
+ steps = [int(x) * nstep_per_epoch for x in FLAGS.lr_decay_steps.split(",")]
+ lr_steps = [float(x) for x in FLAGS.lr_decay_lrs.split(",")]
else:
steps = []
lr_steps = []
@@ -997,11 +1192,11 @@ def main():
if not FLAGS.save_summary_steps:
# default to save one checkpoint per epoch
FLAGS.save_summary_steps = nstep_per_epoch
-
+
if not FLAGS.eval:
- rank0log(logger, 'Using a learning rate of ', FLAGS.lr)
- rank0log(logger, 'Checkpointing every ' + str(FLAGS.save_checkpoints_steps) + ' steps')
- rank0log(logger, 'Saving summary every ' + str(FLAGS.save_summary_steps) + ' steps')
+ rank0log(logger, "Using a learning rate of ", FLAGS.lr)
+ rank0log(logger, "Checkpointing every " + str(FLAGS.save_checkpoints_steps) + " steps")
+ rank0log(logger, "Saving summary every " + str(FLAGS.save_summary_steps) + " steps")
warmup_it = nstep_per_epoch * FLAGS.warmup_epochs
@@ -1009,62 +1204,74 @@ def main():
model_fn=cnn_model_function,
model_dir=FLAGS.log_dir,
params={
- 'model': FLAGS.model,
- 'decay_steps': decay_steps,
- 'n_classes': 1000,
- 'dtype': tf.float16 if FLAGS.fp16 else tf.float32,
- 'format': 'channels_first',
- 'device': '/gpu:0',
- 'lr': FLAGS.lr,
- 'mom': FLAGS.mom,
- 'wdecay': FLAGS.wdecay,
- 'use_larc': FLAGS.use_larc,
- 'leta': FLAGS.leta,
- 'steps': steps,
- 'lr_steps': lr_steps,
- 'lr_decay_mode': FLAGS.lr_decay_mode,
- 'warmup_it': warmup_it,
- 'warmup_lr': FLAGS.warmup_lr,
- 'cdr_first_decay_ratio': FLAGS.cdr_first_decay_ratio,
- 'cdr_t_mul': FLAGS.cdr_t_mul,
- 'cdr_m_mul': FLAGS.cdr_m_mul,
- 'cdr_alpha': FLAGS.cdr_alpha,
- 'lc_periods': FLAGS.lc_periods,
- 'lc_alpha': FLAGS.lc_alpha,
- 'lc_beta': FLAGS.lc_beta,
- 'loss_scale': FLAGS.loss_scale,
- 'adv_bn_init': FLAGS.adv_bn_init,
- 'conv_init': tf.variance_scaling_initializer() if FLAGS.adv_conv_init else None
+ "model": FLAGS.model,
+ "decay_steps": decay_steps,
+ "n_classes": 1000,
+ "dtype": tf.float16 if FLAGS.fp16 else tf.float32,
+ "format": "channels_first",
+ "device": "/gpu:0",
+ "lr": FLAGS.lr,
+ "mom": FLAGS.mom,
+ "wdecay": FLAGS.wdecay,
+ "use_larc": FLAGS.use_larc,
+ "leta": FLAGS.leta,
+ "steps": steps,
+ "lr_steps": lr_steps,
+ "lr_decay_mode": FLAGS.lr_decay_mode,
+ "warmup_it": warmup_it,
+ "warmup_lr": FLAGS.warmup_lr,
+ "cdr_first_decay_ratio": FLAGS.cdr_first_decay_ratio,
+ "cdr_t_mul": FLAGS.cdr_t_mul,
+ "cdr_m_mul": FLAGS.cdr_m_mul,
+ "cdr_alpha": FLAGS.cdr_alpha,
+ "lc_periods": FLAGS.lc_periods,
+ "lc_alpha": FLAGS.lc_alpha,
+ "lc_beta": FLAGS.lc_beta,
+ "loss_scale": FLAGS.loss_scale,
+ "adv_bn_init": FLAGS.adv_bn_init,
+ "conv_init": tf.variance_scaling_initializer() if FLAGS.adv_conv_init else None,
},
config=tf.estimator.RunConfig(
# tf_random_seed=31 * (1 + hvd.rank()),
session_config=config,
save_summary_steps=FLAGS.save_summary_steps if do_checkpoint else None,
save_checkpoints_steps=FLAGS.save_checkpoints_steps if do_checkpoint else None,
- keep_checkpoint_max=None))
+ keep_checkpoint_max=None,
+ ),
+ )
if not FLAGS.eval:
num_preproc_threads = 5
rank0log(logger, "Using preprocessing threads per GPU: ", num_preproc_threads)
- training_hooks = [hvd.BroadcastGlobalVariablesHook(0),
- PrefillStagingAreasHook()]
+ training_hooks = [hvd.BroadcastGlobalVariablesHook(0), PrefillStagingAreasHook()]
if hvd.rank() == 0:
training_hooks.append(
- LogSessionRunHook(global_batch_size,
- num_training_samples,
- FLAGS.display_every, logger))
+ LogSessionRunHook(
+ global_batch_size, num_training_samples, FLAGS.display_every, logger
+ )
+ )
try:
start_time = time.time()
classifier.train(
input_fn=lambda: make_dataset(
train_filenames,
training_samples_per_rank,
- FLAGS.batch_size, height, width,
- FLAGS.brightness, FLAGS.contrast, FLAGS.saturation, FLAGS.hue,
- training=True, num_threads=num_preproc_threads,
- shard=True, synthetic=FLAGS.synthetic, increased_aug=FLAGS.increased_aug),
+ FLAGS.batch_size,
+ height,
+ width,
+ FLAGS.brightness,
+ FLAGS.contrast,
+ FLAGS.saturation,
+ FLAGS.hue,
+ training=True,
+ num_threads=num_preproc_threads,
+ shard=True,
+ synthetic=FLAGS.synthetic,
+ increased_aug=FLAGS.increased_aug,
+ ),
max_steps=nstep,
- hooks=training_hooks)
+ hooks=training_hooks,
+ )
rank0log(logger, "Finished in ", time.time() - start_time)
except KeyboardInterrupt:
print("Keyboard interrupt")
@@ -1075,45 +1282,62 @@ def main():
tf.Session(config=config).run(barrier)
time.sleep(5) # a little extra margin...
if FLAGS.num_gpus == 1:
- rank0log(logger, """If you are evaluating checkpoints of a multi-GPU run on a single GPU,
+ rank0log(
+ logger,
+ """If you are evaluating checkpoints of a multi-GPU run on a single GPU,
ensure you set --num_gpus to the number of GPUs it was trained on.
- This will ensure that the epoch number is accurately displayed in the below logs.""")
+            This will ensure that the epoch number is displayed accurately in the logs below.""",
+ )
try:
ckpts = sort_and_load_ckpts(FLAGS.log_dir)
for i, c in enumerate(ckpts):
if i < len(ckpts) - 1:
- if (not FLAGS.eval_interval) or \
- (i % FLAGS.eval_interval != 0):
+ if (not FLAGS.eval_interval) or (i % FLAGS.eval_interval != 0):
continue
eval_result = classifier.evaluate(
input_fn=lambda: make_dataset(
eval_filenames,
- get_num_records(eval_filenames), FLAGS.batch_size,
- height, width,
- FLAGS.brightness, FLAGS.contrast, FLAGS.saturation, FLAGS.hue,
- training=False, shard=True, increased_aug=False),
- checkpoint_path=c['path'])
- c['epoch'] = math.ceil(c['step'] / (num_training_samples / (FLAGS.batch_size * FLAGS.num_gpus)))
- c['top1'] = eval_result['val-top1acc']
- c['top5'] = eval_result['val-top5acc']
- c['loss'] = eval_result['loss']
- rank0log(logger, ' step epoch top1 top5 loss checkpoint_time(UTC)')
+ get_num_records(eval_filenames),
+ FLAGS.batch_size,
+ height,
+ width,
+ FLAGS.brightness,
+ FLAGS.contrast,
+ FLAGS.saturation,
+ FLAGS.hue,
+ training=False,
+ shard=True,
+ increased_aug=False,
+ ),
+ checkpoint_path=c["path"],
+ )
+ c["epoch"] = math.ceil(
+ c["step"] / (num_training_samples / (FLAGS.batch_size * FLAGS.num_gpus))
+ )
+ c["top1"] = eval_result["val-top1acc"]
+ c["top5"] = eval_result["val-top5acc"]
+ c["loss"] = eval_result["loss"]
+ rank0log(logger, " step epoch top1 top5 loss checkpoint_time(UTC)")
barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32))
for i, c in enumerate(ckpts):
tf.Session(config=config).run(barrier)
- if 'top1' not in c:
+ if "top1" not in c:
continue
- rank0log(logger,'{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {time}'
- .format(c['step'],
- c['epoch'],
- c['top1'] * 100,
- c['top5'] * 100,
- c['loss'],
- time=time.strftime('%Y-%m-%d %H:%M:%S',
- time.localtime(c['mtime']))))
+ rank0log(
+ logger,
+ "{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {time}".format(
+ c["step"],
+ c["epoch"],
+ c["top1"] * 100,
+ c["top5"] * 100,
+ c["loss"],
+ time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(c["mtime"])),
+ ),
+ )
rank0log(logger, "Finished evaluation")
except KeyboardInterrupt:
logger.error("Keyboard interrupt")
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/benchmarks/tf_benchmarks/README.md b/benchmarks/tf_benchmarks/README.md
index e1aecba4..badee3ba 100644
--- a/benchmarks/tf_benchmarks/README.md
+++ b/benchmarks/tf_benchmarks/README.md
@@ -1,9 +1,9 @@
# TensorFlow benchmarking scripts
-This folder contains the TF training scripts https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks.
+This folder contains a copy of [TensorFlow's `tf_cnn_benchmarks.py` script](https://github.com/tensorflow/benchmarks/blob/e3bd1370ba21b02c4d34340934ffb4941977d96f/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py).
## Basic usage
-**execute_tensorflow_training.py train** uses SageMaker python sdk to start a training job.
+**execute_tensorflow_training.py train** uses the SageMaker Python SDK to start a training job.
```bash
./execute_tensorflow_training.py train --help
@@ -26,7 +26,7 @@ Options:
--help Show this message and exit.
```
-**execute_tensorflow_training.py generate_reports** generate benchmark reports.
+**execute_tensorflow_training.py generate_reports** generates benchmark reports.
## Examples:
diff --git a/benchmarks/tf_benchmarks/benchmarks b/benchmarks/tf_benchmarks/benchmarks
deleted file mode 160000
index ec056be5..00000000
--- a/benchmarks/tf_benchmarks/benchmarks
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit ec056be57f189ec96611a58e8dc5562a6d620139
diff --git a/benchmarks/tf_benchmarks/execute_tensorflow_training.py b/benchmarks/tf_benchmarks/execute_tensorflow_training.py
index b4f15304..e424638c 100755
--- a/benchmarks/tf_benchmarks/execute_tensorflow_training.py
+++ b/benchmarks/tf_benchmarks/execute_tensorflow_training.py
@@ -11,7 +11,6 @@
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
-
from __future__ import absolute_import
import argparse
@@ -26,13 +25,13 @@
dir_path = os.path.dirname(os.path.realpath(__file__))
_DEFAULT_HYPERPARAMETERS = {
- 'batch_size': 32,
- 'model': 'resnet32',
- 'num_epochs': 10,
- 'data_format': 'NHWC',
- 'summary_verbosity': 1,
- 'save_summaries_steps': 10,
- 'data_name': 'cifar10'
+ "batch_size": 32,
+ "model": "resnet32",
+ "num_epochs": 10,
+ "data_format": "NHWC",
+ "summary_verbosity": 1,
+ "save_summaries_steps": 10,
+ "data_name": "cifar10",
}
@@ -44,67 +43,73 @@ class ScriptModeTensorFlow(Framework):
create_model = TensorFlow.create_model
- def __init__(self, py_version='py3', **kwargs):
+ def __init__(self, py_version="py3", **kwargs):
super(ScriptModeTensorFlow, self).__init__(**kwargs)
self.py_version = py_version
self.image_name = None
- self.framework_version = '1.10.0'
+ self.framework_version = "1.10.0"
def get_args():
parser = argparse.ArgumentParser()
- parser.add_argument('-t', '--instance-types', nargs='+', help=' Set flag', required=True)
- parser.add_argument('-r', '--role', required=True)
- parser.add_argument('-w', '--wait', action='store_true')
- parser.add_argument('--region', default='us-west-2')
- parser.add_argument('--py-versions', nargs='+', help=' Set flag', default=['py3'])
- parser.add_argument('--checkpoint-path',
- default=os.path.join(default_bucket(), 'benchmarks', 'checkpoints'),
- help='The S3 location where the model checkpoints and tensorboard events are saved after training')
+ parser.add_argument(
+ "-t", "--instance-types", nargs="+", help=" Set flag", required=True
+ )
+ parser.add_argument("-r", "--role", required=True)
+ parser.add_argument("-w", "--wait", action="store_true")
+ parser.add_argument("--region", default="us-west-2")
+ parser.add_argument("--py-versions", nargs="+", help=" Set flag", default=["py3"])
+ parser.add_argument(
+ "--checkpoint-path",
+ default=os.path.join(default_bucket(), "benchmarks", "checkpoints"),
+ help="The S3 location where the model checkpoints and tensorboard events are saved after training",
+ )
return parser.parse_known_args()
def main(args, script_args):
for instance_type, py_version in itertools.product(args.instance_types, args.py_versions):
- base_name = '%s-%s-%s' % (py_version, instance_type[3:5], instance_type[6:])
+ base_name = "%s-%s-%s" % (py_version, instance_type[3:5], instance_type[6:])
model_dir = os.path.join(args.checkpoint_path, base_name)
job_hps = create_hyperparameters(model_dir, script_args)
- print('hyperparameters:')
+ print("hyperparameters:")
print(job_hps)
estimator = ScriptModeTensorFlow(
- entry_point='tf_cnn_benchmarks.py',
- role='SageMakerRole',
- source_dir=os.path.join(dir_path, 'tf_cnn_benchmarks'),
+ entry_point="tf_cnn_benchmarks.py",
+ role="SageMakerRole",
+ source_dir=os.path.join(dir_path, "tf_cnn_benchmarks"),
base_job_name=base_name,
train_instance_count=1,
hyperparameters=job_hps,
train_instance_type=instance_type,
)
- input_dir = 's3://sagemaker-sample-data-%s/spark/mnist/train/' % args.region
- estimator.fit({'train': input_dir}, wait=args.wait)
+ input_dir = "s3://sagemaker-sample-data-%s/spark/mnist/train/" % args.region
+ estimator.fit({"train": input_dir}, wait=args.wait)
print("To use TensorBoard, execute the following command:")
- cmd = 'S3_USE_HTTPS=0 S3_VERIFY_SSL=0 AWS_REGION=%s tensorboard --host localhost --port 6006 --logdir %s'
+ cmd = "S3_USE_HTTPS=0 S3_VERIFY_SSL=0 AWS_REGION=%s tensorboard --host localhost --port 6006 --logdir %s"
print(cmd % (args.region, args.checkpoint_path))
def create_hyperparameters(model_dir, script_args):
job_hps = _DEFAULT_HYPERPARAMETERS.copy()
- job_hps.update({'train_dir': model_dir, 'eval_dir': model_dir})
+ job_hps.update({"train_dir": model_dir, "eval_dir": model_dir})
- script_arg_keys_without_dashes = [key[2:] if key.startswith('--') else key[1:] for key in script_args[::2]]
+ script_arg_keys_without_dashes = [
+ key[2:] if key.startswith("--") else key[1:] for key in script_args[::2]
+ ]
script_arg_values = script_args[1::2]
job_hps.update(dict(zip(script_arg_keys_without_dashes, script_arg_values)))
return job_hps
-if __name__ == '__main__':
+if __name__ == "__main__":
args, script_args = get_args()
- main(args, script_args)
\ No newline at end of file
+ main(args, script_args)
diff --git a/benchmarks/tf_benchmarks/models b/benchmarks/tf_benchmarks/models
deleted file mode 160000
index bd835e57..00000000
--- a/benchmarks/tf_benchmarks/models
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit bd835e5794e0833705a645ce74d4fdf8fbac6214
diff --git a/benchmarks/tf_benchmarks/tf_cnn_benchmarks/tf_cnn_benchmarks.py b/benchmarks/tf_benchmarks/tf_cnn_benchmarks/tf_cnn_benchmarks.py
new file mode 100644
index 00000000..c24f5e77
--- /dev/null
+++ b/benchmarks/tf_benchmarks/tf_cnn_benchmarks/tf_cnn_benchmarks.py
@@ -0,0 +1,68 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+"""Benchmark script for TensorFlow.
+
+Originally copied from:
+https://github.com/tensorflow/benchmarks/blob/e3bd1370ba21b02c4d34340934ffb4941977d96f/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
+"""
+from __future__ import absolute_import, division, print_function
+
+from absl import app
+from absl import flags as absl_flags
+import tensorflow.compat.v1 as tf
+
+import benchmark_cnn
+import cnn_util
+import flags
+import mlperf
+from cnn_util import log_fn
+
+
+flags.define_flags()
+for name in flags.param_specs.keys():
+ absl_flags.declare_key_flag(name)
+
+absl_flags.DEFINE_boolean(
+ "ml_perf_compliance_logging",
+ False,
+ "Print logs required to be compliant with MLPerf. If set, must clone the "
+ "MLPerf training repo https://github.com/mlperf/training and add "
+ "https://github.com/mlperf/training/tree/master/compliance to the "
+ "PYTHONPATH",
+)
+
+
+def main(positional_arguments):
+ # Command-line arguments like '--distortions False' are equivalent to
+ # '--distortions=True False', where False is a positional argument. To prevent
+ # this from silently running with distortions, we do not allow positional
+ # arguments.
+ assert len(positional_arguments) >= 1
+ if len(positional_arguments) > 1:
+ raise ValueError("Received unknown positional arguments: %s" % positional_arguments[1:])
+
+ params = benchmark_cnn.make_params_from_flags()
+ with mlperf.mlperf_logger(absl_flags.FLAGS.ml_perf_compliance_logging, params.model):
+ params = benchmark_cnn.setup(params)
+ bench = benchmark_cnn.BenchmarkCNN(params)
+
+ tfversion = cnn_util.tensorflow_version_tuple()
+ log_fn("TensorFlow: %i.%i" % (tfversion[0], tfversion[1]))
+
+ bench.print_info()
+ bench.run()
+
+
+if __name__ == "__main__":
+ tf.disable_v2_behavior()
+ app.run(main) # Raises error on invalid flags, unlike tf.app.run()
diff --git a/buildspec-container-pr.yml b/buildspec-container-pr.yml
new file mode 100644
index 00000000..c43cb34f
--- /dev/null
+++ b/buildspec-container-pr.yml
@@ -0,0 +1,13 @@
+version: 0.2
+
+phases:
+ pre_build:
+ commands:
+ - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
+ - echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'
+
+ build:
+ commands:
+
+    - error_cmd="echo 'To make changes to the Docker files, please use the https://github.com/aws/deep-learning-containers repository.' && exit 1"
+ - execute-command-if-has-matching-changes "$error_cmd" "docker/"
diff --git a/buildspec-release.yml b/buildspec-release.yml
index e2ff7068..f2bd20c6 100644
--- a/buildspec-release.yml
+++ b/buildspec-release.yml
@@ -12,14 +12,14 @@ phases:
# run unit tests
- AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION=
- tox -e py27,py36 -- test/unit
+ tox -e py27,py36,py37 --parallel all -- test/unit
# run local integ tests
#- $(aws ecr get-login --no-include-email --region us-west-2)
- #- IGNORE_COVERAGE=- tox -e py27,py36 -- test/integ/local
+ #- IGNORE_COVERAGE=- tox -e py27,py37 -- test/integ/local
# run sagemaker integ tests
- #- IGNORE_COVERAGE=- tox -e py27,py36 -- test/integ/sagemaker
+ #- IGNORE_COVERAGE=- tox -e py27,py37 -- test/integ/sagemaker
# generate the distribution package
- python3 setup.py sdist
diff --git a/buildspec-unit.yml b/buildspec-unit.yml
deleted file mode 100644
index c3412df7..00000000
--- a/buildspec-unit.yml
+++ /dev/null
@@ -1,8 +0,0 @@
-version: 0.2
-
-phases:
- build:
- commands:
- - pip install --upgrade pip --quiet
- - pip install tox --quiet
- - tox -e ${TOX_ENVLIST} -- test/unit
\ No newline at end of file
diff --git a/buildspec.yml b/buildspec.yml
index cf0e3e16..f4c4da8a 100644
--- a/buildspec.yml
+++ b/buildspec.yml
@@ -2,9 +2,12 @@ version: 0.2
env:
variables:
- FRAMEWORK_VERSION: '1.15.0'
+ FRAMEWORK_VERSION: '1.15.2'
+ CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
+ GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
ECR_REPO: 'sagemaker-test'
GITHUB_REPO: 'sagemaker-tensorflow-container'
+ DLC_ACCOUNT: '763104351884'
SETUP_FILE: 'setup_cmds.sh'
SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]'
@@ -15,110 +18,76 @@ phases:
- ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
- PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
- PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
+ - BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
- echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'
build:
commands:
+ - TOX_PARALLEL_NO_SPINNER=1
+ - PY_COLORS=0
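+      # plain output (no tox spinner, no ANSI colors) keeps the CodeBuild logs readable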
+
# install
- - pip3 install -U -e .
- pip3 install -U -e .[test]
- # run flake8
+ # run linters
- tox -e flake8,twine
# run unit tests
- - tox -e py36,py27 test/unit
+ - tox -e py27,py36,py37 --parallel all test/unit
+
+ # define tags
+ - GENERIC_TAG="$FRAMEWORK_VERSION-tensorflow-$BUILD_ID"
+ - DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"
+ - DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"
+
+ # run local CPU integration tests (build and push the image to ECR repo)
+ - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
+ - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
+ - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
+ - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
+
+ # launch remote GPU instance
+ - prefix='ml.'
+ - instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
+ - create-key-pair
+ - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest
- # Create pip archive
- - root_dir=$(pwd)
- - build_id="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
+ # build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
- python3 setup.py sdist
- - tar_name=$(ls dist)
-
- # Find build artifacts
- - build_artifacts=$root_dir/docker/artifacts
-
- # build py2 images
-
- # prepare build context
- - build_dir="$root_dir/docker/$FRAMEWORK_VERSION/py2"
- - cp $root_dir/dist/$tar_name $build_dir
- - cp $build_artifacts/* $build_dir/
- - cd $build_dir
-
- # build cpu image
- - cpu_dockerfile="Dockerfile.cpu"
- - CPU_TAG_PY2="$FRAMEWORK_VERSION-cpu-py2-$build_id"
- - docker build -f $cpu_dockerfile -t $PREPROD_IMAGE:$CPU_TAG_PY2 .
-
- # build gpu image
- - gpu_dockerfile="Dockerfile.gpu"
- - GPU_TAG_PY2="$FRAMEWORK_VERSION-gpu-py2-$build_id"
- - docker build -f $gpu_dockerfile -t $PREPROD_IMAGE:$GPU_TAG_PY2 .
-
- # build py3 images
-
- # prepare build context
- - build_dir="$root_dir/docker/$FRAMEWORK_VERSION/py3"
- - cp $root_dir/dist/$tar_name $build_dir
- - cp $build_artifacts/* $build_dir/
- - cd $build_dir
-
- # build cpu image
- - cpu_dockerfile="Dockerfile.cpu"
- - CPU_TAG_PY3="$FRAMEWORK_VERSION-cpu-py3-$build_id"
- - docker build -f $cpu_dockerfile -t $PREPROD_IMAGE:$CPU_TAG_PY3 .
-
- # build gpu image
- - gpu_dockerfile="Dockerfile.gpu"
- - GPU_TAG_PY3="$FRAMEWORK_VERSION-gpu-py3-$build_id"
- - docker build -f $gpu_dockerfile -t $PREPROD_IMAGE:$GPU_TAG_PY3 .
-
- # push images to ecr
+ - build_dir="test/container/$FRAMEWORK_VERSION"
+ - $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
+ - docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
+ # push DLC GPU image to ECR
- $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
- - docker push $PREPROD_IMAGE:$CPU_TAG_PY2
- - docker push $PREPROD_IMAGE:$GPU_TAG_PY2
- - docker push $PREPROD_IMAGE:$CPU_TAG_PY3
- - docker push $PREPROD_IMAGE:$GPU_TAG_PY3
-
- # launch remote gpu instance
- - instance_type='p2.xlarge'
- - create-key-pair
- - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu
+ - docker push $PREPROD_IMAGE:$DLC_GPU_TAG
- # run cpu integration tests
- - py3_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor cpu"
- - py2_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor cpu"
- - execute-command-if-has-matching-changes "$py3_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
- - execute-command-if-has-matching-changes "$py2_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
-
- # run gpu integration tests
+ # run GPU local integration tests
- printf "$SETUP_CMDS" > $SETUP_FILE
- - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor gpu"
- - py3_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
- - execute-command-if-has-matching-changes "$py3_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
-
- - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor gpu"
- - py2_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
- - execute-command-if-has-matching-changes "$py2_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
-
- # run sagemaker tests
- - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY2 --py-version 2 --processor cpu"
- - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
- - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY2 --py-version 2 --processor gpu"
- - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
- - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY3 --py-version 3 --processor cpu"
- - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
- - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY3 --py-version 3 --processor gpu"
- - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml"
-
+      # no need to rebuild the image since it was already built and pushed to ECR during the CPU tests
+ - generic_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
+ - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
+ - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
+ - dlc_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
+ - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
+ - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
+
+ # run CPU sagemaker integration tests
+ - test_cmd="pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
+ - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
+ - test_cmd="pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
+ - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
+
+ # run GPU sagemaker integration tests
+ - test_cmd="pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
+ - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
+ - test_cmd="pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
+ - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
finally:
- # shut down remote gpu instance
+ # shut down remote GPU instance
- cleanup-gpu-instances
- cleanup-key-pairs
- # remove ecr image
- - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG_PY2
- - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG_PY2
- - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG_PY3
- - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG_PY3
+ # remove ECR images
+ - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG
+ - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG
+ - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG
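
The buildspec gates each test command behind execute-command-if-has-matching-changes, which runs the command only when the change set touches one of the listed paths. Below is a minimal, hypothetical sketch of that pattern in Python; the origin/master merge base and the argument handling are assumptions for illustration, not the toolkit's actual helper.

# change_gate.py - hypothetical sketch of a change-gated test runner similar in
# spirit to execute-command-if-has-matching-changes; assumes a diff against origin/master.
import fnmatch
import subprocess
import sys


def changed_files(base="origin/master"):
    out = subprocess.check_output(["git", "diff", "--name-only", base]).decode()
    return [f for f in out.splitlines() if f]


def run_if_matching_changes(command, patterns):
    files = changed_files()
    matched = any(
        fnmatch.fnmatch(f, pattern) or f.startswith(pattern.rstrip("*"))
        for f in files
        for pattern in patterns
    )
    if matched:
        sys.exit(subprocess.call(command, shell=True))
    print("No matching changes, skipping: {}".format(command))


if __name__ == "__main__":
    # usage: python change_gate.py "<test command>" "test/" "src/*.py" "setup.py" ...
    run_if_matching_changes(sys.argv[1], sys.argv[2:])
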
diff --git a/docker/1.15.2/py2/Dockerfile.cpu b/docker/1.15.2/py2/Dockerfile.cpu
new file mode 100644
index 00000000..7bb9acaa
--- /dev/null
+++ b/docker/1.15.2/py2/Dockerfile.cpu
@@ -0,0 +1,118 @@
+FROM ubuntu:18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent the docker build from getting stopped by requests for user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Set environment variables for MKL
+# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
+ENV KMP_AFFINITY=granularity=fine,compact,1,0
+ENV KMP_BLOCKTIME=1
+ENV KMP_SETTINGS=0
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of the module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/cpu/final/tensorflow-1.15.2-cp27-cp27mu-manylinux2010_x86_64.whl
+
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+ software-properties-common \
+ build-essential \
+ openssh-client \
+ openssh-server \
+ ca-certificates \
+ curl \
+ git \
+ wget \
+ vim \
+ zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
+ && tar zxf openmpi-4.0.1.tar.gz \
+ && cd openmpi-4.0.1 \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
+ && echo '#!/bin/bash' > /usr/local/bin/mpirun \
+ && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
+ && chmod a+x /usr/local/bin/mpirun
+
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
+ && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/openmpi/bin/:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ \
+ && mkdir -p /var/run/sshd \
+ && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+ && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+WORKDIR /
+
+RUN apt-get update \
+ && apt-get install -y \
+ python \
+ python-pip
+
+RUN pip --no-cache-dir install --upgrade \
+ pip \
+ setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python) /usr/local/bin/python
+
+RUN pip install --no-cache-dir -U \
+ numpy==1.16.5 \
+ scipy==1.2.2 \
+ scikit-learn==0.20.3 \
+ pandas==0.24.2 \
+ Pillow==6.2.2 \
+ h5py==2.9.0 \
+ keras_applications==1.0.8 \
+ keras_preprocessing==1.1.0 \
+ requests==2.22.0 \
+ keras==2.3.1 \
+ mpi4py==3.0.2 \
+ "cryptography>=2.3" \
+ "sagemaker-tensorflow>=1.15,<1.16" \
+ "sagemaker-tensorflow-training>=2,<3" \
+ # Install TensorFlow separately at the end to avoid overwriting the library versions pinned above
+ && pip install --force-reinstall --no-cache-dir -U \
+ ${TF_URL} \
+ && pip install --no-cache-dir -U \
+ awscli \
+ && pip install --no-cache-dir -U \
+ horovod==0.18.2
+
+ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
+
+CMD ["bin/bash"]
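
Each image exports SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main so the SageMaker training toolkit knows which entry point to invoke. As an illustration of how a module:function string of that shape resolves in Python (a sketch of the pattern only, not the sagemaker-training implementation):

# Illustrative resolution of a "module:function" string such as the value of
# SAGEMAKER_TRAINING_MODULE; this is a sketch, not sagemaker-training code.
import importlib


def resolve_entry_point(spec):
    module_name, _, function_name = spec.partition(":")
    module = importlib.import_module(module_name)
    return getattr(module, function_name)


# main = resolve_entry_point("sagemaker_tensorflow_container.training:main")
# main()  # would start the training flow defined in training.py
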
diff --git a/docker/1.15.2/py2/Dockerfile.gpu b/docker/1.15.2/py2/Dockerfile.gpu
new file mode 100644
index 00000000..35686af5
--- /dev/null
+++ b/docker/1.15.2/py2/Dockerfile.gpu
@@ -0,0 +1,160 @@
+# NVIDIA does not publish a TensorRT Runtime library for Ubuntu 18.04 with CUDA 10.1 support, so we stick with CUDA 10.0.
+# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/
+FROM nvidia/cuda:10.0-base-ubuntu18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent the docker build from getting stopped by requests for user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of the module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/gpu/final/tensorflow_gpu-1.15.2-cp27-cp27mu-manylinux2010_x86_64.whl
+
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends --allow-unauthenticated \
+ ca-certificates \
+ cuda-command-line-tools-10-0 \
+ cuda-cublas-dev-10-0 \
+ cuda-cudart-dev-10-0 \
+ cuda-cufft-dev-10-0 \
+ cuda-curand-dev-10-0 \
+ cuda-cusolver-dev-10-0 \
+ cuda-cusparse-dev-10-0 \
+ curl \
+ libcudnn7=7.5.1.10-1+cuda10.0 \
+ # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it
+ libnccl2=2.4.7-1+cuda10.0 \
+ libgomp1 \
+ libnccl-dev=2.4.7-1+cuda10.0 \
+ libfreetype6-dev \
+ libhdf5-serial-dev \
+ libpng-dev \
+ libzmq3-dev \
+ git \
+ wget \
+ vim \
+ build-essential \
+ openssh-client \
+ openssh-server \
+ zlib1g-dev \
+ # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0
+ # adds a new list which contains the libnvinfer library, so it needs another
+ # 'apt-get update' to retrieve that list before it can actually install the library.
+ # We don't install libnvinfer-dev since we don't need to build against TensorRT,
+ # and libnvinfer4 doesn't contain libnvinfer.a static library.
+ && apt-get update \
+ && apt-get install -y --no-install-recommends --allow-unauthenticated \
+ nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \
+ && apt-get update \
+ && apt-get install -y --no-install-recommends --allow-unauthenticated \
+ libnvinfer5=5.0.2-1+cuda10.0 \
+ && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
+ && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
+ && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
+ && rm -rf /var/lib/apt/lists/* \
+ && mkdir -p /var/run/sshd
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
+ && tar zxf openmpi-4.0.1.tar.gz \
+ && cd openmpi-4.0.1 \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+RUN apt-get update \
+ && apt-get install -y \
+ python \
+ python-pip
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
+ && echo '#!/bin/bash' > /usr/local/bin/mpirun \
+ && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
+ && chmod a+x /usr/local/bin/mpirun
+
+# Configure OpenMPI to run with good defaults:
+# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
+ && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+# Set default NCCL parameters
+RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/openmpi/bin/:$PATH
+ENV PATH=/usr/local/nvidia/bin:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN mkdir -p /var/run/sshd \
+ && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ \
+ && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+ && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+WORKDIR /
+
+RUN pip --no-cache-dir install --upgrade \
+ pip \
+ setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python) /usr/local/bin/python
+
+RUN pip install --no-cache-dir -U \
+ numpy==1.16.5 \
+ scipy==1.2.2 \
+ scikit-learn==0.20.3 \
+ pandas==0.24.2 \
+ Pillow==6.2.2 \
+ h5py==2.9.0 \
+ keras_applications==1.0.8 \
+ keras_preprocessing==1.1.0 \
+ requests==2.22.0 \
+ keras==2.3.1 \
+ mpi4py==3.0.2 \
+ "cryptography>=2.3" \
+ "sagemaker-tensorflow>=1.15,<1.16" \
+ "sagemaker-tensorflow-training>=2,<3" \
+ # Install TensorFlow separately at the end to avoid overwriting the library versions pinned above
+ && pip install --force-reinstall --no-cache-dir -U \
+ ${TF_URL} \
+ && pip install --no-cache-dir -U \
+ awscli
+
+# Install Horovod, temporarily using CUDA stubs
+RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \
+ && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \
+ horovod==0.18.2 \
+ && ldconfig
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \
+ && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
+ && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
+
+CMD ["bin/bash"]
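
The GPU image builds Horovod 0.18.2 with NCCL allreduce (HOROVOD_GPU_ALLREDUCE=NCCL), which is what mpirun-launched distributed training scripts in this container rely on. A minimal, hedged sketch of the usual TensorFlow 1.x pattern a user training script would follow; the optimizer and learning rate are placeholders, not part of the toolkit:

# Typical Horovod + TF 1.x skeleton a user script might use inside this container;
# values are placeholders for illustration only.
import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()                                                       # one process per GPU (launched via mpirun)
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())   # pin each process to its own GPU

opt = tf.train.AdamOptimizer(0.001 * hvd.size())                 # scale the learning rate by world size
opt = hvd.DistributedOptimizer(opt)                              # allreduce the gradients across ranks
hooks = [hvd.BroadcastGlobalVariablesHook(0)]                    # sync initial state from rank 0
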
diff --git a/docker/1.15.2/py3/Dockerfile.cpu b/docker/1.15.2/py3/Dockerfile.cpu
new file mode 100644
index 00000000..667a3edf
--- /dev/null
+++ b/docker/1.15.2/py3/Dockerfile.cpu
@@ -0,0 +1,121 @@
+FROM ubuntu:18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent the docker build from getting stopped by requests for user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Set environment variables for MKL
+# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
+ENV KMP_AFFINITY=granularity=fine,compact,1,0
+ENV KMP_BLOCKTIME=1
+ENV KMP_SETTINGS=0
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of the module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/cpu/final/tensorflow-1.15.2-cp36-cp36m-manylinux2010_x86_64.whl
+
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+ python3-dev \
+ python3-pip \
+ python3-setuptools \
+ software-properties-common \
+ build-essential \
+ openssh-client \
+ openssh-server \
+ ca-certificates \
+ curl \
+ git \
+ wget \
+ vim \
+ zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
+ && tar zxf openmpi-4.0.1.tar.gz \
+ && cd openmpi-4.0.1 \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
+ && echo '#!/bin/bash' > /usr/local/bin/mpirun \
+ && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
+ && chmod a+x /usr/local/bin/mpirun
+
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
+ && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/openmpi/bin/:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ \
+ && mkdir -p /var/run/sshd \
+ && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+ && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+WORKDIR /
+
+RUN pip3 --no-cache-dir install --upgrade \
+ pip \
+ setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python3) /usr/local/bin/python \
+ && ln -s $(which pip3) /usr/bin/pip
+
+RUN pip install --no-cache-dir -U \
+ numpy==1.17.4 \
+ scipy==1.2.2 \
+ scikit-learn==0.20.3 \
+ pandas==0.24.2 \
+ Pillow==7.0.0 \
+ h5py==2.9.0 \
+ keras_applications==1.0.8 \
+ keras_preprocessing==1.1.0 \
+ keras==2.3.1 \
+ requests==2.22.0 \
+ smdebug==0.7.2 \
+ sagemaker==1.50.17 \
+ sagemaker-experiments==0.1.7 \
+ mpi4py==3.0.2 \
+ "cryptography>=2.3" \
+ "sagemaker-tensorflow>=1.15,<1.16" \
+ "sagemaker-tensorflow-training>=2,<3" \
+ # Install TensorFlow separately at the end to avoid overwriting
+ # the library versions pinned above
+ && pip install --force-reinstall --no-cache-dir -U \
+ ${TF_URL} \
+ && pip install --force-reinstall --no-cache-dir -U \
+ horovod==0.18.2 \
+ && pip install --no-cache-dir -U \
+ awscli
+
+ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
+
+CMD ["bin/bash"]
diff --git a/docker/1.15.2/py3/Dockerfile.gpu b/docker/1.15.2/py3/Dockerfile.gpu
new file mode 100644
index 00000000..56b5df5b
--- /dev/null
+++ b/docker/1.15.2/py3/Dockerfile.gpu
@@ -0,0 +1,167 @@
+# NVIDIA does not publish a TensorRT Runtime library for Ubuntu 18.04 with CUDA 10.1 support, so we stick with CUDA 10.0.
+# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/
+FROM nvidia/cuda:10.0-base-ubuntu18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent the docker build from getting stopped by requests for user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of the module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/gpu/final/tensorflow_gpu-1.15.2-cp36-cp36m-manylinux2010_x86_64.whl
+
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends --allow-unauthenticated \
+ python3-dev \
+ python3-pip \
+ python3-setuptools \
+ ca-certificates \
+ cuda-command-line-tools-10-0 \
+ cuda-cublas-dev-10-0 \
+ cuda-cudart-dev-10-0 \
+ cuda-cufft-dev-10-0 \
+ cuda-curand-dev-10-0 \
+ cuda-cusolver-dev-10-0 \
+ cuda-cusparse-dev-10-0 \
+ curl \
+ libcudnn7=7.5.1.10-1+cuda10.0 \
+ # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it
+ libnccl2=2.4.7-1+cuda10.0 \
+ libgomp1 \
+ libnccl-dev=2.4.7-1+cuda10.0 \
+ libfreetype6-dev \
+ libhdf5-serial-dev \
+ libpng-dev \
+ libzmq3-dev \
+ git \
+ wget \
+ vim \
+ build-essential \
+ openssh-client \
+ openssh-server \
+ zlib1g-dev \
+ # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0
+ # adds a new list which contains the libnvinfer library, so it needs another
+ # 'apt-get update' to retrieve that list before it can actually install the
+ # library.
+ # We don't install libnvinfer-dev since we don't need to build against TensorRT,
+ # and libnvinfer4 doesn't contain libnvinfer.a static library.
+ && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
+ nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \
+ && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
+ libnvinfer5=5.0.2-1+cuda10.0 \
+ && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
+ && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
+ && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
+ && rm -rf /var/lib/apt/lists/* \
+ && mkdir -p /var/run/sshd
+
+###########################################################################
+# Horovod & its dependencies
+###########################################################################
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
+ && tar zxf openmpi-4.0.1.tar.gz \
+ && cd openmpi-4.0.1 \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
+ && echo '#!/bin/bash' > /usr/local/bin/mpirun \
+ && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
+ && chmod a+x /usr/local/bin/mpirun
+
+# Configure OpenMPI to run with good defaults:
+# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
+ && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+# Set default NCCL parameters
+RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/openmpi/bin/:$PATH
+ENV PATH=/usr/local/nvidia/bin:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN mkdir -p /var/run/sshd \
+ && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ \
+ && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+ && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+WORKDIR /
+
+RUN pip3 --no-cache-dir install --upgrade \
+ pip \
+ setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python3) /usr/local/bin/python \
+ && ln -s $(which pip3) /usr/bin/pip
+
+RUN pip install --no-cache-dir -U \
+ numpy==1.17.4 \
+ scipy==1.2.2 \
+ scikit-learn==0.20.3 \
+ pandas==0.24.2 \
+ Pillow==7.0.0 \
+ h5py==2.9.0 \
+ keras_applications==1.0.8 \
+ keras_preprocessing==1.1.0 \
+ requests==2.22.0 \
+ keras==2.3.1 \
+ smdebug==0.7.2 \
+ sagemaker==1.50.17 \
+ sagemaker-experiments==0.1.7 \
+ mpi4py==3.0.2 \
+ "cryptography>=2.3" \
+ "sagemaker-tensorflow>=1.15,<1.16" \
+ "sagemaker-tensorflow-training>=2,<3" \
+ # Install TensorFlow separately at the end to avoid overwriting
+ # the library versions pinned above
+ && pip install --force-reinstall --no-cache-dir -U \
+ ${TF_URL} \
+ && pip install --no-cache-dir -U \
+ awscli
+
+# Install Horovod, temporarily using CUDA stubs
+RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs \
+ && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \
+ horovod==0.18.2 \
+ && ldconfig
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \
+ && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
+ && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
+
+CMD ["bin/bash"]
diff --git a/docker/1.15.2/py37/Dockerfile.cpu b/docker/1.15.2/py37/Dockerfile.cpu
new file mode 100644
index 00000000..e46ea361
--- /dev/null
+++ b/docker/1.15.2/py37/Dockerfile.cpu
@@ -0,0 +1,138 @@
+FROM ubuntu:18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent the docker build from getting stopped by requests for user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Set environment variables for MKL
+# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
+ENV KMP_AFFINITY=granularity=fine,compact,1,0
+ENV KMP_BLOCKTIME=1
+ENV KMP_SETTINGS=0
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of the module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/cpu/final/tensorflow_cpu-1.15.2-cp37-cp37m-manylinux2010_x86_64.whl
+ARG PYTHON=python3
+ARG PYTHON_PIP=python3-pip
+ARG PIP=pip3
+ARG PYTHON_VERSION=3.7.7
+
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+ build-essential \
+ ca-certificates \
+ curl \
+ git \
+ openssh-client \
+ openssh-server \
+ vim \
+ wget \
+ zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
+ && tar zxf openmpi-4.0.1.tar.gz \
+ && cd openmpi-4.0.1 \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
+ && echo '#!/bin/bash' > /usr/local/bin/mpirun \
+ && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
+ && chmod a+x /usr/local/bin/mpirun
+
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
+ && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/openmpi/bin/:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ \
+ && mkdir -p /var/run/sshd \
+ && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+ && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+WORKDIR /
+
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+ libbz2-dev \
+ libc6-dev \
+ libffi-dev \
+ libgdbm-dev \
+ libncursesw5-dev \
+ libreadline-gplv2-dev \
+ libsqlite3-dev \
+ libssl-dev \
+ tk-dev \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
+ && tar -xvf Python-$PYTHON_VERSION.tgz \
+ && cd Python-$PYTHON_VERSION \
+ && ./configure && make && make install \
+ && rm -rf ../Python-$PYTHON_VERSION*
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+ pip \
+ setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python3) /usr/local/bin/python \
+ && ln -s $(which pip3) /usr/bin/pip
+
+RUN ${PIP} install --no-cache-dir -U \
+ numpy==1.17.4 \
+ scipy==1.2.2 \
+ scikit-learn==0.20.3 \
+ pandas==0.24.2 \
+ Pillow==7.0.0 \
+ h5py==2.10.0 \
+ requests==2.22.0 \
+ smdebug==0.7.2 \
+ sagemaker-experiments==0.1.7 \
+ mpi4py==3.0.2 \
+ "cryptography>=2.3" \
+ "sagemaker-tensorflow>=1.15,<1.16" \
+ sagemaker-tensorflow-training==10.1.0 \
+ # Install TensorFlow separately at the end to avoid overwriting
+ # the library versions pinned above
+ && ${PIP} install --force-reinstall --no-cache-dir -U \
+ ${TF_URL} \
+ && ${PIP} install --force-reinstall --no-cache-dir -U \
+ horovod==0.18.2 \
+ && ${PIP} install --no-cache-dir -U \
+ awscli
+
+ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
+
+CMD ["bin/bash"]
diff --git a/docker/1.15.2/py37/Dockerfile.gpu b/docker/1.15.2/py37/Dockerfile.gpu
new file mode 100644
index 00000000..aefc97ab
--- /dev/null
+++ b/docker/1.15.2/py37/Dockerfile.gpu
@@ -0,0 +1,184 @@
+# NVIDIA does not publish a TensorRT Runtime library for Ubuntu 18.04 with CUDA 10.1 support, so we stick with CUDA 10.0.
+# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/
+FROM nvidia/cuda:10.0-base-ubuntu18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent the docker build from getting stopped by requests for user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of the module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/gpu/final/tensorflow_gpu-1.15.2-cp37-cp37m-manylinux2010_x86_64.whl
+ARG PYTHON=python3
+ARG PYTHON_PIP=python3-pip
+ARG PIP=pip3
+ARG PYTHON_VERSION=3.7.7
+
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends --allow-unauthenticated \
+ build-essential \
+ ca-certificates \
+ cuda-command-line-tools-10-0 \
+ cuda-cublas-dev-10-0 \
+ cuda-cudart-dev-10-0 \
+ cuda-cufft-dev-10-0 \
+ cuda-curand-dev-10-0 \
+ cuda-cusolver-dev-10-0 \
+ cuda-cusparse-dev-10-0 \
+ curl \
+ libcudnn7=7.5.1.10-1+cuda10.0 \
+ # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it
+ libnccl2=2.4.7-1+cuda10.0 \
+ libgomp1 \
+ libnccl-dev=2.4.7-1+cuda10.0 \
+ libfreetype6-dev \
+ libhdf5-serial-dev \
+ libpng-dev \
+ libzmq3-dev \
+ git \
+ wget \
+ vim \
+ openssh-client \
+ openssh-server \
+ zlib1g-dev \
+ # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0
+ # adds a new list which contains the libnvinfer library, so it needs another
+ # 'apt-get update' to retrieve that list before it can actually install the
+ # library.
+ # We don't install libnvinfer-dev since we don't need to build against TensorRT,
+ # and libnvinfer4 doesn't contain libnvinfer.a static library.
+ && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
+ nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \
+ && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
+ libnvinfer5=5.0.2-1+cuda10.0 \
+ && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
+ && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
+ && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
+ && rm -rf /var/lib/apt/lists/* \
+ && mkdir -p /var/run/sshd
+
+###########################################################################
+# Horovod & its dependencies
+###########################################################################
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
+ && tar zxf openmpi-4.0.1.tar.gz \
+ && cd openmpi-4.0.1 \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
+ && echo '#!/bin/bash' > /usr/local/bin/mpirun \
+ && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
+ && chmod a+x /usr/local/bin/mpirun
+
+# Configure OpenMPI to run with good defaults:
+# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
+ && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+# Set default NCCL parameters
+RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/openmpi/bin/:$PATH
+ENV PATH=/usr/local/nvidia/bin:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN mkdir -p /var/run/sshd \
+ && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ \
+ && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+ && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+WORKDIR /
+
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+ libbz2-dev \
+ libc6-dev \
+ libffi-dev \
+ libgdbm-dev \
+ libncursesw5-dev \
+ libreadline-gplv2-dev \
+ libsqlite3-dev \
+ libssl-dev \
+ tk-dev \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
+ && tar -xvf Python-$PYTHON_VERSION.tgz \
+ && cd Python-$PYTHON_VERSION \
+ && ./configure && make && make install \
+ && rm -rf ../Python-$PYTHON_VERSION*
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+ pip \
+ setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python3) /usr/local/bin/python \
+ && ln -s $(which pip3) /usr/bin/pip
+
+RUN ${PIP} install --no-cache-dir -U \
+ numpy==1.17.4 \
+ scipy==1.2.2 \
+ scikit-learn==0.20.3 \
+ pandas==0.24.2 \
+ Pillow==7.0.0 \
+ h5py==2.10.0 \
+ requests==2.22.0 \
+ smdebug==0.7.2 \
+ sagemaker-experiments==0.1.7 \
+ mpi4py==3.0.2 \
+ "cryptography>=2.3" \
+ "sagemaker-tensorflow>=1.15,<1.16" \
+ sagemaker-tensorflow-training==10.1.0 \
+ # Install TensorFlow separately at the end to avoid overwriting
+ # the library versions pinned above
+ && ${PIP} install --force-reinstall --no-cache-dir -U \
+ ${TF_URL} \
+ && ${PIP} install --no-cache-dir -U \
+ awscli
+
+# Install Horovod, temporarily using CUDA stubs
+RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs \
+ && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \
+ horovod==0.18.2 \
+ && ldconfig
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \
+ && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
+ && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
+
+CMD ["bin/bash"]
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..4c5649dc
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+markers =
+ deploy_test
+ skip_cpu
+ skip_gpu
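
pytest.ini registers the custom markers the suite uses to gate tests by processor and deployment stage. A hypothetical example of how a test module would apply them; the test bodies are placeholders, not copied from the suite:

# Hypothetical use of the markers registered in pytest.ini.
import pytest


@pytest.mark.skip_gpu
def test_mkl_env_defaults():
    assert True  # would only be meaningful for the CPU images


@pytest.mark.deploy_test
def test_release_smoke():
    assert True  # would run as part of a deployment check
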
diff --git a/scripts/build_all.py b/scripts/build_all.py
deleted file mode 100644
index 9f340d5d..00000000
--- a/scripts/build_all.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"). You
-# may not use this file except in compliance with the License. A copy of
-# the License is located at
-#
-# http://aws.amazon.com/apache2.0/
-#
-# or in the "license" file accompanying this file. This file is
-# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific
-# language governing permissions and limitations under the License.
-from __future__ import absolute_import
-
-import argparse
-import os
-import subprocess
-
-VERSION = '1.13.1'
-REPO = 'sagemaker-tensorflow-scriptmode'
-PY2_CPU_BINARY = 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp27-cp27mu-linux_x86_64.whl' # noqa
-PY3_CPU_BINARY = 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl' # noqa
-PY2_GPU_BINARY = 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp27-cp27mu-linux_x86_64.whl' # noqa
-PY3_GPU_BINARY = 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl' # noqa
-DEV_ACCOUNT = '142577830533'
-REGION = 'us-west-2'
-
-
-def _parse_args():
-
- parser = argparse.ArgumentParser()
-
- parser.add_argument('--account', type=str, default=DEV_ACCOUNT)
- parser.add_argument('--region', type=str, default=REGION)
- parser.add_argument('--version', type=str, default=VERSION)
- parser.add_argument('--py2-cpu-binary', type=str, default=PY2_CPU_BINARY)
- parser.add_argument('--py3-cpu-binary', type=str, default=PY3_CPU_BINARY)
- parser.add_argument('--py2-gpu-binary', type=str, default=PY2_GPU_BINARY)
- parser.add_argument('--py3-gpu-binary', type=str, default=PY3_GPU_BINARY)
- parser.add_argument('--repo', type=str, default=REPO)
-
- return parser.parse_args()
-
-
-args = _parse_args()
-binaries = {
- 'py2-cpu': args.py2_cpu_binary,
- 'py3-cpu': args.py3_cpu_binary,
- 'py2-gpu': args.py2_gpu_binary,
- 'py3-gpu': args.py3_gpu_binary
-}
-build_dir = os.path.join('docker', args.version)
-
-# Run docker-login so we can pull the cached image
-login_cmd = subprocess.check_output(
- 'aws ecr get-login --no-include-email --registry-id {}'.format(args.account).split())
-print('Executing docker login command: '.format(login_cmd))
-subprocess.check_call(login_cmd.split())
-
-for arch in ['cpu', 'gpu']:
- for py_version in ['2', '3']:
-
- binary_url = binaries['py{}-{}'.format(py_version, arch)]
- binary_file = os.path.basename(binary_url)
- cmd = 'wget -O {}/{} {}'.format(build_dir, binary_file, binary_url)
- print('Downloading binary file: {}'.format(cmd))
- subprocess.check_call(cmd.split())
-
- tag = '{}-{}-py{}'.format(args.version, arch, py_version)
- prev_image_uri = '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format(args.account, args.region, args.repo, tag)
- dockerfile = os.path.join(build_dir, 'Dockerfile.{}'.format(arch))
-
- tar_file_name = subprocess.check_output('ls {}/sagemaker_tensorflow_container*'.format(build_dir),
- shell=True).strip().decode('ascii')
- print('framework_support_installable is {}'.format(os.path.basename(tar_file_name)))
-
- build_cmd = 'docker build -f {} --cache-from {} --build-arg framework_support_installable={} ' \
- '--build-arg py_version={} --build-arg framework_installable={} ' \
- '-t {}:{} {}'.format(dockerfile, prev_image_uri, os.path.basename(tar_file_name), py_version,
- binary_file, args.repo, tag, build_dir)
- print('Building docker image: {}'.format(build_cmd))
- subprocess.check_call(build_cmd.split())
-
- print('Deleting binary file {}'.format(binary_file))
- subprocess.check_call('rm {}'.format(os.path.join(build_dir, binary_file)).split())
diff --git a/scripts/publish_all.py b/scripts/publish_all.py
deleted file mode 100644
index 2c78e8a7..00000000
--- a/scripts/publish_all.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"). You
-# may not use this file except in compliance with the License. A copy of
-# the License is located at
-#
-# http://aws.amazon.com/apache2.0/
-#
-# or in the "license" file accompanying this file. This file is
-# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific
-# language governing permissions and limitations under the License.
-from __future__ import absolute_import
-
-import argparse
-import subprocess
-
-DEV_ACCOUNT = '142577830533'
-VERSION = '1.13.1'
-REGION = 'us-west-2'
-REPO = 'sagemaker-tensorflow-scriptmode'
-
-
-def _parse_args():
-
- parser = argparse.ArgumentParser()
-
- parser.add_argument('--account', type=str, default=DEV_ACCOUNT)
- parser.add_argument('--version', type=str, default=VERSION)
- parser.add_argument('--repo', type=str, default=REPO)
- parser.add_argument('--region', type=str, default=REGION)
-
- return parser.parse_args()
-
-
-args = _parse_args()
-
-for arch in ['cpu', 'gpu']:
- for py_version in ['2', '3']:
- source = '{}:{}-{}-py{}'.format(args.repo, args.version, arch, py_version)
- dest = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(args.account, args.region, source)
- tag_cmd = 'docker tag {} {}'.format(source, dest)
- print('Tagging image: {}'.format(tag_cmd))
- subprocess.check_call(tag_cmd.split())
- login_cmd = subprocess.check_output(
- 'aws ecr get-login --no-include-email --registry-id {} --region {}'
- .format(args.account, args.region).split())
- print('Executing docker login command: {}'.format(login_cmd))
- subprocess.check_call(login_cmd.split())
- push_cmd = 'docker push {}'.format(dest)
- print('Pushing image: {}'.format(push_cmd))
- subprocess.check_call(push_cmd.split())
diff --git a/setup.py b/setup.py
index 983ebd13..67cfbe56 100644
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,7 @@
import os
from os.path import basename
from os.path import splitext
+import sys
from setuptools import find_packages, setup
@@ -25,41 +26,60 @@ def read(fname):
def read_version():
- return read('VERSION').strip()
+ return read("VERSION").strip()
-setup(
- name='sagemaker_tensorflow_training',
- version=read_version(),
- description='Open source library for creating '
- 'TensorFlow containers to run on Amazon SageMaker.',
+test_dependencies = [
+ "tox",
+ "flake8",
+ "pytest",
+ "pytest-cov",
+ "pytest-xdist",
+ "mock",
+ "sagemaker==1.50.1",
+ "tensorflow<2.0",
+ "docker-compose",
+ "boto3==1.10.50",
+ "six==1.13.0",
+ "python-dateutil>=2.1,<2.8.1",
+ "botocore==1.13.50",
+ "requests-mock",
+ "awscli>=1.16.314",
+]
- packages=find_packages(where='src', exclude=('test',)),
- package_dir={'': 'src'},
- py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')],
-
- long_description=read('README.rst'),
- author='Amazon Web Services',
- url='https://github.com/aws/sagemaker-tensorflow-containers',
- license='Apache License 2.0',
+if sys.version_info.major > 2:
+ test_dependencies.append("sagemaker-experiments==0.1.7")
+setup(
+ name="sagemaker_tensorflow_training",
+ version=read_version(),
+ description="Open source library for creating "
+ "TensorFlow containers to run on Amazon SageMaker.",
+ packages=find_packages(where="src", exclude=("test",)),
+ package_dir={"": "src"},
+ py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")],
+ long_description=read("README.rst"),
+ author="Amazon Web Services",
+ url="https://github.com/aws/sagemaker-tensorflow-containers",
+ license="Apache License 2.0",
classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Natural Language :: English",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python",
- 'Programming Language :: Python :: 2.7',
- 'Programming Language :: Python :: 3.6',
+ "Programming Language :: Python :: 2.7",
+ "Programming Language :: Python :: 3.6",
+ "Programming Language :: Python :: 3.7",
],
-
- install_requires=['sagemaker-containers>=2.6.2', 'numpy', 'scipy', 'sklearn',
- 'pandas', 'Pillow', 'h5py'],
- extras_require={
- 'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock',
- 'sagemaker==1.50.1', 'tensorflow<2.0', 'docker-compose', 'boto3==1.10.50',
- 'six==1.13.0', 'python-dateutil>=2.1,<2.8.1', 'botocore==1.13.50',
- 'requests-mock', 'awscli==1.16.314'],
- 'benchmark': ['click']
- },
+ install_requires=[
+ "sagemaker-training>=4.3.0,<5.2.0",
+ "numpy",
+ "scipy",
+ "sklearn",
+ "pandas",
+ "Pillow",
+ "h5py",
+ ],
+ extras_require={"test": test_dependencies, "benchmark": ["click"]},
)
diff --git a/src/sagemaker_tensorflow_container/s3_utils.py b/src/sagemaker_tensorflow_container/s3_utils.py
index 0137ef25..15902c55 100644
--- a/src/sagemaker_tensorflow_container/s3_utils.py
+++ b/src/sagemaker_tensorflow_container/s3_utils.py
@@ -20,23 +20,23 @@
def configure(model_dir, job_region):
- os.environ['S3_REGION'] = _s3_region(job_region, model_dir)
+ os.environ["S3_REGION"] = _s3_region(job_region, model_dir)
# setting log level to WARNING
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
- os.environ['S3_USE_HTTPS'] = '1'
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
+ os.environ["S3_USE_HTTPS"] = "1"
def _s3_region(job_region, model_dir):
- if model_dir and model_dir.startswith('s3://'):
- s3 = boto3.client('s3', region_name=job_region)
+ if model_dir and model_dir.startswith("s3://"):
+ s3 = boto3.client("s3", region_name=job_region)
# We get the AWS region of the checkpoint bucket, which may be different from
# the region this container is currently running in.
parsed_url = urlparse(model_dir)
bucket_name = parsed_url.netloc
- bucket_location = s3.get_bucket_location(Bucket=bucket_name)['LocationConstraint']
+ bucket_location = s3.get_bucket_location(Bucket=bucket_name)["LocationConstraint"]
return bucket_location or job_region
else:
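
Functionally s3_utils is unchanged by the reformatting: configure() still looks up the checkpoint bucket's region and exports it for the TensorFlow S3 filesystem. A small usage sketch follows; the bucket name and region are illustrative values only.

# Illustrative call into s3_utils; bucket name and region are examples.
import os

from sagemaker_tensorflow_container import s3_utils

s3_utils.configure("s3://example-checkpoint-bucket/model", "us-west-2")
print(os.environ["S3_REGION"])     # the bucket's region, falling back to the job region
print(os.environ["S3_USE_HTTPS"])  # "1"
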
diff --git a/src/sagemaker_tensorflow_container/training.py b/src/sagemaker_tensorflow_container/training.py
index bce6a69c..f71db52a 100644
--- a/src/sagemaker_tensorflow_container/training.py
+++ b/src/sagemaker_tensorflow_container/training.py
@@ -19,15 +19,15 @@
import subprocess
import time
-import sagemaker_containers.beta.framework as framework
+from sagemaker_training import entry_point, environment, mapping, runner
import tensorflow as tf
from sagemaker_tensorflow_container import s3_utils
logger = logging.getLogger(__name__)
-SAGEMAKER_PARAMETER_SERVER_ENABLED = 'sagemaker_parameter_server_enabled'
-MODEL_DIR = '/opt/ml/model'
+SAGEMAKER_PARAMETER_SERVER_ENABLED = "sagemaker_parameter_server_enabled"
+MODEL_DIR = "/opt/ml/model"
def _is_host_master(hosts, current_host):
@@ -56,50 +56,46 @@ def _build_tf_config(hosts, current_host, ps_task=False):
ps = hosts if len(hosts) > 1 else None
def host_addresses(hosts, port=2222):
- return ['{}:{}'.format(host, port) for host in hosts]
+ return ["{}:{}".format(host, port) for host in hosts]
- tf_config = {
- 'cluster': {
- 'master': host_addresses(masters)
- },
- 'environment': 'cloud'
- }
+ tf_config = {"cluster": {"master": host_addresses(masters)}, "environment": "cloud"}
if ps:
- tf_config['cluster']['ps'] = host_addresses(ps, port='2223')
+ tf_config["cluster"]["ps"] = host_addresses(ps, port="2223")
if workers:
- tf_config['cluster']['worker'] = host_addresses(workers)
+ tf_config["cluster"]["worker"] = host_addresses(workers)
if ps_task:
if ps is None:
raise ValueError(
- 'Cannot have a ps task if there are no parameter servers in the cluster')
- task_type = 'ps'
+ "Cannot have a ps task if there are no parameter servers in the cluster"
+ )
+ task_type = "ps"
task_index = ps.index(current_host)
elif _is_host_master(hosts, current_host):
- task_type = 'master'
+ task_type = "master"
task_index = 0
else:
- task_type = 'worker'
+ task_type = "worker"
task_index = workers.index(current_host)
- tf_config['task'] = {'index': task_index, 'type': task_type}
+ tf_config["task"] = {"index": task_index, "type": task_type}
return tf_config
def _run_ps(env, cluster):
- logger.info('Running distributed training job with parameter servers')
+ logger.info("Running distributed training job with parameter servers")
cluster_spec = tf.train.ClusterSpec(cluster)
task_index = env.hosts.index(env.current_host)
# Force parameter server to run on cpu. Running multiple TensorFlow processes on the same
# GPU is not safe:
# https://stackoverflow.com/questions/46145100/is-it-unsafe-to-run-multiple-tensorflow-processes-on-the-same-gpu
- no_gpu_config = tf.ConfigProto(device_count={'GPU': 0})
+ no_gpu_config = tf.ConfigProto(device_count={"GPU": 0})
server = tf.train.Server(
- cluster_spec, job_name='ps', task_index=task_index, config=no_gpu_config
+ cluster_spec, job_name="ps", task_index=task_index, config=no_gpu_config
)
multiprocessing.Process(target=lambda: server.join()).start()
@@ -107,20 +103,27 @@ def _run_ps(env, cluster):
def _run_worker(env, cmd_args, tf_config):
env_vars = env.to_env_vars()
- env_vars['TF_CONFIG'] = json.dumps(tf_config)
-
- framework.entry_point.run(env.module_dir, env.user_entry_point, cmd_args, env_vars)
+ env_vars["TF_CONFIG"] = json.dumps(tf_config)
+
+ entry_point.run(
+ uri=env.module_dir,
+ user_entry_point=env.user_entry_point,
+ args=cmd_args,
+ env_vars=env_vars,
+ capture_error=True,
+ )
def _wait_until_master_is_down(master):
while True:
try:
subprocess.check_call(
- ['curl', '{}:2222'.format(master)], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- logger.info('master {} is still up, waiting for it to exit'.format(master))
+ ["curl", "{}:2222".format(master)], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
+ logger.info("master {} is still up, waiting for it to exit".format(master))
time.sleep(10)
except subprocess.CalledProcessError:
- logger.info('master {} is down, stopping parameter server'.format(master))
+ logger.info("master {} is down, stopping parameter server".format(master))
return
@@ -128,18 +131,19 @@ def train(env, cmd_args):
"""Get training job environment from env and run the training job.
Args:
- env (sagemaker_containers.beta.framework.env.TrainingEnv): Instance of TrainingEnv class
+ env (sagemaker_training.env.TrainingEnv): Instance of TrainingEnv class
"""
parameter_server_enabled = env.additional_framework_parameters.get(
- SAGEMAKER_PARAMETER_SERVER_ENABLED, False)
+ SAGEMAKER_PARAMETER_SERVER_ENABLED, False
+ )
if len(env.hosts) > 1 and parameter_server_enabled:
tf_config = _build_tf_config(hosts=env.hosts, current_host=env.current_host)
- logger.info('Running distributed training job with parameter servers')
- logger.info('Launching parameter server process')
- _run_ps(env, tf_config['cluster'])
- logger.info('Launching worker process')
+ logger.info("Running distributed training job with parameter servers")
+ logger.info("Launching parameter server process")
+ _run_ps(env, tf_config["cluster"])
+ logger.info("Launching worker process")
_run_worker(env, cmd_args, tf_config)
if not _is_host_master(env.hosts, env.current_host):
@@ -147,15 +151,21 @@ def train(env, cmd_args):
else:
- mpi_enabled = env.additional_framework_parameters.get('sagemaker_mpi_enabled')
+ mpi_enabled = env.additional_framework_parameters.get("sagemaker_mpi_enabled")
if mpi_enabled:
- runner_type = framework.runner.MPIRunnerType
+ runner_type = runner.MPIRunnerType
else:
- runner_type = framework.runner.ProcessRunnerType
+ runner_type = runner.ProcessRunnerType
- framework.entry_point.run(env.module_dir, env.user_entry_point, cmd_args, env.to_env_vars(),
- runner=runner_type)
+ entry_point.run(
+ uri=env.module_dir,
+ user_entry_point=env.user_entry_point,
+ args=cmd_args,
+ env_vars=env.to_env_vars(),
+ capture_error=True,
+ runner_type=runner_type,
+ )
def _log_model_missing_warning(model_dir):
@@ -165,48 +175,56 @@ def _log_model_missing_warning(model_dir):
if filenames:
file_exists = True
for f in filenames:
- if 'saved_model.pb' in f or 'saved_model.pbtxt' in f:
+ if "saved_model.pb" in f or "saved_model.pbtxt" in f:
pb_file_exists = True
path, direct_parent_dir = os.path.split(dirpath)
if not str.isdigit(direct_parent_dir):
- logger.warn('Your model will NOT be servable with SageMaker TensorFlow Serving containers. '
- 'The SavedModel bundle is under directory \"{}\", not a numeric name.'
- .format(direct_parent_dir))
+ logger.warn(
+ "Your model will NOT be servable with SageMaker TensorFlow Serving containers. "
+ 'The SavedModel bundle is under directory "{}", not a numeric name.'.format(
+ direct_parent_dir
+ )
+ )
if not file_exists:
- logger.warn('No model artifact is saved under path {}.'
- ' Your training job will not save any model files to S3.\n'
- 'For details of how to construct your training script see:\n'
- 'https://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script'
- .format(model_dir))
+ logger.warn(
+ "No model artifact is saved under path {}."
+ " Your training job will not save any model files to S3.\n"
+ "For details of how to construct your training script see:\n"
+ "https://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script".format(
+ model_dir
+ )
+ )
elif not pb_file_exists:
- logger.warn('Your model will NOT be servable with SageMaker TensorFlow Serving container. '
- 'The model artifact was not saved in the TensorFlow SavedModel directory structure:\n'
- 'https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory')
+ logger.warn(
+ "Your model will NOT be servable with SageMaker TensorFlow Serving container. "
+ "The model artifact was not saved in the TensorFlow SavedModel directory structure:\n"
+ "https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory"
+ )
def _model_dir_with_training_job(model_dir, job_name):
- if model_dir.startswith('/opt/ml'):
+ if model_dir and model_dir.startswith("/opt/ml"):
return model_dir
else:
- return '{}/{}/model'.format(model_dir, job_name)
+ return "{}/{}/model".format(model_dir, job_name)
def main():
"""Training entry point
"""
- hyperparameters = framework.env.read_hyperparameters()
- env = framework.training_env(hyperparameters=hyperparameters)
+ hyperparameters = environment.read_hyperparameters()
+ env = environment.Environment(hyperparameters=hyperparameters)
user_hyperparameters = env.hyperparameters
# If the training job is part of the multiple training jobs for tuning, we need to append the training job name to
# model_dir in case they read from/write to the same object
- if '_tuning_objective_metric' in hyperparameters:
- model_dir = _model_dir_with_training_job(hyperparameters.get('model_dir'), env.job_name)
- logger.info('Appending the training job name to model_dir: {}'.format(model_dir))
- user_hyperparameters['model_dir'] = model_dir
+ if "_tuning_objective_metric" in hyperparameters:
+ model_dir = _model_dir_with_training_job(hyperparameters.get("model_dir"), env.job_name)
+ logger.info("Appending the training job name to model_dir: {}".format(model_dir))
+ user_hyperparameters["model_dir"] = model_dir
- s3_utils.configure(user_hyperparameters.get('model_dir'), os.environ.get('SAGEMAKER_REGION'))
- train(env, framework.mapping.to_cmd_args(user_hyperparameters))
+ s3_utils.configure(user_hyperparameters.get("model_dir"), os.environ.get("SAGEMAKER_REGION"))
+ train(env, mapping.to_cmd_args(user_hyperparameters))
_log_model_missing_warning(MODEL_DIR)
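
For reference, _build_tf_config above produces a TF_CONFIG of the following shape for a three-host job with parameter servers enabled; the host names are an example, while the ports (2222 for master/worker, 2223 for ps) come from the code shown.

# Shape of the dict _build_tf_config would return for hosts
# ["algo-1", "algo-2", "algo-3"] with current_host "algo-2" (illustrative host names).
tf_config = {
    "cluster": {
        "master": ["algo-1:2222"],
        "worker": ["algo-2:2222", "algo-3:2222"],
        "ps": ["algo-1:2223", "algo-2:2223", "algo-3:2223"],
    },
    "environment": "cloud",
    "task": {"index": 0, "type": "worker"},
}
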
diff --git a/test/__init__.py b/test/__init__.py
deleted file mode 100644
index 57862f92..00000000
--- a/test/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License").
-# You may not use this file except in compliance with the License.
-# A copy of the License is located at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# or in the "license" file accompanying this file. This file is distributed
-# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
-# express or implied. See the License for the specific language governing
-# permissions and limitations under the License.
-from __future__ import absolute_import
diff --git a/test/conftest.py b/test/conftest.py
new file mode 100644
index 00000000..56d58673
--- /dev/null
+++ b/test/conftest.py
@@ -0,0 +1,197 @@
+# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+from __future__ import absolute_import
+
+import logging
+import os
+
+import boto3
+import pytest
+from sagemaker import LocalSession, Session
+
+from utils import image_utils
+
+# these regions have some p2 and p3 instances, but not enough for automated testing
+NO_P2_REGIONS = [
+ "ca-central-1",
+ "eu-central-1",
+ "eu-west-2",
+ "us-west-1",
+ "eu-west-3",
+ "eu-north-1",
+ "sa-east-1",
+ "ap-east-1",
+ "me-south-1",
+]
+NO_P3_REGIONS = [
+ "ap-southeast-1",
+ "ap-southeast-2",
+ "ap-south-1",
+ "ca-central-1",
+ "eu-central-1",
+ "eu-west-2",
+ "us-west-1",
+ "eu-west-3",
+ "eu-north-1",
+ "sa-east-1",
+ "ap-east-1",
+ "me-south-1",
+]
+
+
+logger = logging.getLogger(__name__)
+logging.getLogger("boto").setLevel(logging.INFO)
+logging.getLogger("botocore").setLevel(logging.INFO)
+logging.getLogger("factory.py").setLevel(logging.INFO)
+logging.getLogger("auth.py").setLevel(logging.INFO)
+logging.getLogger("connectionpool.py").setLevel(logging.INFO)
+
+DIR_PATH = os.path.dirname(os.path.realpath(__file__))
+
+
+def pytest_addoption(parser):
+ parser.addoption("--build-image", "-B", action="store_true")
+ parser.addoption("--push-image", "-P", action="store_true")
+ parser.addoption("--dockerfile-type", "-T", choices=["dlc.cpu", "dlc.gpu", "tf"], default="tf")
+ parser.addoption("--dockerfile", "-D", default=None)
+ parser.addoption("--docker-base-name", default="sagemaker-tensorflow-training")
+ parser.addoption("--tag", default=None)
+ parser.addoption("--region", default="us-west-2")
+ parser.addoption("--framework-version", default="1.15.2")
+ parser.addoption("--processor", default="cpu", choices=["cpu", "gpu", "cpu,gpu"])
+ parser.addoption("--py-version", default="3", choices=["2", "3", "2,3"])
+ parser.addoption("--account-id", default="142577830533")
+ parser.addoption("--instance-type", default=None)
+
+
+def pytest_generate_tests(metafunc):
+ if "py_version" in metafunc.fixturenames:
+ py_version_params = ["py" + v for v in metafunc.config.getoption("--py-version").split(",")]
+ metafunc.parametrize("py_version", py_version_params, scope="session")
+
+ if "processor" in metafunc.fixturenames:
+ processor_params = metafunc.config.getoption("--processor").split(",")
+ metafunc.parametrize("processor", processor_params, scope="session")
+
+
+@pytest.fixture(scope="session", name="dockerfile_type")
+def fixture_dockerfile_type(request):
+ return request.config.getoption("--dockerfile-type")
+
+
+@pytest.fixture(scope="session", name="dockerfile")
+def fixture_dockerfile(request, dockerfile_type):
+ dockerfile = request.config.getoption("--dockerfile")
+ return dockerfile if dockerfile else "Dockerfile.{}".format(dockerfile_type)
+
+
+@pytest.fixture(scope="session", name="build_image", autouse=True)
+def fixture_build_image(request, framework_version, dockerfile, image_uri, region):
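+ # Build the image from the selected Dockerfile only when --build-image is
+ # passed; otherwise the image referenced by image_uri is assumed to exist.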
+ build_image = request.config.getoption("--build-image")
+ if build_image:
+ return image_utils.build_image(
+ framework_version=framework_version,
+ dockerfile=dockerfile,
+ image_uri=image_uri,
+ region=region,
+ cwd=os.path.join(DIR_PATH, ".."),
+ )
+
+ return image_uri
+
+
+@pytest.fixture(scope="session", name="push_image", autouse=True)
+def fixture_push_image(request, image_uri, region, account_id):
+ push_image = request.config.getoption("--push-image")
+ if push_image:
+ return image_utils.push_image(image_uri, region, account_id)
+ return None
+
+
+@pytest.fixture(scope="session")
+def docker_base_name(request):
+ return request.config.getoption("--docker-base-name")
+
+
+@pytest.fixture(scope="session")
+def region(request):
+ return request.config.getoption("--region")
+
+
+@pytest.fixture(scope="session")
+def framework_version(request):
+ return request.config.getoption("--framework-version")
+
+
+@pytest.fixture(scope="session")
+def tag(request, framework_version, processor, py_version):
+ provided_tag = request.config.getoption("--tag")
+ default_tag = "{}-{}-py{}".format(framework_version, processor, py_version)
+ return provided_tag if provided_tag is not None else default_tag
+
+
+@pytest.fixture(scope="session")
+def sagemaker_session(region):
+ return Session(boto_session=boto3.Session(region_name=region))
+
+
+@pytest.fixture(scope="session")
+def sagemaker_local_session(region):
+ return LocalSession(boto_session=boto3.Session(region_name=region))
+
+
+@pytest.fixture(scope="session")
+def account_id(request):
+ return request.config.getoption("--account-id")
+
+
+@pytest.fixture
+def instance_type(request, processor):
+ provided_instance_type = request.config.getoption("--instance-type")
+ default_instance_type = "ml.c4.xlarge" if processor == "cpu" else "ml.p2.xlarge"
+ return provided_instance_type if provided_instance_type is not None else default_instance_type
+
+
+@pytest.fixture(autouse=True)
+def skip_by_device_type(request, processor):
+ is_gpu = processor == "gpu"
+ if (request.node.get_closest_marker("skip_gpu") and is_gpu) or (
+ request.node.get_closest_marker("skip_cpu") and not is_gpu
+ ):
+ pytest.skip("Skipping because running on '{}' instance".format(processor))
+
+
+@pytest.fixture(autouse=True)
+def skip_gpu_instance_restricted_regions(region, instance_type):
+ if (region in NO_P2_REGIONS and instance_type.startswith("ml.p2")) or (
+ region in NO_P3_REGIONS and instance_type.startswith("ml.p3")
+ ):
+ pytest.skip("Skipping GPU test in region {}".format(region))
+
+
+@pytest.fixture(autouse=True)
+def skip_by_dockerfile_type(request, dockerfile_type):
+ is_generic = dockerfile_type == "tf"
+ if request.node.get_closest_marker("skip_generic") and is_generic:
+ pytest.skip("Skipping because running generic image without mpi and horovod")
+
+
+@pytest.fixture(name="docker_registry", scope="session")
+def fixture_docker_registry(account_id, region):
+ return "{}.dkr.ecr.{}.amazonaws.com".format(account_id, region) if account_id else None
+
+
+@pytest.fixture(name="image_uri", scope="session")
+def fixture_image_uri(docker_registry, docker_base_name, tag):
+ if docker_registry:
+ return "{}/{}:{}".format(docker_registry, docker_base_name, tag)
+ return "{}:{}".format(docker_base_name, tag)
diff --git a/test/container/1.15.2/Dockerfile.dlc.cpu b/test/container/1.15.2/Dockerfile.dlc.cpu
new file mode 100644
index 00000000..98764974
--- /dev/null
+++ b/test/container/1.15.2/Dockerfile.dlc.cpu
@@ -0,0 +1,6 @@
+ARG region
+FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:1.15.2-cpu-py2
+
+COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
+RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
+ rm /sagemaker_tensorflow_training.tar.gz
diff --git a/test/container/1.15.2/Dockerfile.dlc.gpu b/test/container/1.15.2/Dockerfile.dlc.gpu
new file mode 100644
index 00000000..15344f6e
--- /dev/null
+++ b/test/container/1.15.2/Dockerfile.dlc.gpu
@@ -0,0 +1,6 @@
+ARG region
+FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:1.15.2-gpu-py3
+
+COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
+RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
+ rm /sagemaker_tensorflow_training.tar.gz
diff --git a/test/container/1.15.2/Dockerfile.tf b/test/container/1.15.2/Dockerfile.tf
new file mode 100644
index 00000000..b1a62168
--- /dev/null
+++ b/test/container/1.15.2/Dockerfile.tf
@@ -0,0 +1,7 @@
+FROM tensorflow/tensorflow:1.15.2-gpu-py3
+
+ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main
+
+COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
+RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
+ rm /sagemaker_tensorflow_training.tar.gz
diff --git a/test/integration/__init__.py b/test/integration/__init__.py
index 966dd7d4..a2e25d25 100644
--- a/test/integration/__init__.py
+++ b/test/integration/__init__.py
@@ -14,35 +14,18 @@
import logging
import os
+import random
+import time
-logging.getLogger('boto3').setLevel(logging.INFO)
-logging.getLogger('botocore').setLevel(logging.INFO)
+logging.getLogger("boto3").setLevel(logging.INFO)
+logging.getLogger("botocore").setLevel(logging.INFO)
-RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', 'resources')
+RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "resources")
-# these regions have some p2 and p3 instances, but not enough for automated testing
-NO_P2_REGIONS = [
- 'ca-central-1',
- 'eu-central-1',
- 'eu-west-2',
- 'us-west-1',
- 'eu-west-3',
- 'eu-north-1',
- 'sa-east-1',
- 'ap-east-1',
- 'me-south-1'
-]
-NO_P3_REGIONS = [
- 'ap-southeast-1',
- 'ap-southeast-2',
- 'ap-south-1',
- 'ca-central-1',
- 'eu-central-1',
- 'eu-west-2',
- 'us-west-1'
- 'eu-west-3',
- 'eu-north-1',
- 'sa-east-1',
- 'ap-east-1',
- 'me-south-1'
-]
+
+def unique_name_from_base(base, max_length=63):
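+ # Append a Unix timestamp and a 4-digit hex suffix to `base`, trimming `base`
+ # so that the resulting job name stays within `max_length` characters.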
+ unique = "%04x" % random.randrange(16 ** 4) # 4-digit hex
+ ts = str(int(time.time()))
+ available_length = max_length - 2 - len(ts) - len(unique)
+ trimmed = base[:available_length]
+ return "{}-{}-{}".format(trimmed, ts, unique)
diff --git a/test/integration/conftest.py b/test/integration/conftest.py
deleted file mode 100644
index 4b599675..00000000
--- a/test/integration/conftest.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License").
-# You may not use this file except in compliance with the License.
-# A copy of the License is located at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# or in the "license" file accompanying this file. This file is distributed
-# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
-# express or implied. See the License for the specific language governing
-# permissions and limitations under the License.
-from __future__ import absolute_import
-
-import logging
-import os
-
-import boto3
-import pytest
-from sagemaker import LocalSession, Session
-from sagemaker.tensorflow import TensorFlow
-
-from test.integration import NO_P2_REGIONS, NO_P3_REGIONS
-
-logger = logging.getLogger(__name__)
-logging.getLogger('boto').setLevel(logging.INFO)
-logging.getLogger('botocore').setLevel(logging.INFO)
-logging.getLogger('factory.py').setLevel(logging.INFO)
-logging.getLogger('auth.py').setLevel(logging.INFO)
-logging.getLogger('connectionpool.py').setLevel(logging.INFO)
-
-SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
-
-
-def pytest_addoption(parser):
- parser.addoption('--docker-base-name', default='sagemaker-tensorflow-scriptmode')
- parser.addoption('--tag', default=None)
- parser.addoption('--region', default='us-west-2')
- parser.addoption('--framework-version', default=TensorFlow.LATEST_VERSION)
- parser.addoption('--processor', default='cpu', choices=['cpu', 'gpu', 'cpu,gpu'])
- parser.addoption('--py-version', default='3', choices=['2', '3', '2,3'])
- parser.addoption('--account-id', default='142577830533')
- parser.addoption('--instance-type', default=None)
-
-
-def pytest_configure(config):
- os.environ['TEST_PY_VERSIONS'] = config.getoption('--py-version')
- os.environ['TEST_PROCESSORS'] = config.getoption('--processor')
-
-
-@pytest.fixture(scope='session')
-def docker_base_name(request):
- return request.config.getoption('--docker-base-name')
-
-
-@pytest.fixture(scope='session')
-def region(request):
- return request.config.getoption('--region')
-
-
-@pytest.fixture(scope='session')
-def framework_version(request):
- return request.config.getoption('--framework-version')
-
-
-@pytest.fixture
-def tag(request, framework_version, processor, py_version):
- provided_tag = request.config.getoption('--tag')
- default_tag = '{}-{}-py{}'.format(framework_version, processor, py_version)
- return provided_tag if provided_tag is not None else default_tag
-
-
-@pytest.fixture(scope='session')
-def sagemaker_session(region):
- return Session(boto_session=boto3.Session(region_name=region))
-
-
-@pytest.fixture(scope='session')
-def sagemaker_local_session(region):
- return LocalSession(boto_session=boto3.Session(region_name=region))
-
-
-@pytest.fixture(scope='session')
-def account_id(request):
- return request.config.getoption('--account-id')
-
-
-@pytest.fixture
-def instance_type(request, processor):
- provided_instance_type = request.config.getoption('--instance-type')
- default_instance_type = 'ml.c4.xlarge' if processor == 'cpu' else 'ml.p2.xlarge'
- return provided_instance_type if provided_instance_type is not None else default_instance_type
-
-
-@pytest.fixture(autouse=True)
-def skip_by_device_type(request, processor):
- is_gpu = (processor == 'gpu')
- if (request.node.get_closest_marker('skip_gpu') and is_gpu) or \
- (request.node.get_closest_marker('skip_cpu') and not is_gpu):
- pytest.skip('Skipping because running on \'{}\' instance'.format(processor))
-
-
-@pytest.fixture(autouse=True)
-def skip_gpu_instance_restricted_regions(region, instance_type):
- if (region in NO_P2_REGIONS and instance_type.startswith('ml.p2')) or \
- (region in NO_P3_REGIONS and instance_type.startswith('ml.p3')):
- pytest.skip('Skipping GPU test in region {}'.format(region))
-
-
-@pytest.fixture
-def docker_image(docker_base_name, tag):
- return '{}:{}'.format(docker_base_name, tag)
-
-
-@pytest.fixture
-def ecr_image(account_id, docker_base_name, tag, region):
- return '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format(
- account_id, region, docker_base_name, tag)
diff --git a/test/integration/local/test_horovod.py b/test/integration/local/test_horovod.py
index f35ba03a..2137f4ab 100644
--- a/test/integration/local/test_horovod.py
+++ b/test/integration/local/test_horovod.py
@@ -19,48 +19,64 @@
import pytest
from sagemaker.tensorflow import TensorFlow
-from test.integration.utils import processor, py_version # noqa: F401
+RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
-RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
+
+@pytest.mark.skip_cpu
+@pytest.mark.skip_generic
+def test_distributed_training_horovod_gpu(
+ sagemaker_local_session, image_uri, tmpdir, framework_version
+):
+ _test_distributed_training_horovod(
+ 1, 2, sagemaker_local_session, image_uri, tmpdir, framework_version, "local_gpu"
+ )
@pytest.mark.skip_gpu
-@pytest.mark.parametrize('instances, processes', [
- [1, 2],
- (2, 1),
- (2, 2),
- (5, 2)])
-def test_distributed_training_horovod_basic(instances,
- processes,
- sagemaker_local_session,
- docker_image,
- tmpdir,
- framework_version):
- output_path = 'file://%s' % tmpdir
+@pytest.mark.skip_generic
+@pytest.mark.parametrize("instances, processes", [(1, 2), (2, 1), (2, 2), (5, 2)])
+def test_distributed_training_horovod_cpu(
+ instances, processes, sagemaker_local_session, image_uri, tmpdir, framework_version
+):
+ _test_distributed_training_horovod(
+ instances, processes, sagemaker_local_session, image_uri, tmpdir, framework_version, "local"
+ )
+
+
+def _test_distributed_training_horovod(
+ instances, processes, session, image_uri, tmpdir, framework_version, instance_type
+):
+ output_path = "file://%s" % tmpdir
estimator = TensorFlow(
- entry_point=os.path.join(RESOURCE_PATH, 'hvdbasic', 'train_hvd_basic.py'),
- role='SageMakerRole',
- train_instance_type='local',
- sagemaker_session=sagemaker_local_session,
+ entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_basic.py"),
+ role="SageMakerRole",
+ train_instance_type=instance_type,
+ sagemaker_session=session,
train_instance_count=instances,
- image_name=docker_image,
+ image_name=image_uri,
output_path=output_path,
framework_version=framework_version,
- hyperparameters={'sagemaker_mpi_enabled': True,
- 'sagemaker_network_interface_name': 'eth0',
- 'sagemaker_mpi_num_of_processes_per_host': processes})
+ hyperparameters={
+ "sagemaker_mpi_enabled": True,
+ "sagemaker_network_interface_name": "eth0",
+ "sagemaker_mpi_num_of_processes_per_host": processes,
+ },
+ )
- estimator.fit('file://{}'.format(os.path.join(RESOURCE_PATH, 'mnist', 'data-distributed')))
+ estimator.fit("file://{}".format(os.path.join(RESOURCE_PATH, "mnist", "data-distributed")))
tmp = str(tmpdir)
- extract_files(output_path.replace('file://', ''), tmp)
+ extract_files(output_path.replace("file://", ""), tmp)
size = instances * processes
for rank in range(size):
local_rank = rank % processes
- assert read_json('local-rank-%s-rank-%s' % (local_rank, rank), tmp) == {
- 'local-rank': local_rank, 'rank': rank, 'size': size}
+ assert read_json("local-rank-%s-rank-%s" % (local_rank, rank), tmp) == {
+ "local-rank": local_rank,
+ "rank": rank,
+ "size": size,
+ }
def read_json(file, tmp):
@@ -69,14 +85,14 @@ def read_json(file, tmp):
def assert_files_exist_in_tar(output_path, files):
- if output_path.startswith('file://'):
+ if output_path.startswith("file://"):
output_path = output_path[7:]
- model_file = os.path.join(output_path, 'model.tar.gz')
+ model_file = os.path.join(output_path, "model.tar.gz")
with tarfile.open(model_file) as tar:
for f in files:
tar.getmember(f)
def extract_files(output_path, tmpdir):
- with tarfile.open(os.path.join(output_path, 'model.tar.gz')) as tar:
+ with tarfile.open(os.path.join(output_path, "model.tar.gz")) as tar:
tar.extractall(tmpdir)
diff --git a/test/integration/local/test_keras.py b/test/integration/local/test_keras.py
deleted file mode 100644
index 1eca0c2a..00000000
--- a/test/integration/local/test_keras.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"). You
-# may not use this file except in compliance with the License. A copy of
-# the License is located at
-#
-# http://aws.amazon.com/apache2.0/
-#
-# or in the "license" file accompanying this file. This file is
-# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific
-# language governing permissions and limitations under the License.
-from __future__ import absolute_import
-
-import logging
-import os
-
-import numpy as np
-import pytest
-from sagemaker.tensorflow import serving, TensorFlow
-
-from test.integration import RESOURCE_PATH
-from test.integration.utils import processor, py_version # noqa: F401
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-
-@pytest.mark.skip(reason="Serving part fails because of version mismatch.")
-def test_keras_training(sagemaker_local_session, docker_image, tmpdir, framework_version):
- entry_point = os.path.join(RESOURCE_PATH, 'keras_inception.py')
- output_path = 'file://{}'.format(tmpdir)
-
- estimator = TensorFlow(
- entry_point=entry_point,
- role='SageMakerRole',
- train_instance_count=1,
- train_instance_type='local',
- image_name=docker_image,
- sagemaker_session=sagemaker_local_session,
- model_dir='/opt/ml/model',
- output_path=output_path,
- framework_version=framework_version,
- py_version='py3')
-
- estimator.fit()
-
- model = serving.Model(model_data=output_path,
- role='SageMakerRole',
- framework_version=framework_version,
- sagemaker_session=sagemaker_local_session)
-
- predictor = model.deploy(initial_instance_count=1, instance_type='local')
-
- assert predictor.predict(np.random.randn(4, 4, 4, 2) * 255)
-
- predictor.delete_endpoint()
diff --git a/test/integration/local/test_training.py b/test/integration/local/test_training.py
index bd1641b0..35a676a6 100644
--- a/test/integration/local/test_training.py
+++ b/test/integration/local/test_training.py
@@ -18,136 +18,109 @@
import pytest
from sagemaker.tensorflow import TensorFlow
-from test.integration.utils import processor, py_version # noqa: F401
-
-RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
-TF_CHECKPOINT_FILES = ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta']
+RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
+TF_CHECKPOINT_FILES = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
@pytest.fixture # noqa: F811
def py_full_version(py_version): # noqa: F811
- if py_version == '2':
- return '2.7'
+ if py_version == "2":
+ return "2.7"
else:
- return '3.6'
-
-
-@pytest.mark.skip_gpu
-def test_py_versions(sagemaker_local_session, docker_image, py_full_version, framework_version, tmpdir):
- output_path = 'file://{}'.format(tmpdir)
- run_tf_training(script=os.path.join(RESOURCE_PATH, 'test_py_version', 'entry.py'),
- instance_type='local',
- instance_count=1,
- sagemaker_local_session=sagemaker_local_session,
- docker_image=docker_image,
- framework_version=framework_version,
- output_path=output_path,
- training_data_path=None)
-
- with tarfile.open(os.path.join(str(tmpdir), 'output.tar.gz')) as tar:
- output_file = tar.getmember('py_version')
- tar.extractall(path=str(tmpdir), members=[output_file])
-
- with open(os.path.join(str(tmpdir), 'py_version')) as f:
- assert f.read().strip() == py_full_version
+ return "3.6"
@pytest.mark.skip_gpu
-def test_mnist_cpu(sagemaker_local_session, docker_image, tmpdir, framework_version):
- output_path = 'file://{}'.format(tmpdir)
- run_tf_training(script=os.path.join(RESOURCE_PATH, 'mnist', 'mnist.py'),
- instance_type='local',
- instance_count=1,
- sagemaker_local_session=sagemaker_local_session,
- docker_image=docker_image,
- framework_version=framework_version,
- output_path=output_path,
- training_data_path='file://{}'.format(
- os.path.join(RESOURCE_PATH, 'mnist', 'data')))
- _assert_files_exist_in_tar(output_path, ['my_model.h5'])
-
-
-@pytest.mark.skip_cpu
-def test_gpu(sagemaker_local_session, docker_image, framework_version):
- run_tf_training(script=os.path.join(RESOURCE_PATH, 'gpu_device_placement.py'),
- instance_type='local_gpu',
- instance_count=1,
- sagemaker_local_session=sagemaker_local_session,
- docker_image=docker_image,
- framework_version=framework_version,
- training_data_path='file://{}'.format(
- os.path.join(RESOURCE_PATH, 'mnist', 'data')))
+def test_mnist_cpu(sagemaker_local_session, image_uri, tmpdir, framework_version):
+ output_path = "file://{}".format(tmpdir)
+ run_tf_training(
+ script=os.path.join(RESOURCE_PATH, "mnist", "mnist.py"),
+ instance_type="local",
+ instance_count=1,
+ sagemaker_local_session=sagemaker_local_session,
+ image_uri=image_uri,
+ framework_version=framework_version,
+ output_path=output_path,
+ training_data_path="file://{}".format(os.path.join(RESOURCE_PATH, "mnist", "data")),
+ )
+ _assert_files_exist_in_tar(output_path, ["my_model.h5"])
@pytest.mark.skip_gpu
-def test_distributed_training_cpu_no_ps(sagemaker_local_session,
- docker_image,
- tmpdir,
- framework_version):
- output_path = 'file://{}'.format(tmpdir)
- run_tf_training(script=os.path.join(RESOURCE_PATH, 'mnist', 'mnist_estimator.py'),
- instance_type='local',
- instance_count=2,
- sagemaker_local_session=sagemaker_local_session,
- docker_image=docker_image,
- framework_version=framework_version,
- output_path=output_path,
- training_data_path='file://{}'.format(
- os.path.join(RESOURCE_PATH, 'mnist', 'data-distributed')))
+def test_distributed_training_cpu_no_ps(
+ sagemaker_local_session, image_uri, tmpdir, framework_version
+):
+ output_path = "file://{}".format(tmpdir)
+ run_tf_training(
+ script=os.path.join(RESOURCE_PATH, "mnist", "mnist_estimator.py"),
+ instance_type="local",
+ instance_count=2,
+ sagemaker_local_session=sagemaker_local_session,
+ image_uri=image_uri,
+ framework_version=framework_version,
+ output_path=output_path,
+ training_data_path="file://{}".format(
+ os.path.join(RESOURCE_PATH, "mnist", "data-distributed")
+ ),
+ )
_assert_files_exist_in_tar(output_path, TF_CHECKPOINT_FILES)
@pytest.mark.skip_gpu
-def test_distributed_training_cpu_ps(sagemaker_local_session,
- docker_image,
- tmpdir,
- framework_version):
- output_path = 'file://{}'.format(tmpdir)
- run_tf_training(script=os.path.join(RESOURCE_PATH, 'mnist', 'mnist_estimator.py'),
- instance_type='local',
- instance_count=2,
- sagemaker_local_session=sagemaker_local_session,
- docker_image=docker_image,
- framework_version=framework_version,
- output_path=output_path,
- hyperparameters={'sagemaker_parameter_server_enabled': True},
- training_data_path='file://{}'.format(
- os.path.join(RESOURCE_PATH, 'mnist', 'data-distributed')))
+def test_distributed_training_cpu_ps(sagemaker_local_session, image_uri, tmpdir, framework_version):
+ output_path = "file://{}".format(tmpdir)
+ run_tf_training(
+ script=os.path.join(RESOURCE_PATH, "mnist", "mnist_estimator.py"),
+ instance_type="local",
+ instance_count=2,
+ sagemaker_local_session=sagemaker_local_session,
+ image_uri=image_uri,
+ framework_version=framework_version,
+ output_path=output_path,
+ hyperparameters={"sagemaker_parameter_server_enabled": True},
+ training_data_path="file://{}".format(
+ os.path.join(RESOURCE_PATH, "mnist", "data-distributed")
+ ),
+ )
_assert_files_exist_in_tar(output_path, TF_CHECKPOINT_FILES)
-def run_tf_training(script,
- instance_type,
- instance_count,
- sagemaker_local_session,
- docker_image,
- framework_version,
- training_data_path,
- output_path=None,
- hyperparameters=None):
+def run_tf_training(
+ script,
+ instance_type,
+ instance_count,
+ sagemaker_local_session,
+ image_uri,
+ framework_version,
+ training_data_path,
+ output_path=None,
+ hyperparameters=None,
+):
hyperparameters = hyperparameters or {}
- estimator = TensorFlow(entry_point=script,
- role='SageMakerRole',
- train_instance_count=instance_count,
- train_instance_type=instance_type,
- sagemaker_session=sagemaker_local_session,
- image_name=docker_image,
- model_dir='/opt/ml/model',
- output_path=output_path,
- hyperparameters=hyperparameters,
- base_job_name='test-tf',
- framework_version=framework_version,
- py_version='py3')
+ estimator = TensorFlow(
+ entry_point=script,
+ role="SageMakerRole",
+ train_instance_count=instance_count,
+ train_instance_type=instance_type,
+ sagemaker_session=sagemaker_local_session,
+ image_name=image_uri,
+ model_dir="/opt/ml/model",
+ output_path=output_path,
+ hyperparameters=hyperparameters,
+ base_job_name="test-tf",
+ framework_version=framework_version,
+ py_version="py3",
+ )
estimator.fit(training_data_path)
def _assert_files_exist_in_tar(output_path, files):
- if output_path.startswith('file://'):
+ if output_path.startswith("file://"):
output_path = output_path[7:]
- model_file = os.path.join(output_path, 'model.tar.gz')
+ model_file = os.path.join(output_path, "model.tar.gz")
with tarfile.open(model_file) as tar:
for f in files:
tar.getmember(f)
diff --git a/test/integration/sagemaker/test_horovod.py b/test/integration/sagemaker/test_horovod.py
index 1d2bd8ac..de7c3ff1 100644
--- a/test/integration/sagemaker/test_horovod.py
+++ b/test/integration/sagemaker/test_horovod.py
@@ -14,39 +14,68 @@
import os
+import pytest
import sagemaker
from sagemaker.tensorflow import TensorFlow
+from sagemaker.utils import unique_name_from_base
-from test.integration.utils import processor, py_version, unique_name_from_base # noqa: F401
+RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
-RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
+@pytest.mark.skip_generic
+def test_distributed_training_horovod(
+ sagemaker_session, instance_type, image_uri, tmpdir, framework_version
+):
-def test_distributed_training_horovod(sagemaker_session,
- instance_type,
- ecr_image,
- tmpdir,
- framework_version):
-
- mpi_options = '-verbose -x orte_base_help_aggregate=0'
+ mpi_options = "-verbose -x orte_base_help_aggregate=0"
estimator = TensorFlow(
- entry_point=os.path.join(RESOURCE_PATH, 'mnist', 'horovod_mnist.py'),
- role='SageMakerRole',
+ entry_point=os.path.join(RESOURCE_PATH, "mnist", "horovod_mnist.py"),
+ role="SageMakerRole",
train_instance_type=instance_type,
train_instance_count=2,
- image_name=ecr_image,
+ image_name=image_uri,
framework_version=framework_version,
- py_version='py3',
+ py_version="py3",
script_mode=True,
- hyperparameters={'sagemaker_mpi_enabled': True,
- 'sagemaker_mpi_custom_mpi_options': mpi_options,
- 'sagemaker_mpi_num_of_processes_per_host': 1},
- sagemaker_session=sagemaker_session)
+ hyperparameters={
+ "sagemaker_mpi_enabled": True,
+ "sagemaker_mpi_custom_mpi_options": mpi_options,
+ "sagemaker_mpi_num_of_processes_per_host": 1,
+ },
+ sagemaker_session=sagemaker_session,
+ )
- estimator.fit(job_name=unique_name_from_base('test-tf-horovod'))
+ estimator.fit(job_name=unique_name_from_base("test-tf-horovod"))
model_data_source = sagemaker.local.data.get_data_source_instance(
- estimator.model_data, sagemaker_session)
+ estimator.model_data, sagemaker_session
+ )
for filename in model_data_source.get_file_list():
- assert os.path.basename(filename) == 'model.tar.gz'
+ assert os.path.basename(filename) == "model.tar.gz"
+
+
+@pytest.mark.skip_generic
+def test_distributed_training_horovod_with_env_vars(
+ sagemaker_session, instance_type, image_uri, tmpdir, framework_version
+):
+
+ mpi_options = "-verbose -x orte_base_help_aggregate=0"
+ estimator = TensorFlow(
+ entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_env_vars.py"),
+ role="SageMakerRole",
+ train_instance_type=instance_type,
+ train_instance_count=2,
+ image_name=image_uri,
+ framework_version=framework_version,
+ py_version="py3",
+ script_mode=True,
+ hyperparameters={
+ "sagemaker_mpi_enabled": True,
+ "sagemaker_mpi_custom_mpi_options": mpi_options,
+ "sagemaker_mpi_num_of_processes_per_host": 2,
+ },
+ sagemaker_session=sagemaker_session,
+ )
+
+ estimator.fit(job_name=unique_name_from_base("test-tf-horovod-env-vars"))
diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py
index 25c8db3e..c466f573 100644
--- a/test/integration/sagemaker/test_mnist.py
+++ b/test/integration/sagemaker/test_mnist.py
@@ -18,143 +18,125 @@
import pytest
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import HyperparameterTuner, IntegerParameter
+from sagemaker.utils import unique_name_from_base
from six.moves.urllib.parse import urlparse
-from test.integration.utils import processor, py_version, unique_name_from_base # noqa: F401
from timeout import timeout
@pytest.mark.deploy_test
-def test_mnist(sagemaker_session, ecr_image, instance_type, framework_version):
- resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
- script = os.path.join(resource_path, 'mnist', 'mnist.py')
- estimator = TensorFlow(entry_point=script,
- role='SageMakerRole',
- train_instance_type=instance_type,
- train_instance_count=1,
- sagemaker_session=sagemaker_session,
- image_name=ecr_image,
- framework_version=framework_version,
- script_mode=True)
+def test_mnist(sagemaker_session, image_uri, instance_type, framework_version):
+ resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
+ script = os.path.join(resource_path, "mnist", "mnist.py")
+ estimator = TensorFlow(
+ entry_point=script,
+ role="SageMakerRole",
+ train_instance_type=instance_type,
+ train_instance_count=1,
+ sagemaker_session=sagemaker_session,
+ image_name=image_uri,
+ framework_version=framework_version,
+ script_mode=True,
+ )
inputs = estimator.sagemaker_session.upload_data(
- path=os.path.join(resource_path, 'mnist', 'data'),
- key_prefix='scriptmode/mnist')
- estimator.fit(inputs, job_name=unique_name_from_base('test-sagemaker-mnist'))
+ path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist"
+ )
+ estimator.fit(inputs, job_name=unique_name_from_base("test-sagemaker-mnist"))
_assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data)
-def test_distributed_mnist_no_ps(sagemaker_session, ecr_image, instance_type, framework_version):
- resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
- script = os.path.join(resource_path, 'mnist', 'mnist.py')
- estimator = TensorFlow(entry_point=script,
- role='SageMakerRole',
- train_instance_count=2,
- train_instance_type=instance_type,
- sagemaker_session=sagemaker_session,
- image_name=ecr_image,
- framework_version=framework_version,
- script_mode=True)
+def test_distributed_mnist_no_ps(sagemaker_session, image_uri, instance_type, framework_version):
+ resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
+ script = os.path.join(resource_path, "mnist", "mnist.py")
+ estimator = TensorFlow(
+ entry_point=script,
+ role="SageMakerRole",
+ train_instance_count=2,
+ train_instance_type=instance_type,
+ sagemaker_session=sagemaker_session,
+ image_name=image_uri,
+ framework_version=framework_version,
+ script_mode=True,
+ )
inputs = estimator.sagemaker_session.upload_data(
- path=os.path.join(resource_path, 'mnist', 'data'),
- key_prefix='scriptmode/mnist')
- estimator.fit(inputs, job_name=unique_name_from_base('test-tf-sm-distributed-mnist'))
+ path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist"
+ )
+ estimator.fit(inputs, job_name=unique_name_from_base("test-tf-sm-distributed-mnist"))
_assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data)
-def test_distributed_mnist_ps(sagemaker_session, ecr_image, instance_type, framework_version):
- resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
- script = os.path.join(resource_path, 'mnist', 'mnist_estimator.py')
- estimator = TensorFlow(entry_point=script,
- role='SageMakerRole',
- hyperparameters={'sagemaker_parameter_server_enabled': True},
- train_instance_count=2,
- train_instance_type=instance_type,
- sagemaker_session=sagemaker_session,
- image_name=ecr_image,
- framework_version=framework_version,
- script_mode=True)
+def test_distributed_mnist_ps(sagemaker_session, image_uri, instance_type, framework_version):
+ resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
+ script = os.path.join(resource_path, "mnist", "mnist_estimator.py")
+ estimator = TensorFlow(
+ entry_point=script,
+ role="SageMakerRole",
+ hyperparameters={"sagemaker_parameter_server_enabled": True},
+ train_instance_count=2,
+ train_instance_type=instance_type,
+ sagemaker_session=sagemaker_session,
+ image_name=image_uri,
+ framework_version=framework_version,
+ script_mode=True,
+ )
inputs = estimator.sagemaker_session.upload_data(
- path=os.path.join(resource_path, 'mnist', 'data-distributed'),
- key_prefix='scriptmode/mnist-distributed')
- estimator.fit(inputs, job_name=unique_name_from_base('test-tf-sm-distributed-mnist'))
+ path=os.path.join(resource_path, "mnist", "data-distributed"),
+ key_prefix="scriptmode/mnist-distributed",
+ )
+ estimator.fit(inputs, job_name=unique_name_from_base("test-tf-sm-distributed-mnist"))
_assert_checkpoint_exists(sagemaker_session.boto_region_name, estimator.model_dir, 0)
_assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data)
-def test_s3_plugin(sagemaker_session, ecr_image, instance_type, region, framework_version):
- resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
- script = os.path.join(resource_path, 'mnist', 'mnist_estimator.py')
- estimator = TensorFlow(entry_point=script,
- role='SageMakerRole',
- hyperparameters={
- # Saving a checkpoint after every 5 steps to hammer the S3 plugin
- 'save-checkpoint-steps': 10,
- # Disable throttling for checkpoint and model saving
- 'throttle-secs': 0,
- # Without the patch training jobs would fail around 100th to
- # 150th step
- 'max-steps': 200,
- # Large batch size would result in a larger checkpoint file
- 'batch-size': 1024,
- # This makes the training job exporting model during training.
- # Stale model garbage collection will also be performed.
- 'export-model-during-training': True
- },
- train_instance_count=1,
- train_instance_type=instance_type,
- sagemaker_session=sagemaker_session,
- image_name=ecr_image,
- framework_version=framework_version,
- script_mode=True)
- estimator.fit('s3://sagemaker-sample-data-{}/tensorflow/mnist'.format(region),
- job_name=unique_name_from_base('test-tf-sm-s3-mnist'))
- _assert_s3_file_exists(region, estimator.model_data)
- _assert_checkpoint_exists(region, estimator.model_dir, 200)
-
-
-def test_tuning(sagemaker_session, ecr_image, instance_type, framework_version):
- resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
- script = os.path.join(resource_path, 'mnist', 'mnist.py')
-
- estimator = TensorFlow(entry_point=script,
- role='SageMakerRole',
- train_instance_type=instance_type,
- train_instance_count=1,
- sagemaker_session=sagemaker_session,
- image_name=ecr_image,
- framework_version=framework_version,
- script_mode=True)
-
- hyperparameter_ranges = {'epochs': IntegerParameter(1, 2)}
- objective_metric_name = 'accuracy'
- metric_definitions = [{'Name': objective_metric_name, 'Regex': 'accuracy = ([0-9\\.]+)'}]
-
- tuner = HyperparameterTuner(estimator,
- objective_metric_name,
- hyperparameter_ranges,
- metric_definitions,
- max_jobs=2,
- max_parallel_jobs=2)
+def test_tuning(sagemaker_session, image_uri, instance_type, framework_version):
+ resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
+ script = os.path.join(resource_path, "mnist", "mnist.py")
+
+ estimator = TensorFlow(
+ entry_point=script,
+ role="SageMakerRole",
+ train_instance_type=instance_type,
+ train_instance_count=1,
+ sagemaker_session=sagemaker_session,
+ image_name=image_uri,
+ framework_version=framework_version,
+ script_mode=True,
+ )
+
+ hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)}
+ objective_metric_name = "accuracy"
+ metric_definitions = [{"Name": objective_metric_name, "Regex": "accuracy = ([0-9\\.]+)"}]
+
+ tuner = HyperparameterTuner(
+ estimator,
+ objective_metric_name,
+ hyperparameter_ranges,
+ metric_definitions,
+ max_jobs=2,
+ max_parallel_jobs=2,
+ )
with timeout(minutes=20):
inputs = estimator.sagemaker_session.upload_data(
- path=os.path.join(resource_path, 'mnist', 'data'),
- key_prefix='scriptmode/mnist')
+ path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist"
+ )
- tuning_job_name = unique_name_from_base('test-tf-sm-tuning', max_length=32)
+ tuning_job_name = unique_name_from_base("test-tf-sm-tuning", max_length=32)
tuner.fit(inputs, job_name=tuning_job_name)
tuner.wait()
def _assert_checkpoint_exists(region, model_dir, checkpoint_number):
- _assert_s3_file_exists(region, os.path.join(model_dir, 'graph.pbtxt'))
- _assert_s3_file_exists(region,
- os.path.join(model_dir, 'model.ckpt-{}.index'.format(checkpoint_number)))
- _assert_s3_file_exists(region,
- os.path.join(model_dir, 'model.ckpt-{}.meta'.format(checkpoint_number)))
+ _assert_s3_file_exists(region, os.path.join(model_dir, "graph.pbtxt"))
+ _assert_s3_file_exists(
+ region, os.path.join(model_dir, "model.ckpt-{}.index".format(checkpoint_number))
+ )
+ _assert_s3_file_exists(
+ region, os.path.join(model_dir, "model.ckpt-{}.meta".format(checkpoint_number))
+ )
def _assert_s3_file_exists(region, s3_url):
parsed_url = urlparse(s3_url)
- s3 = boto3.resource('s3', region_name=region)
- s3.Object(parsed_url.netloc, parsed_url.path.lstrip('/')).load()
+ s3 = boto3.resource("s3", region_name=region)
+ s3.Object(parsed_url.netloc, parsed_url.path.lstrip("/")).load()
diff --git a/test/integration/sagemaker/test_tuning_model_dir.py b/test/integration/sagemaker/test_tuning_model_dir.py
index e833c3a4..c113c1cb 100644
--- a/test/integration/sagemaker/test_tuning_model_dir.py
+++ b/test/integration/sagemaker/test_tuning_model_dir.py
@@ -16,30 +16,35 @@
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import HyperparameterTuner, IntegerParameter
-
-from test.integration.utils import processor, py_version, unique_name_from_base # noqa: F401
-
-
-def test_model_dir_with_training_job_name(sagemaker_session, ecr_image, instance_type, framework_version):
- resource_path = os.path.join(os.path.dirname(__file__), '../..', 'resources')
- script = os.path.join(resource_path, 'tuning_model_dir', 'entry.py')
-
- estimator = TensorFlow(entry_point=script,
- role='SageMakerRole',
- train_instance_type=instance_type,
- train_instance_count=1,
- image_name=ecr_image,
- framework_version=framework_version,
- py_version='py3',
- sagemaker_session=sagemaker_session)
-
- tuner = HyperparameterTuner(estimator=estimator,
- objective_metric_name='accuracy',
- hyperparameter_ranges={'arbitrary_value': IntegerParameter(0, 1)},
- metric_definitions=[{'Name': 'accuracy', 'Regex': 'accuracy=([01])'}],
- max_jobs=1,
- max_parallel_jobs=1)
+from sagemaker.utils import unique_name_from_base
+
+
+def test_model_dir_with_training_job_name(
+ sagemaker_session, image_uri, instance_type, framework_version
+):
+ resource_path = os.path.join(os.path.dirname(__file__), "../..", "resources")
+ script = os.path.join(resource_path, "tuning_model_dir", "entry.py")
+
+ estimator = TensorFlow(
+ entry_point=script,
+ role="SageMakerRole",
+ train_instance_type=instance_type,
+ train_instance_count=1,
+ image_name=image_uri,
+ framework_version=framework_version,
+ py_version="py3",
+ sagemaker_session=sagemaker_session,
+ )
+
+ tuner = HyperparameterTuner(
+ estimator=estimator,
+ objective_metric_name="accuracy",
+ hyperparameter_ranges={"arbitrary_value": IntegerParameter(0, 1)},
+ metric_definitions=[{"Name": "accuracy", "Regex": "accuracy=([01])"}],
+ max_jobs=1,
+ max_parallel_jobs=1,
+ )
# User script has logic to check for the correct model_dir
- tuner.fit(job_name=unique_name_from_base('test-tf-model-dir', max_length=32))
+ tuner.fit(job_name=unique_name_from_base("test-tf-model-dir", max_length=32))
tuner.wait()
diff --git a/test/integration/sagemaker/timeout.py b/test/integration/sagemaker/timeout.py
index d4738d32..1ff4278c 100644
--- a/test/integration/sagemaker/timeout.py
+++ b/test/integration/sagemaker/timeout.py
@@ -16,7 +16,7 @@
import logging
import signal
-LOGGER = logging.getLogger('timeout')
+LOGGER = logging.getLogger("timeout")
class TimeoutError(Exception):
@@ -39,7 +39,7 @@ def timeout(seconds=0, minutes=0, hours=0):
limit = seconds + 60 * minutes + 3600 * hours
def handler(signum, frame):
- raise TimeoutError('timed out after {} seconds'.format(limit))
+ raise TimeoutError("timed out after {} seconds".format(limit))
try:
signal.signal(signal.SIGALRM, handler)
diff --git a/test/integration/utils.py b/test/integration/utils.py
deleted file mode 100644
index 4944eb20..00000000
--- a/test/integration/utils.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"). You
-# may not use this file except in compliance with the License. A copy of
-# the License is located at
-#
-# http://aws.amazon.com/apache2.0/
-#
-# or in the "license" file accompanying this file. This file is
-# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific
-# language governing permissions and limitations under the License.
-from __future__ import absolute_import
-
-import os
-import random
-import time
-
-import pytest
-
-
-def unique_name_from_base(base, max_length=63):
- unique = '%04x' % random.randrange(16**4) # 4-digit hex
- ts = str(int(time.time()))
- available_length = max_length - 2 - len(ts) - len(unique)
- trimmed = base[:available_length]
- return '{}-{}-{}'.format(trimmed, ts, unique)
-
-
-@pytest.fixture(params=os.environ['TEST_PY_VERSIONS'].split(','))
-def py_version(request):
- return request.param
-
-
-@pytest.fixture(params=os.environ['TEST_PROCESSORS'].split(','))
-def processor(request):
- return request.param
diff --git a/test/resources/gpu_device_placement.py b/test/resources/gpu_device_placement.py
deleted file mode 100644
index 11bbcdff..00000000
--- a/test/resources/gpu_device_placement.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License").
-# You may not use this file except in compliance with the License.
-# A copy of the License is located at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# or in the "license" file accompanying this file. This file is distributed
-# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
-# express or implied. See the License for the specific language governing
-# permissions and limitations under the License.
-import tensorflow as tf
-
-# https://www.tensorflow.org/programmers_guide/using_gpu
-print('-' * 87)
-print('Run GPU test.')
-with tf.device('/gpu:0'):
- a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
- b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
-c = tf.matmul(a, b)
-sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
-# Runs the op.
-print(sess.run(c))
-print('-' * 87)
-print('')
diff --git a/test/resources/hvdbasic/train_hvd_basic.py b/test/resources/hvdbasic/train_hvd_basic.py
index cc068678..24a35a8b 100644
--- a/test/resources/hvdbasic/train_hvd_basic.py
+++ b/test/resources/hvdbasic/train_hvd_basic.py
@@ -4,8 +4,10 @@
hvd.init()
-with open(os.path.join('/opt/ml/model/local-rank-%s-rank-%s' % (hvd.local_rank(), hvd.rank())), 'w+') as f:
- basic_info = {'local-rank': hvd.local_rank(), 'rank': hvd.rank(), 'size': hvd.size()}
+with open(
+ os.path.join("/opt/ml/model/local-rank-%s-rank-%s" % (hvd.local_rank(), hvd.rank())), "w+"
+) as f:
+ basic_info = {"local-rank": hvd.local_rank(), "rank": hvd.rank(), "size": hvd.size()}
print(basic_info)
json.dump(basic_info, f)
diff --git a/test/resources/hvdbasic/train_hvd_env_vars.py b/test/resources/hvdbasic/train_hvd_env_vars.py
new file mode 100644
index 00000000..da67367c
--- /dev/null
+++ b/test/resources/hvdbasic/train_hvd_env_vars.py
@@ -0,0 +1,19 @@
+import json
+import os
+import horovod.tensorflow as hvd
+
+hvd.init()
+
+with open("/opt/ml/model/local-rank-%s-rank-%s" % (hvd.local_rank(), hvd.rank()), "w+") as f:
+ basic_info = {"local-rank": hvd.local_rank(), "rank": hvd.rank(), "size": hvd.size()}
+
+ print(basic_info)
+ json.dump(basic_info, f)
+
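+# The toolkit is expected to propagate the SageMaker credential and host
+# environment variables to every MPI process; fail fast if either is missing.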
+val = os.environ.get("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI")
+host = os.environ.get("SM_CURRENT_HOST")
+
+assert val is not None
+assert host is not None
+
+print("host {}: AWS_CONTAINER_CREDENTIALS_RELATIVE_URI={}".format(host, val))
diff --git a/test/resources/keras_inception.py b/test/resources/keras_inception.py
deleted file mode 100644
index ebfd1a0e..00000000
--- a/test/resources/keras_inception.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License").
-# You may not use this file except in compliance with the License.
-# A copy of the License is located at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# or in the "license" file accompanying this file. This file is distributed
-# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
-# express or implied. See the License for the specific language governing
-# permissions and limitations under the License.
-import argparse
-import os
-
-import keras
-import tensorflow as tf
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--model_dir', type=str)
-
-args = parser.parse_args()
-
-
-# Loading pre-trained Keras model
-model = keras.applications.inception_v3.InceptionV3(weights='imagenet')
-
-# Exports the keras model as TensorFlow Serving Saved Model
-with tf.Session() as session:
-
- init = tf.global_variables_initializer()
- session.run(init)
-
- tf.saved_model.simple_save(
- session,
- os.path.join(args.model_dir, 'inception-model/1'),
- inputs={'input_image': model.input},
- outputs={t.name: t for t in model.outputs})
diff --git a/test/resources/mnist/horovod_mnist.py b/test/resources/mnist/horovod_mnist.py
index 1014f2bb..f2bf4e8f 100644
--- a/test/resources/mnist/horovod_mnist.py
+++ b/test/resources/mnist/horovod_mnist.py
@@ -10,120 +10,84 @@
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
-from __future__ import absolute_import, print_function
-
import os
-import subprocess
-
-import keras
-from keras.datasets import mnist
-from keras.models import Sequential
-from keras.layers import Dense, Dropout, Flatten
-from keras.layers import Conv2D, MaxPooling2D
-from keras import backend as K
import tensorflow as tf
-import horovod.keras as hvd
-
+import horovod.tensorflow as hvd
# Horovod: initialize Horovod.
hvd.init()
# Horovod: pin GPU to be used to process local rank (one GPU per process)
-config = tf.ConfigProto()
-config.gpu_options.allow_growth = True
-config.gpu_options.visible_device_list = str(hvd.local_rank())
-K.set_session(tf.Session(config=config))
-
-batch_size = 128
-num_classes = 10
-
-epochs = 1
-
-# Input image dimensions
-img_rows, img_cols = 28, 28
-
-# The data, shuffled and split between train and test sets
-(x_train, y_train), (x_test, y_test) = mnist.load_data()
-
-x_train = x_train[:600]
-y_train = y_train[:600]
-x_test = x_test[:100]
-y_test = y_test[:100]
-
-if K.image_data_format() == 'channels_first':
- x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
- x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
- input_shape = (1, img_rows, img_cols)
-else:
- x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
- x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
- input_shape = (img_rows, img_cols, 1)
-
-x_train = x_train.astype('float32')
-x_test = x_test.astype('float32')
-x_train /= 255
-x_test /= 255
-print('x_train shape:', x_train.shape)
-print(x_train.shape[0], 'train samples')
-print(x_test.shape[0], 'test samples')
-
-# Convert class vectors to binary class matrices
-y_train = keras.utils.to_categorical(y_train, num_classes)
-y_test = keras.utils.to_categorical(y_test, num_classes)
-
-model = Sequential()
-model.add(Conv2D(32, kernel_size=(3, 3),
- activation='relu',
- input_shape=input_shape))
-model.add(Conv2D(64, (3, 3), activation='relu'))
-model.add(MaxPooling2D(pool_size=(2, 2)))
-model.add(Dropout(0.25))
-model.add(Flatten())
-model.add(Dense(128, activation='relu'))
-model.add(Dropout(0.5))
-model.add(Dense(num_classes, activation='softmax'))
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+ tf.config.experimental.set_memory_growth(gpu, True)
+if gpus:
+ tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
+
+(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data(
+ path="mnist-%d.npz" % hvd.rank()
+)
+
+dataset = tf.data.Dataset.from_tensor_slices(
+ (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64))
+)
+dataset = dataset.repeat().shuffle(10000).batch(128)
+
+mnist_model = tf.keras.Sequential(
+ [
+ tf.keras.layers.Conv2D(32, [3, 3], activation="relu"),
+ tf.keras.layers.Conv2D(64, [3, 3], activation="relu"),
+ tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
+ tf.keras.layers.Dropout(0.25),
+ tf.keras.layers.Flatten(),
+ tf.keras.layers.Dense(128, activation="relu"),
+ tf.keras.layers.Dropout(0.5),
+ tf.keras.layers.Dense(10, activation="softmax"),
+ ]
+)
+loss = tf.losses.SparseCategoricalCrossentropy()
# Horovod: adjust learning rate based on number of GPUs.
-opt = keras.optimizers.Adadelta(1.0 * hvd.size())
+opt = tf.optimizers.Adam(0.001 * hvd.size())
+
+checkpoint_dir = "./checkpoints"
+checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)
+
+
+@tf.function
+def training_step(images, labels, first_batch):
+ with tf.GradientTape() as tape:
+ probs = mnist_model(images, training=True)
+ loss_value = loss(labels, probs)
-# Horovod: add Horovod Distributed Optimizer.
-opt = hvd.DistributedOptimizer(opt)
+ # Horovod: add Horovod Distributed GradientTape.
+ tape = hvd.DistributedGradientTape(tape)
-model.compile(loss=keras.losses.categorical_crossentropy,
- optimizer=opt,
- metrics=['accuracy'])
+ grads = tape.gradient(loss_value, mnist_model.trainable_variables)
+ opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
-callbacks = [
# Horovod: broadcast initial variable states from rank 0 to all other processes.
# This is necessary to ensure consistent initialization of all workers when
# training is started with random weights or restored from a checkpoint.
- hvd.callbacks.BroadcastGlobalVariablesCallback(0),
-]
+ #
+ # Note: broadcast should be done after the first gradient step to ensure optimizer
+ # initialization.
+ if first_batch:
+ hvd.broadcast_variables(mnist_model.variables, root_rank=0)
+ hvd.broadcast_variables(opt.variables(), root_rank=0)
-# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
-if hvd.rank() == 0:
- callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
-
-model.fit(x_train, y_train,
- batch_size=batch_size,
- callbacks=callbacks,
- epochs=epochs,
- verbose=1,
- validation_data=(x_test, y_test))
-score = model.evaluate(x_test, y_test, verbose=0)
-print('Test loss:', score[0])
-print('Test accuracy:', score[1])
+ return loss_value
-if hvd.rank() == 0:
- # Exports the keras model as TensorFlow Serving Saved Model
- with K.get_session() as session:
+# Horovod: adjust number of steps based on number of GPUs.
+for batch, (images, labels) in enumerate(dataset.take(600 // hvd.size())):
+ loss_value = training_step(images, labels, batch == 0)
- init = tf.global_variables_initializer()
- session.run(init)
+ if batch % 10 == 0 and hvd.local_rank() == 0:
+ print("Step #%d\tLoss: %.6f" % (batch, loss_value))
- tf.saved_model.simple_save(
- session,
- os.path.join('/opt/ml/model/mnist/1'),
- inputs={'input_image': model.input},
- outputs={t.name: t for t in model.outputs})
+# Horovod: save checkpoints only on worker 0 to prevent other workers from
+# corrupting it.
+if hvd.rank() == 0:
+ # Export the Keras model as a TensorFlow SavedModel
+ mnist_model.save(os.path.join("/opt/ml/model/mnist/1"), save_format="tf")
diff --git a/test/resources/mnist/mnist.py b/test/resources/mnist/mnist.py
index e4349ce2..e1c2b275 100644
--- a/test/resources/mnist/mnist.py
+++ b/test/resources/mnist/mnist.py
@@ -7,63 +7,49 @@
import tensorflow as tf
-
def _parse_args():
parser = argparse.ArgumentParser()
# hyperparameters sent by the client are passed as command-line arguments to the script.
- parser.add_argument('--epochs', type=int, default=1)
+ parser.add_argument("--epochs", type=int, default=1)
# Data, model, and output directories
- parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
- parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
- parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS']))
- parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
+ parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
+ parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAINING"])
+ parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
+ parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"])
return parser.parse_known_args()
def _load_training_data(base_dir):
- x_train = np.load(os.path.join(base_dir, 'train', 'x_train.npy'))
- y_train = np.load(os.path.join(base_dir, 'train', 'y_train.npy'))
+ x_train = np.load(os.path.join(base_dir, "train", "x_train.npy"))
+ y_train = np.load(os.path.join(base_dir, "train", "y_train.npy"))
return x_train, y_train
def _load_testing_data(base_dir):
- x_test = np.load(os.path.join(base_dir, 'test', 'x_test.npy'))
- y_test = np.load(os.path.join(base_dir, 'test', 'y_test.npy'))
+ x_test = np.load(os.path.join(base_dir, "test", "x_test.npy"))
+ y_test = np.load(os.path.join(base_dir, "test", "y_test.npy"))
return x_test, y_test
-def assert_can_track_sagemaker_experiments():
- in_sagemaker_training = 'TRAINING_JOB_ARN' in os.environ
- in_python_three = sys.version_info[0] == 3
-
- if in_sagemaker_training and in_python_three:
- import smexperiments.tracker
-
- with smexperiments.tracker.Tracker.load() as tracker:
- tracker.log_parameter('param', 1)
- tracker.log_metric('metric', 1.0)
-
-
args, unknown = _parse_args()
-model = tf.keras.models.Sequential([
- tf.keras.layers.Flatten(input_shape=(28, 28)),
- tf.keras.layers.Dense(512, activation=tf.nn.relu),
- tf.keras.layers.Dropout(0.2),
- tf.keras.layers.Dense(10, activation=tf.nn.softmax)
-])
+model = tf.keras.models.Sequential(
+ [
+ tf.keras.layers.Flatten(input_shape=(28, 28)),
+ tf.keras.layers.Dense(512, activation=tf.nn.relu),
+ tf.keras.layers.Dropout(0.2),
+ tf.keras.layers.Dense(10, activation=tf.nn.softmax),
+ ]
+)
-model.compile(optimizer='adam',
- loss='sparse_categorical_crossentropy',
- metrics=['accuracy'])
+model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
x_train, y_train = _load_training_data(args.train)
x_test, y_test = _load_testing_data(args.train)
model.fit(x_train, y_train, epochs=args.epochs)
model.evaluate(x_test, y_test)
if args.current_host == args.hosts[0]:
- model.save(os.path.join('/opt/ml/model', 'my_model.h5'))
- assert_can_track_sagemaker_experiments()
+ model.save(os.path.join("/opt/ml/model", "my_model.h5"))
diff --git a/test/resources/mnist/mnist_estimator.py b/test/resources/mnist/mnist_estimator.py
index d0b991f2..82fb75ac 100644
--- a/test/resources/mnist/mnist_estimator.py
+++ b/test/resources/mnist/mnist_estimator.py
@@ -4,171 +4,176 @@
from __future__ import division
from __future__ import print_function
+import logging
+
import numpy as np
import tensorflow as tf
import os
import argparse
import json
+
def cnn_model_fn(features, labels, mode):
- """Model function for CNN."""
- # Input Layer
- # Reshape X to 4-D tensor: [batch_size, width, height, channels]
- # MNIST images are 28x28 pixels, and have one color channel
- input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
-
- # Convolutional Layer #1
- # Computes 32 features using a 5x5 filter with ReLU activation.
- # Padding is added to preserve width and height.
- # Input Tensor Shape: [batch_size, 28, 28, 1]
- # Output Tensor Shape: [batch_size, 28, 28, 32]
- conv1 = tf.layers.conv2d(
- inputs=input_layer,
- filters=32,
- kernel_size=[5, 5],
- padding="same",
- activation=tf.nn.relu)
-
- # Pooling Layer #1
- # First max pooling layer with a 2x2 filter and stride of 2
- # Input Tensor Shape: [batch_size, 28, 28, 32]
- # Output Tensor Shape: [batch_size, 14, 14, 32]
- pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
-
- # Convolutional Layer #2
- # Computes 64 features using a 5x5 filter.
- # Padding is added to preserve width and height.
- # Input Tensor Shape: [batch_size, 14, 14, 32]
- # Output Tensor Shape: [batch_size, 14, 14, 64]
- conv2 = tf.layers.conv2d(
- inputs=pool1,
- filters=64,
- kernel_size=[5, 5],
- padding="same",
- activation=tf.nn.relu)
-
- # Pooling Layer #2
- # Second max pooling layer with a 2x2 filter and stride of 2
- # Input Tensor Shape: [batch_size, 14, 14, 64]
- # Output Tensor Shape: [batch_size, 7, 7, 64]
- pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
-
- # Flatten tensor into a batch of vectors
- # Input Tensor Shape: [batch_size, 7, 7, 64]
- # Output Tensor Shape: [batch_size, 7 * 7 * 64]
- pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
-
- # Dense Layer
- # Densely connected layer with 1024 neurons
- # Input Tensor Shape: [batch_size, 7 * 7 * 64]
- # Output Tensor Shape: [batch_size, 1024]
- dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
-
- # Add dropout operation; 0.6 probability that element will be kept
- dropout = tf.layers.dropout(
- inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
-
- # Logits layer
- # Input Tensor Shape: [batch_size, 1024]
- # Output Tensor Shape: [batch_size, 10]
- logits = tf.layers.dense(inputs=dropout, units=10)
-
- predictions = {
- # Generate predictions (for PREDICT and EVAL mode)
- "classes": tf.argmax(input=logits, axis=1),
- # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
- # `logging_hook`.
- "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
- }
- if mode == tf.estimator.ModeKeys.PREDICT:
- return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
-
- # Calculate Loss (for both TRAIN and EVAL modes)
- loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
-
- # Configure the Training Op (for TRAIN mode)
- if mode == tf.estimator.ModeKeys.TRAIN:
- optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
- train_op = optimizer.minimize(
- loss=loss,
- global_step=tf.train.get_global_step())
- return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
-
- # Add evaluation metrics (for EVAL mode)
- eval_metric_ops = {
- "accuracy": tf.metrics.accuracy(
- labels=labels, predictions=predictions["classes"])}
- return tf.estimator.EstimatorSpec(
- mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
+ """Model function for CNN."""
+ # Input Layer
+ # Reshape X to 4-D tensor: [batch_size, width, height, channels]
+ # MNIST images are 28x28 pixels, and have one color channel
+ input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
+
+ # Convolutional Layer #1
+ # Computes 32 features using a 5x5 filter with ReLU activation.
+ # Padding is added to preserve width and height.
+ # Input Tensor Shape: [batch_size, 28, 28, 1]
+ # Output Tensor Shape: [batch_size, 28, 28, 32]
+ conv1 = tf.compat.v1.layers.conv2d(
+ inputs=input_layer, filters=32, kernel_size=[5, 5], padding="same", activation=tf.nn.relu
+ )
+
+ # Pooling Layer #1
+ # First max pooling layer with a 2x2 filter and stride of 2
+ # Input Tensor Shape: [batch_size, 28, 28, 32]
+ # Output Tensor Shape: [batch_size, 14, 14, 32]
+ pool1 = tf.compat.v1.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
+
+ # Convolutional Layer #2
+ # Computes 64 features using a 5x5 filter.
+ # Padding is added to preserve width and height.
+ # Input Tensor Shape: [batch_size, 14, 14, 32]
+ # Output Tensor Shape: [batch_size, 14, 14, 64]
+ conv2 = tf.compat.v1.layers.conv2d(
+ inputs=pool1, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu
+ )
+
+ # Pooling Layer #2
+ # Second max pooling layer with a 2x2 filter and stride of 2
+ # Input Tensor Shape: [batch_size, 14, 14, 64]
+ # Output Tensor Shape: [batch_size, 7, 7, 64]
+ pool2 = tf.compat.v1.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
+
+ # Flatten tensor into a batch of vectors
+ # Input Tensor Shape: [batch_size, 7, 7, 64]
+ # Output Tensor Shape: [batch_size, 7 * 7 * 64]
+ pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
+
+ # Dense Layer
+ # Densely connected layer with 1024 neurons
+ # Input Tensor Shape: [batch_size, 7 * 7 * 64]
+ # Output Tensor Shape: [batch_size, 1024]
+ dense = tf.compat.v1.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
+
+ # Add dropout operation; 0.6 probability that element will be kept
+ dropout = tf.compat.v1.layers.dropout(
+ inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN
+ )
+
+ # Logits layer
+ # Input Tensor Shape: [batch_size, 1024]
+ # Output Tensor Shape: [batch_size, 10]
+ logits = tf.compat.v1.layers.dense(inputs=dropout, units=10)
+
+ predictions = {
+ # Generate predictions (for PREDICT and EVAL mode)
+ "classes": tf.argmax(input=logits, axis=1),
+ # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
+ # `logging_hook`.
+ "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
+ }
+ if mode == tf.estimator.ModeKeys.PREDICT:
+ return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
+
+ # Calculate Loss (for both TRAIN and EVAL modes)
+ loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
+
+ # Configure the Training Op (for TRAIN mode)
+ if mode == tf.estimator.ModeKeys.TRAIN:
+ optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001)
+ train_op = optimizer.minimize(loss=loss, global_step=tf.compat.v1.train.get_global_step())
+ return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
+
+ # Add evaluation metrics (for EVAL mode)
+ eval_metric_ops = {
+ "accuracy": tf.compat.v1.metrics.accuracy(labels=labels, predictions=predictions["classes"])
+ }
+ return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
+
def _load_training_data(base_dir):
- x_train = np.load(os.path.join(base_dir, 'train_data.npy'))
- y_train = np.load(os.path.join(base_dir, 'train_labels.npy'))
+ x_train = np.load(os.path.join(base_dir, "train_data.npy"))
+ y_train = np.load(os.path.join(base_dir, "train_labels.npy"))
return x_train, y_train
+
def _load_testing_data(base_dir):
- x_test = np.load(os.path.join(base_dir, 'eval_data.npy'))
- y_test = np.load(os.path.join(base_dir, 'eval_labels.npy'))
+ x_test = np.load(os.path.join(base_dir, "eval_data.npy"))
+ y_test = np.load(os.path.join(base_dir, "eval_labels.npy"))
return x_test, y_test
+
def _parse_args():
parser = argparse.ArgumentParser()
- parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
- parser.add_argument('--model_dir', type=str)
- parser.add_argument('--max-steps', type=int, default=200)
- parser.add_argument('--save-checkpoint-steps', type=int, default=200)
- parser.add_argument('--throttle-secs', type=int, default=60)
- parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS']))
- parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
- parser.add_argument('--batch-size', type=int, default=100)
- parser.add_argument('--export-model-during-training', type=bool, default=False)
+ parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAINING"])
+ parser.add_argument("--model_dir", type=str)
+ parser.add_argument("--max-steps", type=int, default=200)
+ parser.add_argument("--save-checkpoint-steps", type=int, default=200)
+ parser.add_argument("--throttle-secs", type=int, default=60)
+ parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
+ parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"])
+ parser.add_argument("--batch-size", type=int, default=100)
+ parser.add_argument("--export-model-during-training", type=bool, default=False)
return parser.parse_known_args()
+
def serving_input_fn():
- inputs = {'x': tf.placeholder(tf.float32, [None, 784])}
+ inputs = {"x": tf.compat.v1.placeholder(tf.float32, [None, 784])}
return tf.estimator.export.ServingInputReceiver(inputs, inputs)
+
if __name__ == "__main__":
args, unknown = _parse_args()
for arg in vars(args):
print(arg, getattr(args, arg))
- tf.logging.set_verbosity(tf.logging.DEBUG)
+ logger = tf.get_logger()
+ logger.setLevel(logging.DEBUG)
+    # tf.logging was removed in TF 2.x; verbosity is now configured through the standard logging module.
train_data, train_labels = _load_training_data(args.train)
eval_data, eval_labels = _load_testing_data(args.train)
# Saving a checkpoint after every step
run_config = tf.estimator.RunConfig(save_checkpoints_steps=args.save_checkpoint_steps)
mnist_classifier = tf.estimator.Estimator(
- model_fn=cnn_model_fn, model_dir=args.model_dir, config=run_config)
+ model_fn=cnn_model_fn, model_dir=args.model_dir, config=run_config
+ )
# Set up logging for predictions
# Log the values in the "Softmax" tensor with label "probabilities"
tensors_to_log = {"probabilities": "softmax_tensor"}
- logging_hook = tf.train.LoggingTensorHook(
- tensors=tensors_to_log, every_n_iter=50
- )
+ logging_hook = tf.estimator.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=50)
# Train the model
- train_input_fn = tf.estimator.inputs.numpy_input_fn(
+ train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
x={"x": train_data},
y=train_labels,
batch_size=args.batch_size,
num_epochs=None,
- shuffle=True)
+ shuffle=True,
+ )
- exporter = tf.estimator.LatestExporter('Servo', serving_input_receiver_fn=serving_input_fn) \
- if args.export_model_during_training else None
+ exporter = (
+ tf.compat.v1.estimator.LatestExporter("Servo", serving_input_receiver_fn=serving_input_fn)
+ if args.export_model_during_training
+ else None
+ )
# Evaluate the model and print results
- eval_input_fn = tf.estimator.inputs.numpy_input_fn(
- x={"x": eval_data},
- y=eval_labels,
- num_epochs=1,
- shuffle=False)
+ eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
+ x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False
+ )
train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=args.max_steps)
- eval_spec = tf.estimator.EvalSpec(eval_input_fn, throttle_secs=args.throttle_secs, exporters=exporter)
+ eval_spec = tf.estimator.EvalSpec(
+ eval_input_fn, throttle_secs=args.throttle_secs, exporters=exporter
+ )
tf.estimator.train_and_evaluate(mnist_classifier, train_spec, eval_spec)
if args.current_host == args.hosts[0]:
- mnist_classifier.export_savedmodel('/opt/ml/model', serving_input_fn)
+ mnist_classifier.export_saved_model("/opt/ml/model", serving_input_fn)
diff --git a/test/resources/tuning_model_dir/entry.py b/test/resources/tuning_model_dir/entry.py
index 0bce7165..09d44abc 100644
--- a/test/resources/tuning_model_dir/entry.py
+++ b/test/resources/tuning_model_dir/entry.py
@@ -16,11 +16,13 @@
import os
parser = argparse.ArgumentParser()
-parser.add_argument('--model_dir', type=str)
-parser.add_argument('--arbitrary_value', type=int, default=0)
+parser.add_argument("--model_dir", type=str)
+parser.add_argument("--arbitrary_value", type=int, default=0)
args = parser.parse_args()
-assert os.environ['TRAINING_JOB_NAME'] in args.model_dir, 'model_dir not unique to training job: %s' % args.model_dir
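+# The training toolkit appends the training job name to model_dir for tuning jobs; this assertion verifies that.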
+assert os.environ["TRAINING_JOB_NAME"] in args.model_dir, (
+ "model_dir not unique to training job: %s" % args.model_dir
+)
# For the "hyperparameter tuning" to work
-print('accuracy=1')
+print("accuracy=1")
diff --git a/test/unit/test_deep_learning_container.py b/test/unit/test_deep_learning_container.py
deleted file mode 100644
index 7d5d7d86..00000000
--- a/test/unit/test_deep_learning_container.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the 'License'). You
-# may not use this file except in compliance with the License. A copy of
-# the License is located at
-#
-# http://aws.amazon.com/apache2.0/
-#
-# or in the 'license' file accompanying this file. This file is
-# distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific
-# language governing permissions and limitations under the License.
-from __future__ import absolute_import
-
-import unittest
-
-from docker.build_artifacts import deep_learning_container as deep_learning_container_to_test
-import pytest
-import requests
-
-
-@pytest.fixture(name='fixture_valid_instance_id')
-def fixture_valid_instance_id(requests_mock):
- return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id',
- text='i-123t32e11s32t1231')
-
-
-@pytest.fixture(name='fixture_invalid_instance_id')
-def fixture_invalid_instance_id(requests_mock):
- return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', text='i-123')
-
-
-@pytest.fixture(name='fixture_none_instance_id')
-def fixture_none_instance_id(requests_mock):
- return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', text=None)
-
-
-@pytest.fixture(name='fixture_invalid_region')
-def fixture_invalid_region(requests_mock):
- return requests_mock.get('http://169.254.169.254/latest/dynamic/instance-identity/document',
- json={'region': 'test'})
-
-
-@pytest.fixture(name='fixture_valid_region')
-def fixture_valid_region(requests_mock):
- return requests_mock.get('http://169.254.169.254/latest/dynamic/instance-identity/document',
- json={'region': 'us-east-1'})
-
-
-def test_retrieve_instance_id(fixture_valid_instance_id):
- result = deep_learning_container_to_test._retrieve_instance_id()
- assert 'i-123t32e11s32t1231' == result
-
-
-def test_retrieve_none_instance_id(fixture_none_instance_id):
- result = deep_learning_container_to_test._retrieve_instance_id()
- assert result is None
-
-
-def test_retrieve_invalid_instance_id(fixture_invalid_instance_id):
- result = deep_learning_container_to_test._retrieve_instance_id()
- assert result is None
-
-
-def test_retrieve_invalid_region(fixture_invalid_region):
- result = deep_learning_container_to_test._retrieve_instance_region()
- assert result is None
-
-
-def test_retrieve_valid_region(fixture_valid_region):
- result = deep_learning_container_to_test._retrieve_instance_region()
- assert 'us-east-1' == result
-
-
-def test_query_bucket(requests_mock, fixture_valid_region, fixture_valid_instance_id):
- fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231'
- fixture_valid_region.return_value = 'us-east-1'
- requests_mock.get(('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com'
- '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231'),
- text='Access Denied')
- actual_response = deep_learning_container_to_test.query_bucket()
- assert 'Access Denied' == actual_response.text
-
-
-def test_query_bucket_region_none(fixture_invalid_region, fixture_valid_instance_id):
- fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231'
- fixture_invalid_region.return_value = None
- actual_response = deep_learning_container_to_test.query_bucket()
- assert actual_response is None
-
-
-def test_query_bucket_instance_id_none(requests_mock, fixture_valid_region, fixture_none_instance_id):
- fixture_none_instance_id.return_value = None
- fixture_valid_region.return_value = 'us-east-1'
- actual_response = deep_learning_container_to_test.query_bucket()
- assert actual_response is None
-
-
-def test_query_bucket_instance_id_invalid(requests_mock, fixture_valid_region, fixture_invalid_instance_id):
- fixture_invalid_instance_id.return_value = None
- fixture_valid_region.return_value = 'us-east-1'
- actual_response = deep_learning_container_to_test.query_bucket()
- assert actual_response is None
-
-
-def test_HTTP_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id):
- fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231'
- fixture_valid_region.return_value = 'us-east-1'
- query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com'
- '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231')
-
- requests_mock.get(
- query_s3_url,
- exc=requests.exceptions.HTTPError)
- requests_mock.side_effect = requests.exceptions.HTTPError
-
- with pytest.raises(requests.exceptions.HTTPError):
- actual_response = requests.get(query_s3_url)
- assert actual_response is None
-
-
-def test_connection_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id):
- fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231'
- fixture_valid_region.return_value = 'us-east-1'
- query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com'
- '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231')
-
- requests_mock.get(
- query_s3_url,
- exc=requests.exceptions.ConnectionError)
-
- with pytest.raises(requests.exceptions.ConnectionError):
- actual_response = requests.get(
- query_s3_url)
-
- assert actual_response is None
-
-
-def test_timeout_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id):
- fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231'
- fixture_valid_region.return_value = 'us-east-1'
- query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com'
- '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231')
-
- requests_mock.get(
- query_s3_url,
- exc=requests.Timeout)
-
- with pytest.raises(requests.exceptions.Timeout):
- actual_response = requests.get(
- query_s3_url)
-
- assert actual_response is None
-
-
-if __name__ == '__main__':
- unittest.main()
diff --git a/test/unit/test_s3_utils.py b/test/unit/test_s3_utils.py
index 03de70a3..2bd63bf8 100644
--- a/test/unit/test_s3_utils.py
+++ b/test/unit/test_s3_utils.py
@@ -19,30 +19,30 @@
from sagemaker_tensorflow_container import s3_utils
-BUCKET_REGION = 'us-west-2'
-JOB_REGION = 'us-west-1'
-JOB_BUKCET = 'sagemaker-us-west-2-000-00-1'
-PREFIX = 'sagemaker/something'
-MODEL_DIR = 's3://{}/{}'.format(JOB_BUKCET, PREFIX)
+BUCKET_REGION = "us-west-2"
+JOB_REGION = "us-west-1"
+JOB_BUCKET = "sagemaker-us-west-2-000-00-1"
+PREFIX = "sagemaker/something"
+MODEL_DIR = "s3://{}/{}".format(JOB_BUCKET, PREFIX)
-@patch('boto3.client')
+@patch("boto3.client")
def test_configure(client):
s3 = MagicMock()
client.return_value = s3
- loc = {'LocationConstraint': BUCKET_REGION}
+ loc = {"LocationConstraint": BUCKET_REGION}
s3.get_bucket_location.return_value = loc
s3_utils.configure(MODEL_DIR, JOB_REGION)
- assert os.environ['S3_REGION'] == BUCKET_REGION
- assert os.environ['TF_CPP_MIN_LOG_LEVEL'] == '1'
- assert os.environ['S3_USE_HTTPS'] == '1'
+ assert os.environ["S3_REGION"] == BUCKET_REGION
+ assert os.environ["TF_CPP_MIN_LOG_LEVEL"] == "1"
+ assert os.environ["S3_USE_HTTPS"] == "1"
def test_configure_local_dir():
- s3_utils.configure('/opt/ml/model', JOB_REGION)
+ s3_utils.configure("/opt/ml/model", JOB_REGION)
- assert os.environ['S3_REGION'] == JOB_REGION
- assert os.environ['TF_CPP_MIN_LOG_LEVEL'] == '1'
- assert os.environ['S3_USE_HTTPS'] == '1'
+ assert os.environ["S3_REGION"] == JOB_REGION
+ assert os.environ["TF_CPP_MIN_LOG_LEVEL"] == "1"
+ assert os.environ["S3_USE_HTTPS"] == "1"
diff --git a/test/unit/test_training.py b/test/unit/test_training.py
index b69beed2..2795af44 100644
--- a/test/unit/test_training.py
+++ b/test/unit/test_training.py
@@ -17,32 +17,32 @@
from mock import MagicMock, patch
import pytest
-from sagemaker_containers.beta.framework import runner
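+# sagemaker-training is the successor of the sagemaker-containers library; these tests now target its entry_point/runner API.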
+from sagemaker_training import runner
import tensorflow as tf
from sagemaker_tensorflow_container import training
-MODULE_DIR = 's3://my/bucket'
-MODULE_NAME = 'script_name'
-LOG_LEVEL = 'Debug'
-HOST1 = 'host1'
-HOST2 = 'host2'
+MODULE_DIR = "s3://my/bucket"
+MODULE_NAME = "script_name"
+LOG_LEVEL = "Debug"
+HOST1 = "host1"
+HOST2 = "host2"
HOST_LIST = [HOST1, HOST2]
CURRENT_HOST = HOST1
-CMD_ARGS = {'some_key': 'some_value'}
+CMD_ARGS = {"some_key": "some_value"}
CLUSTER_WITH_PS = {
- 'master': ['{}:2222'.format(HOST1)],
- 'worker': ['{}:2222'.format(HOST2)],
- 'ps': ['{}:2223'.format(HOST1), '{}:2223'.format(HOST2)]
+ "master": ["{}:2222".format(HOST1)],
+ "worker": ["{}:2222".format(HOST2)],
+ "ps": ["{}:2223".format(HOST1), "{}:2223".format(HOST2)],
}
-MASTER_TASK = {'index': 0, 'type': 'master'}
-WORKER_TASK = {'index': 0, 'type': 'worker'}
-PS_TASK_1 = {'index': 0, 'type': 'ps'}
-PS_TASK_2 = {'index': 1, 'type': 'ps'}
-MODEL_DIR = 's3://bucket/prefix'
-MODEL_DIR_CMD_LIST = ['--model_dir', MODEL_DIR]
-REGION = 'us-west-2'
-RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', 'resources')
+MASTER_TASK = {"index": 0, "type": "master"}
+WORKER_TASK = {"index": 0, "type": "worker"}
+PS_TASK_1 = {"index": 0, "type": "ps"}
+PS_TASK_2 = {"index": 1, "type": "ps"}
+MODEL_DIR = "s3://bucket/prefix"
+MODEL_DIR_CMD_LIST = ["--model_dir", MODEL_DIR]
+REGION = "us-west-2"
+RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "resources")
@pytest.fixture
@@ -50,9 +50,7 @@ def distributed_training_env():
env = simple_training_env()
env.hosts = HOST_LIST
- env.additional_framework_parameters = {
- training.SAGEMAKER_PARAMETER_SERVER_ENABLED: True
- }
+ env.additional_framework_parameters = {training.SAGEMAKER_PARAMETER_SERVER_ENABLED: True}
return env
@@ -65,187 +63,238 @@ def simple_training_env():
env = MagicMock()
env.module_dir = MODULE_DIR
env.user_entry_point = MODULE_NAME
- env.hyperparameters = {'model_dir': MODEL_DIR}
+ env.hyperparameters = {"model_dir": MODEL_DIR}
env.log_level = LOG_LEVEL
env.additional_framework_parameters = {}
env.hosts = CURRENT_HOST
env.current_host = CURRENT_HOST
env.to_env_vars = lambda: {}
- env.job_name = 'test-training-job'
+ env.job_name = "test-training-job"
return env
def test_is_host_master():
assert training._is_host_master(HOST_LIST, CURRENT_HOST) is True
- assert training._is_host_master(HOST_LIST, 'host2') is False
- assert training._is_host_master(HOST_LIST, 'somehost') is False
+ assert training._is_host_master(HOST_LIST, "host2") is False
+ assert training._is_host_master(HOST_LIST, "somehost") is False
-@patch('sagemaker_containers.beta.framework.entry_point.run')
+@patch("sagemaker_training.entry_point.run")
def test_single_machine(run_module, single_machine_training_env):
training.train(single_machine_training_env, MODEL_DIR_CMD_LIST)
- run_module.assert_called_with(MODULE_DIR, MODULE_NAME, MODEL_DIR_CMD_LIST,
- single_machine_training_env.to_env_vars(),
- runner=runner.ProcessRunnerType)
+ run_module.assert_called_with(
+ uri=MODULE_DIR,
+ user_entry_point=MODULE_NAME,
+ args=MODEL_DIR_CMD_LIST,
+ env_vars=single_machine_training_env.to_env_vars(),
+ capture_error=True,
+ runner_type=runner.ProcessRunnerType,
+ )
-@patch('sagemaker_containers.beta.framework.entry_point.run')
+@patch("sagemaker_training.entry_point.run")
def test_train_horovod(run_module, single_machine_training_env):
- single_machine_training_env.additional_framework_parameters['sagemaker_mpi_enabled'] = True
+ single_machine_training_env.additional_framework_parameters["sagemaker_mpi_enabled"] = True
training.train(single_machine_training_env, MODEL_DIR_CMD_LIST)
- run_module.assert_called_with(MODULE_DIR, MODULE_NAME, MODEL_DIR_CMD_LIST,
- single_machine_training_env.to_env_vars(),
- runner=runner.MPIRunnerType)
-
-
-@pytest.mark.skipif(sys.version_info.major != 3,
- reason="Skip this for python 2 because of dict key order mismatch")
-@patch('tensorflow.train.ClusterSpec')
-@patch('tensorflow.train.Server')
-@patch('sagemaker_containers.beta.framework.entry_point.run')
-@patch('multiprocessing.Process', lambda target: target())
-@patch('time.sleep', MagicMock())
+ run_module.assert_called_with(
+ uri=MODULE_DIR,
+ user_entry_point=MODULE_NAME,
+ args=MODEL_DIR_CMD_LIST,
+ env_vars=single_machine_training_env.to_env_vars(),
+ capture_error=True,
+ runner_type=runner.MPIRunnerType,
+ )
+
+
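+# skip_on_pipeline is a custom marker (presumably registered in the test configuration) that excludes these cases from pipeline runs.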
+@pytest.mark.skip_on_pipeline
+@pytest.mark.skipif(
+ sys.version_info.major != 3, reason="Skip this for python 2 because of dict key order mismatch"
+)
+@patch("tensorflow.train.ClusterSpec")
+@patch("tensorflow.train.Server")
+@patch("sagemaker_training.entry_point.run")
+@patch("multiprocessing.Process", lambda target: target())
+@patch("time.sleep", MagicMock())
def test_train_distributed_master(run, tf_server, cluster_spec, distributed_training_env):
training.train(distributed_training_env, MODEL_DIR_CMD_LIST)
- cluster_spec.assert_called_with({'worker': ['host2:2222'],
- 'master': ['host1:2222'],
- 'ps': ['host1:2223', 'host2:2223']})
+ cluster_spec.assert_called_with(
+ {"worker": ["host2:2222"], "master": ["host1:2222"], "ps": ["host1:2223", "host2:2223"]}
+ )
tf_server.assert_called_with(
- cluster_spec(), job_name='ps', task_index=0, config=tf.ConfigProto(device_count={'GPU': 0})
+ cluster_spec(), job_name="ps", task_index=0, config=tf.ConfigProto(device_count={"GPU": 0})
)
tf_server().join.assert_called_with()
- tf_config = '{"cluster": {' \
- '"master": ["host1:2222"], ' \
- '"ps": ["host1:2223", "host2:2223"], ' \
- '"worker": ["host2:2222"]}, ' \
- '"environment": "cloud", ' \
- '"task": {"index": 0, "type": "master"}}'
+ tf_config = (
+ '{"cluster": {'
+ '"master": ["host1:2222"], '
+ '"ps": ["host1:2223", "host2:2223"], '
+ '"worker": ["host2:2222"]}, '
+ '"environment": "cloud", '
+ '"task": {"index": 0, "type": "master"}}'
+ )
- run.assert_called_with('s3://my/bucket', 'script_name', MODEL_DIR_CMD_LIST,
- {'TF_CONFIG': tf_config})
+ run.assert_called_with(
+ uri="s3://my/bucket",
+ user_entry_point="script_name",
+ args=MODEL_DIR_CMD_LIST,
+ env_vars={"TF_CONFIG": tf_config},
+ capture_error=True,
+ )
-@pytest.mark.skipif(sys.version_info.major != 3,
- reason="Skip this for python 2 because of dict key order mismatch")
-@patch('tensorflow.train.ClusterSpec')
-@patch('tensorflow.train.Server')
-@patch('sagemaker_containers.beta.framework.entry_point.run')
-@patch('multiprocessing.Process', lambda target: target())
-@patch('time.sleep', MagicMock())
+@pytest.mark.skip_on_pipeline
+@pytest.mark.skipif(
+ sys.version_info.major != 3, reason="Skip this for python 2 because of dict key order mismatch"
+)
+@patch("tensorflow.train.ClusterSpec")
+@patch("tensorflow.train.Server")
+@patch("sagemaker_training.entry_point.run")
+@patch("multiprocessing.Process", lambda target: target())
+@patch("time.sleep", MagicMock())
def test_train_distributed_worker(run, tf_server, cluster_spec, distributed_training_env):
distributed_training_env.current_host = HOST2
training.train(distributed_training_env, MODEL_DIR_CMD_LIST)
- cluster_spec.assert_called_with({'worker': ['host2:2222'],
- 'master': ['host1:2222'],
- 'ps': ['host1:2223', 'host2:2223']})
+ cluster_spec.assert_called_with(
+ {"worker": ["host2:2222"], "master": ["host1:2222"], "ps": ["host1:2223", "host2:2223"]}
+ )
tf_server.assert_called_with(
- cluster_spec(), job_name='ps', task_index=1, config=tf.ConfigProto(device_count={'GPU': 0})
+ cluster_spec(), job_name="ps", task_index=1, config=tf.ConfigProto(device_count={"GPU": 0})
)
tf_server().join.assert_called_with()
- tf_config = '{"cluster": {' \
- '"master": ["host1:2222"], ' \
- '"ps": ["host1:2223", "host2:2223"], ' \
- '"worker": ["host2:2222"]}, ' \
- '"environment": "cloud", ' \
- '"task": {"index": 0, "type": "worker"}}'
+ tf_config = (
+ '{"cluster": {'
+ '"master": ["host1:2222"], '
+ '"ps": ["host1:2223", "host2:2223"], '
+ '"worker": ["host2:2222"]}, '
+ '"environment": "cloud", '
+ '"task": {"index": 0, "type": "worker"}}'
+ )
- run.assert_called_with('s3://my/bucket', 'script_name', MODEL_DIR_CMD_LIST,
- {'TF_CONFIG': tf_config})
+ run.assert_called_with(
+ uri="s3://my/bucket",
+ user_entry_point="script_name",
+ args=MODEL_DIR_CMD_LIST,
+ env_vars={"TF_CONFIG": tf_config},
+ capture_error=True,
+ )
-@patch('sagemaker_containers.beta.framework.entry_point.run')
+@patch("sagemaker_training.entry_point.run")
def test_train_distributed_no_ps(run, distributed_training_env):
distributed_training_env.additional_framework_parameters[
- training.SAGEMAKER_PARAMETER_SERVER_ENABLED] = False
+ training.SAGEMAKER_PARAMETER_SERVER_ENABLED
+ ] = False
distributed_training_env.current_host = HOST2
training.train(distributed_training_env, MODEL_DIR_CMD_LIST)
- run.assert_called_with(MODULE_DIR, MODULE_NAME, MODEL_DIR_CMD_LIST,
- distributed_training_env.to_env_vars(), runner=runner.ProcessRunnerType)
+ run.assert_called_with(
+ uri=MODULE_DIR,
+ user_entry_point=MODULE_NAME,
+ args=MODEL_DIR_CMD_LIST,
+ env_vars=distributed_training_env.to_env_vars(),
+ capture_error=True,
+ runner_type=runner.ProcessRunnerType,
+ )
def test_build_tf_config():
assert training._build_tf_config(HOST_LIST, HOST1) == {
- 'cluster': CLUSTER_WITH_PS,
- 'environment': 'cloud',
- 'task': MASTER_TASK
+ "cluster": CLUSTER_WITH_PS,
+ "environment": "cloud",
+ "task": MASTER_TASK,
}
assert training._build_tf_config(HOST_LIST, HOST1, ps_task=True) == {
- 'cluster': CLUSTER_WITH_PS,
- 'environment': 'cloud',
- 'task': PS_TASK_1
+ "cluster": CLUSTER_WITH_PS,
+ "environment": "cloud",
+ "task": PS_TASK_1,
}
assert training._build_tf_config(HOST_LIST, HOST2) == {
- 'cluster': CLUSTER_WITH_PS,
- 'environment': 'cloud',
- 'task': WORKER_TASK
+ "cluster": CLUSTER_WITH_PS,
+ "environment": "cloud",
+ "task": WORKER_TASK,
}
assert training._build_tf_config(HOST_LIST, HOST2, ps_task=True) == {
- 'cluster': CLUSTER_WITH_PS,
- 'environment': 'cloud',
- 'task': PS_TASK_2}
+ "cluster": CLUSTER_WITH_PS,
+ "environment": "cloud",
+ "task": PS_TASK_2,
+ }
def test_build_tf_config_error():
with pytest.raises(ValueError) as error:
training._build_tf_config([HOST1], HOST1, ps_task=True)
- assert 'Cannot have a ps task if there are no parameter servers in the cluster' in str(error.value)
+ assert "Cannot have a ps task if there are no parameter servers in the cluster" in str(
+ error.value
+ )
-@patch('sagemaker_tensorflow_container.training.logger')
+@patch("sagemaker_tensorflow_container.training.logger")
def test_log_model_missing_warning_no_model(logger):
- path = os.path.join(RESOURCE_PATH, 'test_dir_empty')
+ path = os.path.join(RESOURCE_PATH, "test_dir_empty")
if not os.path.exists(path):
os.mkdir(path)
training._log_model_missing_warning(path)
- logger.warn.assert_called_with('No model artifact is saved under path {}.'
- ' Your training job will not save any model files to S3.\n'
- 'For details of how to construct your training script see:\n'
- 'https://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script' # noqa
- .format(path))
+ logger.warn.assert_called_with(
+ "No model artifact is saved under path {}."
+ " Your training job will not save any model files to S3.\n"
+ "For details of how to construct your training script see:\n"
+ "https://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script".format( # noqa
+ path
+ )
+ )
-@patch('sagemaker_tensorflow_container.training.logger')
+@patch("sagemaker_tensorflow_container.training.logger")
def test_log_model_missing_warning_wrong_format(logger):
- training._log_model_missing_warning(os.path.join(RESOURCE_PATH, 'test_dir_wrong_model'))
- logger.warn.assert_called_with('Your model will NOT be servable with SageMaker TensorFlow Serving container. '
- 'The model artifact was not saved in the TensorFlow '
- 'SavedModel directory structure:\n'
- 'https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory')
+ training._log_model_missing_warning(os.path.join(RESOURCE_PATH, "test_dir_wrong_model"))
+ logger.warn.assert_called_with(
+ "Your model will NOT be servable with SageMaker TensorFlow Serving container. "
+ "The model artifact was not saved in the TensorFlow "
+ "SavedModel directory structure:\n"
+ "https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory"
+ )
-@patch('sagemaker_tensorflow_container.training.logger')
+@patch("sagemaker_tensorflow_container.training.logger")
def test_log_model_missing_warning_wrong_parent_dir(logger):
- training._log_model_missing_warning(os.path.join(RESOURCE_PATH, 'test_dir_wrong_parent_dir'))
- logger.warn.assert_called_with('Your model will NOT be servable with SageMaker TensorFlow Serving containers. '
- 'The SavedModel bundle is under directory \"{}\", not a numeric name.'
- .format('not-digit'))
+ training._log_model_missing_warning(os.path.join(RESOURCE_PATH, "test_dir_wrong_parent_dir"))
+ logger.warn.assert_called_with(
+ "Your model will NOT be servable with SageMaker TensorFlow Serving containers. "
+ 'The SavedModel bundle is under directory "{}", not a numeric name.'.format("not-digit")
+ )
-@patch('sagemaker_tensorflow_container.training.logger')
+@patch("sagemaker_tensorflow_container.training.logger")
def test_log_model_missing_warning_correct(logger):
- training._log_model_missing_warning(os.path.join(RESOURCE_PATH, 'test_dir_correct_model'))
+ training._log_model_missing_warning(os.path.join(RESOURCE_PATH, "test_dir_correct_model"))
logger.warn.assert_not_called()
-@patch('sagemaker_tensorflow_container.training.logger')
-@patch('sagemaker_tensorflow_container.training.train')
-@patch('logging.Logger.setLevel')
-@patch('sagemaker_containers.beta.framework.training_env')
-@patch('sagemaker_containers.beta.framework.env.read_hyperparameters', return_value={})
-@patch('sagemaker_tensorflow_container.s3_utils.configure')
-def test_main(configure_s3_env, read_hyperparameters, training_env,
- set_level, train, logger, single_machine_training_env):
+@patch("sagemaker_tensorflow_container.training.logger")
+@patch("sagemaker_tensorflow_container.training.train")
+@patch("logging.Logger.setLevel")
+@patch("sagemaker_training.environment.Environment")
+@patch("sagemaker_training.environment.read_hyperparameters", return_value={})
+@patch("sagemaker_tensorflow_container.s3_utils.configure")
+def test_main(
+ configure_s3_env,
+ read_hyperparameters,
+ training_env,
+ set_level,
+ train,
+ logger,
+ single_machine_training_env,
+):
training_env.return_value = single_machine_training_env
- os.environ['SAGEMAKER_REGION'] = REGION
+ os.environ["SAGEMAKER_REGION"] = REGION
training.main()
read_hyperparameters.assert_called_once_with()
training_env.assert_called_once_with(hyperparameters={})
@@ -253,46 +302,71 @@ def test_main(configure_s3_env, read_hyperparameters, training_env,
configure_s3_env.assert_called_once()
-@patch('sagemaker_tensorflow_container.training.logger')
-@patch('sagemaker_tensorflow_container.training.train')
-@patch('logging.Logger.setLevel')
-@patch('sagemaker_containers.beta.framework.training_env')
-@patch('sagemaker_containers.beta.framework.env.read_hyperparameters', return_value={'model_dir': MODEL_DIR})
-@patch('sagemaker_tensorflow_container.s3_utils.configure')
-def test_main_simple_training_model_dir(configure_s3_env, read_hyperparameters, training_env,
- set_level, train, logger, single_machine_training_env):
+@patch("sagemaker_tensorflow_container.training.logger")
+@patch("sagemaker_tensorflow_container.training.train")
+@patch("logging.Logger.setLevel")
+@patch("sagemaker_training.environment.Environment")
+@patch("sagemaker_training.environment.read_hyperparameters", return_value={"model_dir": MODEL_DIR})
+@patch("sagemaker_tensorflow_container.s3_utils.configure")
+def test_main_simple_training_model_dir(
+ configure_s3_env,
+ read_hyperparameters,
+ training_env,
+ set_level,
+ train,
+ logger,
+ single_machine_training_env,
+):
training_env.return_value = single_machine_training_env
- os.environ['SAGEMAKER_REGION'] = REGION
+ os.environ["SAGEMAKER_REGION"] = REGION
training.main()
configure_s3_env.assert_called_once_with(MODEL_DIR, REGION)
-@patch('sagemaker_tensorflow_container.training.logger')
-@patch('sagemaker_tensorflow_container.training.train')
-@patch('logging.Logger.setLevel')
-@patch('sagemaker_containers.beta.framework.training_env')
-@patch('sagemaker_containers.beta.framework.env.read_hyperparameters', return_value={'model_dir': MODEL_DIR,
- '_tuning_objective_metric': 'auc'})
-@patch('sagemaker_tensorflow_container.s3_utils.configure')
-def test_main_tuning_model_dir(configure_s3_env, read_hyperparameters, training_env,
- set_level, train, logger, single_machine_training_env):
+@patch("sagemaker_tensorflow_container.training.logger")
+@patch("sagemaker_tensorflow_container.training.train")
+@patch("logging.Logger.setLevel")
+@patch("sagemaker_training.environment.Environment")
+@patch(
+ "sagemaker_training.environment.read_hyperparameters",
+ return_value={"model_dir": MODEL_DIR, "_tuning_objective_metric": "auc"},
+)
+@patch("sagemaker_tensorflow_container.s3_utils.configure")
+def test_main_tuning_model_dir(
+ configure_s3_env,
+ read_hyperparameters,
+ training_env,
+ set_level,
+ train,
+ logger,
+ single_machine_training_env,
+):
training_env.return_value = single_machine_training_env
- os.environ['SAGEMAKER_REGION'] = REGION
+ os.environ["SAGEMAKER_REGION"] = REGION
training.main()
- expected_model_dir = '{}/{}/model'.format(MODEL_DIR, single_machine_training_env.job_name)
+ expected_model_dir = "{}/{}/model".format(MODEL_DIR, single_machine_training_env.job_name)
configure_s3_env.assert_called_once_with(expected_model_dir, REGION)
-@patch('sagemaker_tensorflow_container.training.logger')
-@patch('sagemaker_tensorflow_container.training.train')
-@patch('logging.Logger.setLevel')
-@patch('sagemaker_containers.beta.framework.training_env')
-@patch('sagemaker_containers.beta.framework.env.read_hyperparameters', return_value={'model_dir': '/opt/ml/model',
- '_tuning_objective_metric': 'auc'})
-@patch('sagemaker_tensorflow_container.s3_utils.configure')
-def test_main_tuning_mpi_model_dir(configure_s3_env, read_hyperparameters, training_env,
- set_level, train, logger, single_machine_training_env):
+@patch("sagemaker_tensorflow_container.training.logger")
+@patch("sagemaker_tensorflow_container.training.train")
+@patch("logging.Logger.setLevel")
+@patch("sagemaker_training.environment.Environment")
+@patch(
+ "sagemaker_training.environment.read_hyperparameters",
+ return_value={"model_dir": "/opt/ml/model", "_tuning_objective_metric": "auc"},
+)
+@patch("sagemaker_tensorflow_container.s3_utils.configure")
+def test_main_tuning_mpi_model_dir(
+ configure_s3_env,
+ read_hyperparameters,
+ training_env,
+ set_level,
+ train,
+ logger,
+ single_machine_training_env,
+):
training_env.return_value = single_machine_training_env
- os.environ['SAGEMAKER_REGION'] = REGION
+ os.environ["SAGEMAKER_REGION"] = REGION
training.main()
- configure_s3_env.assert_called_once_with('/opt/ml/model', REGION)
+ configure_s3_env.assert_called_once_with("/opt/ml/model", REGION)
diff --git a/test/resources/test_py_version/entry.py b/test/utils/__init__.py
similarity index 67%
rename from test/resources/test_py_version/entry.py
rename to test/utils/__init__.py
index 8f71a01b..79cb9cdf 100644
--- a/test/resources/test_py_version/entry.py
+++ b/test/utils/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
@@ -11,12 +11,3 @@
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import
-
-import os
-import sys
-
-
-py_version = '%s.%s' % (sys.version_info.major, sys.version_info.minor)
-
-with open(os.path.join(os.environ['SM_OUTPUT_DIR'], 'py_version'), 'a') as f:
- f.write(py_version)
diff --git a/test/utils/image_utils.py b/test/utils/image_utils.py
new file mode 100644
index 00000000..9fe5b590
--- /dev/null
+++ b/test/utils/image_utils.py
@@ -0,0 +1,72 @@
+# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+import subprocess
+import sys
+
+CYAN_COLOR = "\033[36m"
+END_COLOR = "\033[0m"
+DLC_AWS_ID = "763104351884"
+
+
+def build_image(framework_version, dockerfile, image_uri, region, cwd="."):
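+    # Build an sdist of the toolkit first; the test Dockerfiles are expected to install it into the image.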
+ _check_call("python setup.py sdist")
+
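+    # Dockerfiles based on AWS Deep Learning Containers pull their base image from the shared DLC ECR registry, which requires a login.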
+ if "dlc" in dockerfile:
+ ecr_login(region, DLC_AWS_ID)
+
+ dockerfile_location = os.path.join("test", "container", framework_version, dockerfile)
+
+ subprocess.check_call(
+ [
+ "docker",
+ "build",
+ "-t",
+ image_uri,
+ "-f",
+ dockerfile_location,
+ "--build-arg",
+ "region={}".format(region),
+ cwd,
+ ],
+ cwd=cwd,
+ )
+ print("created image {}".format(image_uri))
+ return image_uri
+
+
+def push_image(ecr_image, region, aws_id):
+ ecr_login(region, aws_id)
+ _check_call("docker push {}".format(ecr_image))
+
+
+def ecr_login(region, aws_id):
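+    # "aws ecr get-login" is an AWS CLI v1 command; AWS CLI v2 replaced it with "aws ecr get-login-password".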
+ login = _check_call(
+ "aws ecr get-login --registry-ids {} ".format(aws_id)
+ + "--no-include-email --region {}".format(region)
+ )
+ _check_call(login.decode("utf-8").rstrip("\n"))
+
+
+def _check_call(cmd, *popenargs, **kwargs):
+ if isinstance(cmd, str):
+ cmd = cmd.split(" ")
+ _print_cmd(cmd)
+ return subprocess.check_output(cmd, *popenargs, **kwargs)
+
+
+def _print_cmd(cmd):
+    print("executing command: {}{}{}".format(CYAN_COLOR, " ".join(cmd), END_COLOR))
+ sys.stdout.flush()
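+
+
+# Hypothetical usage from an integration-test fixture (all names and versions below are illustrative):
+#   image = build_image("2.1.0", "Dockerfile.cpu", "sagemaker-tensorflow-training:2.1.0-cpu", "us-west-2")
+#   push_image(image, "us-west-2", "123456789012")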
diff --git a/tox.ini b/tox.ini
index b4f6fbb0..17ed3095 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@
# and then run "tox" from this directory.
[tox]
-envlist = py27,py36,flake8
+envlist = py27,py36,py37,flake8
skip_missing_interpreters = False
[travis]
@@ -27,6 +27,8 @@ exclude =
benchmarks/
max-complexity = 10
ignore =
+ C901,
+ E203, # whitespace before ':': Black disagrees with and explicitly violates this.
FI10,
FI12,
FI13,
@@ -43,7 +45,7 @@ ignore =
FI55,
FI56,
FI57,
- E722
+ W503
require-code = True
@@ -61,7 +63,6 @@ passenv =
commands =
coverage run --rcfile .coveragerc_{envname} --source sagemaker_tensorflow_container -m py.test {posargs}
{env:IGNORE_COVERAGE:} coverage report --include *sagemaker_tensorflow_container* --show-missing
-deps = sagemaker-containers
extras = test
[testenv:flake8]