diff --git a/.coveragerc_py37 b/.coveragerc_py37 new file mode 100644 index 00000000..96bb72bf --- /dev/null +++ b/.coveragerc_py37 @@ -0,0 +1,20 @@ +[run] +branch = True +timid = True + +[report] +exclude_lines = + pragma: no cover + pragma: py3 no cover + if six.PY2 + elif six.PY2 + +partial_branches = + pragma: no cover + pragma: py3 no cover + if six.PY3 + elif six.PY3 + +show_missing = True + +fail_under = 90 diff --git a/.flake8 b/.flake8 index a87e2f9f..83270830 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,3 @@ [flake8] -application_import_names = sagemaker_tensorflow_container, test, timeout, utils +application_import_names = image_utils, integration, sagemaker_tensorflow_container, test, timeout, utils import-order-style = google diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..978cf8cf --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,31 @@ +--- +name: Bug report +about: File a report to help us reproduce and fix the problem +title: '' +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To reproduce** +A clear, step-by-step set of instructions to reproduce the bug. + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots or logs** +If applicable, add screenshots or logs to help explain your problem. + +**System information** +A description of your system. Please provide: +- **Toolkit version**: +- **Framework version**: +- **Python version**: +- **CPU or GPU**: +- **Custom Docker image (Y/N)**: + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..9df79c90 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,5 @@ +blank_issues_enabled: false +contact_links: + - name: Ask a question + url: https://stackoverflow.com/questions/tagged/amazon-sagemaker + about: Use Stack Overflow to ask and answer questions diff --git a/.github/ISSUE_TEMPLATE/documentation-request.md b/.github/ISSUE_TEMPLATE/documentation-request.md new file mode 100644 index 00000000..b64cd478 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation-request.md @@ -0,0 +1,17 @@ +--- +name: Documentation request +about: Request improved documentation +title: '' +labels: '' +assignees: '' + +--- + +**What did you find confusing? Please describe.** +A clear and concise description of what you found confusing. Ex. I tried to [...] but I didn't understand how to [...] + +**Describe how documentation can be improved** +A clear and concise description of where documentation was lacking and how it can be improved. + +**Additional context** +Add any other context or screenshots about the documentation request here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..bff1cb4e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest new functionality for this toolkit +title: '' +labels: '' +assignees: '' + +--- + +**Describe the feature you'd like** +A clear and concise description of the functionality you want. + +**How would this feature be used? Please describe.** +A clear and concise description of the use case for this feature. Please provide an example, if possible. 
+ +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/CHANGELOG.md b/CHANGELOG.md index af391c6a..58039444 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,359 @@ # Changelog +## v10.1.8 (2020-12-08) + +### Bug Fixes and Other Changes + + * workaround to print stderr when capture_error is True + +## v10.1.7 (2020-11-06) + +### Bug Fixes and Other Changes + + * propagate log level + +## v10.1.6 (2020-10-15) + +### Bug Fixes and Other Changes + + * add condition to avoid error when 'model_dir' is None + +## v10.1.5 (2020-08-23) + +### Bug Fixes and Other Changes + + * call entry_point.run with capture_error=True + +## v10.1.4.post4 (2020-07-01) + +### Testing and Release Infrastructure + + * add integration test for MPI env vars propagation + +## v10.1.4.post3 (2020-06-29) + +### Testing and Release Infrastructure + + * add issue templates + +## v10.1.4.post2 (2020-06-18) + +### Documentation Changes + + * remove confusing information from the Readme. + +### Testing and Release Infrastructure + + * add single-instance, multi-process Horovod test for local GPU + +## v10.1.4.post1 (2020-06-11) + +### Testing and Release Infrastructure + + * Rename buildspec files. + +## v10.1.4.post0 (2020-06-10) + +### Documentation Changes + + * remove functional test info from branch + * Update README.rst + +### Testing and Release Infrastructure + + * Make docker folder read only, remove unused tests. + +## v10.1.4 (2020-06-10) + +### Bug Fixes and Other Changes + + * bump version of sagemaker-training for script entry point fix. + +## v10.1.3 (2020-05-12) + +### Bug Fixes and Other Changes + + * Bump version of sagemaker-training for typing fix + +### Testing and Release Infrastructure + + * remove unused build scripts. + +## v10.1.2 (2020-05-05) + +### Bug Fixes and Other Changes + + * Add py37 to sm tests + +## v10.1.1 (2020-05-04) + +### Bug Fixes and Other Changes + + * remove sagemaker pysdk, keras_applications and keras_preprocessing in docker files + * Fix sm integration issues + * add dockerfiles for tf 1.15.2 py37 containers + +## v10.1.0 (2020-04-29) + +### Features + + * Python 3.7 support + +### Testing and Release Infrastructure + + * Fix buildspecs + +## v10.0.0 (2020-04-27) + +### Breaking Changes + + * Replace sagemaker-containers with sagemaker-training + +### Testing and Release Infrastructure + + * remove CHANGELOG entries from failed builds + * bump version to prepare for new version scheme + * add training script to benchmark directory + * skip image push in PR build if no changes + +## v2.3.2 (2020-04-07) + +### Bug Fixes and Other Changes + + * Bump smdebug version + +## v2.3.1 (2020-04-06) + +### Bug Fixes and Other Changes + + * updating pillow version of tf1.15 + +## v2.3.0 (2020-04-02) + +### Features + + * install sagemaker-tensorflow-toolkit from PyPI. + +## v2.2.8 (2020-04-01) + +### Bug Fixes and Other Changes + + * Allowing arguments for deep_learning_container.py for tf1.15 + +## v2.2.7.post0 (2020-03-31) + +### Testing and Release Infrastructure + + * refactor toolkit tests. 
+ +## v2.2.7 (2020-03-26) + +### Bug Fixes and Other Changes + + * Adding of deep_learning_container.py in Tf1.15 + +## v2.2.6 (2020-03-16) + +### Bug Fixes and Other Changes + + * smdebug 0.7.1 + * Added marker to skip on pipeline + +## v2.2.5 (2020-03-12) + +### Bug Fixes and Other Changes + + * install smexperiments when python >= 3.6 + * SM integration test for TF 1.x + * upgrade to latest sagemaker-experiments + * Added pytest fixture + +## v2.2.4 (2020-03-11) + +### Bug Fixes and Other Changes + + * update smdebug wheel + * Revert "Update smdebug to 0.7.0 - TF 1.15.2 (#298)" + +## v2.2.3 (2020-03-10) + +### Bug Fixes and Other Changes + + * update smdebug wheel + * Update smdebug to 0.7.0 - TF 1.15.2 + * install SageMaker Python SDK into Python 3 images + +## v2.2.2.post0 (2020-03-05) + +### Testing and Release Infrastructure + + * fix PR build + +## v2.2.2 (2020-02-20) + +### Bug Fixes and Other Changes + + * copy all tests to test-toolkit folder. + +## v2.2.1 (2020-02-17) + +### Bug Fixes and Other Changes + + * update: update r1.15.2 dockerfiles + +## v2.2.0 (2020-02-13) + +### Features + + * Add release to PyPI. Change package name to sagemaker-tensorflow-training. + +### Bug Fixes and Other Changes + + * pin awscli to latest version + * Pin awscli to latest + * bump smdebug version to 0.5.0.post0 + * update: Update awscli version and remove related pins + * update: Update buildspec for TF 1.15.0 + * update copyright year in license header + +### Documentation Changes + + * update README.rst + * Add link to TF 2.0 branch + +### Testing and Release Infrastructure + + * Add twine check during PR. + * properly fail build if has-matching-changes fails + * properly fail build if has-matching-changes fails + +## v0.1.0 (2020-02-12) + +### Features + + * Add release to PyPI. Change package name to sagemaker-tensorflow-training. + +### Bug Fixes and Other Changes + + * pin awscli to latest version + * Pin awscli to latest + * bump smdebug version to 0.5.0.post0 + * update: Update awscli version and remove related pins + * update: Update buildspec for TF 1.15.0 + * update copyright year in license header + * update: Release TF 1.15.0 dockerfiles + * use regional endpoint for STS in builds + * update documentation link in warning message + * update instance type region availability. + * license file was missing from root of repo. 
+ * install tensorflow<2.0 + * merge dockerfiles + * move script mode branch to master + * use last legacy mode version for --framework-version test arg default + * Pin pytest and pluggy to work around configparser error + * Use multiprocessing.Process to launch parameter server + * increase grpc message size limit to 2gb + * Fix typo in serving method name + * restore python-dev package in image + * Add default tag to functional tests + * update link to correct docs + * Add EI Dockerfile for 1.11 + * Add EI documentation within README + * add Dockerfile for EI + * Use get_closest_marker instead of get_marker + * Add docker files of TF 1.12 + * Default GRPC timeout for EI & Allow timeout to be configurable + * remove requests from test dependencies + * catch RpcError due to change in GRPC + * Icywang86rui gpu fix + * Read port range from container support for TFS port + * Unfreeze requests version + * export_saved_model: copy asset files + * add port to dockerfile + * Updated TF Pipe Mode Version + * Fix MKL setting + * Set MKL vars plus tests + * increase test timeout + * Add back https to S3 + * Add 1.11.0 CPU and GPU Dockerfile + * pin requests version + * fix memory leak in serving + * Update region in s3 boto client in serve + * Update readme with instructions for 1.9.0 and above + * Fix deserialization of dicts for json predict requests + * Add dockerfile and update test for tensorflow 1.10.0 + * Support tensorflow 1.9.0 + * Add integ tests to verify that tensorflow in gpu-image can access gpu-devices. + * train on 3 epochs for pipe mode test + * Change error classes used by _default_input_fn() and _default_output_fn() + * Changing assertion to check only existence + * Install sagemaker-tensorflow from pypi. Add MKL environment variables for TF 1.8 + * get most recent saved model to export + * pip install tensorflow 1.8 in 1.8 cpu image + * install tensorflow extensions + * upgrade cpu binaries in docker build + * Force upgrade of the framework binaries to make sure the right binaries are installed. + * Add Pillow to pip install list + * Increase train steps for cifar distributed test to mitigate race condition + * Add TensorFlow 1.8 dockerfiles + * Add TensorFlow 1.7 dockerfiles + * Explain how to download tf binaries from PyPI + * Allow training without S3 + * Fix hyperparameter name for detecting a tuning job + * Checkout v1.4.1 tag instead of r1.4 branch + * Move processing of requirements file in. + * Generate checkpoint path using TRAINING_JOB_NAME environment variable if needed + * Wrap user-provided model_fn to pass arguments positionally (maintains compatibility with existing behavior) + * Add more unit tests for trainer, fix __all__ and rename train.py to avoid import conflict + * Use regional endpoint for S3 client + * Update README.rst + * Pass input_channels to eval_input_fn if defined + * Fix setup.py to refer to renamed README + * Add test and build instructions + * Fix year in license headers + * Add TensorFlow 1.6 + * Add test instructions in README + * Add container support to install_requires + * Add Apache license headers + * Use wget to install tensorflow-model-server + * Fix file path for integ test + * Fix s3_prefix path in integ test + * Fix typo in path for integ test + * Add input_channels to train_input_fn interface. + * Update logging and make serving_input_fn optional. 
+ * remove pip install in tensorflow training + * Modify integration tests to run nvidia-docker for gpu + * add h5py for keras models + * Add local integ tests & resources + * Restructure repo to use a directory per TF version for dockerfiles + * Rename "feature_map" variables to "feature_dict" to avoid overloading it with the ML term "feature map" + * Copying in changes from internal repo: + * Add functional test + * Fix FROM image names for final build dockerfiles + * Add dockerfiles for building our production images (TF 1.4) + * GPU Dockerfile and setup.py fixes + * Add base image Dockerfiles for 1.4 + * Merge pull request #1 from aws/mvs-first-commit + * first commit + * Updating initial README.md from template + * Creating initial file from template + * Creating initial file from template + * Creating initial file from template + * Creating initial file from template + * Creating initial file from template + * Initial commit + +### Documentation Changes + + * update README.rst + * Add link to TF 2.0 branch + +### Testing and Release Infrastructure + + * Add twine check during PR. + * properly fail build if has-matching-changes fails + * properly fail build if has-matching-changes fails + ## v0.1.0 (2019-05-22) ### Bug fixes and other changes diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..5cc14234 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,61 @@ +# Contributing Guidelines + +Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional +documentation, we greatly value feedback and contributions from our community. + +Please read through this document before submitting any issues or pull requests to ensure we have all the necessary +information to effectively respond to your bug report or contribution. + + +## Reporting Bugs/Feature Requests + +We welcome you to use the GitHub issue tracker to report bugs or suggest features. + +When filing an issue, please check [existing open](https://github.com/aws/sagemaker-tensorflow-training-toolkit/issues), or [recently closed](https://github.com/aws/sagemaker-tensorflow-training-toolkit/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already +reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: + +* A reproducible test case or series of steps +* The version of our code being used +* Any modifications you've made relevant to the bug +* Anything unusual about your environment or deployment + + +## Contributing via Pull Requests +Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: + +1. You are working against the latest source on the *master* branch. +2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. +3. You open an issue to discuss any significant work - we would hate for your time to be wasted. + +To send us a pull request, please: + +1. Fork the repository. +2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. +3. Ensure local tests pass. +4. Commit to your fork using clear commit messages. +5. Send us a pull request, answering any default questions in the pull request interface. +6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 
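Step 3 of the pull-request checklist above asks that local tests pass. A minimal sketch of one way to do that, based on the test commands from the repository README that this change set removes; the repository URL follows the links used elsewhere in CONTRIBUTING.md, and the `.[test]` extra and `test/unit` path are assumptions if the layout has changed since:

    git clone https://github.com/aws/sagemaker-tensorflow-training-toolkit.git
    cd sagemaker-tensorflow-training-toolkit
    pip install -e .[test]
    pytest test/unit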
+ +GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and +[creating a pull request](https://help.github.com/articles/creating-a-pull-request/). + + +## Finding contributions to work on +Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/sagemaker-tensorflow-training-toolkit/labels/help%20wanted) issues is a great place to start. + + +## Code of Conduct +This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). +For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact +opensource-codeofconduct@amazon.com with any additional questions or comments. + + +## Security issue notifications +If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. + + +## Licensing + +See the [LICENSE](https://github.com/aws/sagemaker-tensorflow-training-toolkit//blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. + +We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. diff --git a/README.rst b/README.rst index 6d031bf9..92aed6e2 100644 --- a/README.rst +++ b/README.rst @@ -1,290 +1,29 @@ -=============================== -SageMaker TensorFlow Containers -=============================== +===================================== +SageMaker TensorFlow Training Toolkit +===================================== -SageMaker TensorFlow Containers is an open source library for making the -TensorFlow framework run on `Amazon SageMaker `__. +SageMaker TensorFlow Training Toolkit is an open-source library for using TensorFlow to train models on Amazon SageMaker. -This repository also contains Dockerfiles which install this library, TensorFlow, and dependencies -for building SageMaker TensorFlow images. +For inference, see `SageMaker TensorFlow Inference Toolkit `__. -For information on running TensorFlow jobs on SageMaker: `Python -SDK `__. +For the Dockerfiles used for building SageMaker TensorFlow Containers, see `AWS Deep Learning Containers `__. + +For information on running TensorFlow jobs on Amazon SageMaker, please refer to the `SageMaker Python SDK documentation `__. For notebook examples: `SageMaker Notebook Examples `__. -Table of Contents ------------------ - -#. `Getting Started <#getting-started>`__ -#. `Building your Image <#building-your-image>`__ -#. `Running the tests <#running-the-tests>`__ - -Getting Started ---------------- - -Prerequisites -~~~~~~~~~~~~~ - -Make sure you have installed all of the following prerequisites on your -development machine: - -- `Docker `__ - -For Testing on GPU -^^^^^^^^^^^^^^^^^^ - -- `Nvidia-Docker `__ - -Recommended -^^^^^^^^^^^ - -- A Python environment management tool. (e.g. - `PyEnv `__, - `VirtualEnv `__) - -Building your Image -------------------- - -`Amazon SageMaker `__ -utilizes Docker containers to run all training jobs & inference endpoints. - -The Docker images are built from the Dockerfiles specified in -`Docker/ `__.
- -The Docker files are grouped based on TensorFlow version and separated -based on Python version and processor type. - -The Docker files for TensorFlow 2.0 are available in the -`tf-2 `__ branch, in -`docker/2.0.0/ `__. - -The Docker images, used to run training & inference jobs, are built from -both corresponding "base" and "final" Dockerfiles. - -Base Images -~~~~~~~~~~~ - -The "base" Dockerfile encompass the installation of the framework and all of the dependencies -needed. It is needed before building image for TensorFlow 1.8.0 and before. -Building a base image is not required for images for TensorFlow 1.9.0 and onwards. - -Tagging scheme is based on --. (e.g. 1.4 -.1-cpu-py2) - -All "final" Dockerfiles build images using base images that use the tagging scheme -above. - -If you want to build your "base" Docker image, then use: - -:: - - # All build instructions assume you're building from the same directory as the Dockerfile. - - # CPU - docker build -t tensorflow-base:-cpu- -f Dockerfile.cpu . - - # GPU - docker build -t tensorflow-base:-gpu- -f Dockerfile.gpu . - -:: - - # Example - - # CPU - docker build -t tensorflow-base:1.4.1-cpu-py2 -f Dockerfile.cpu . - - # GPU - docker build -t tensorflow-base:1.4.1-gpu-py2 -f Dockerfile.gpu . - -Final Images -~~~~~~~~~~~~ - -The "final" Dockerfiles encompass the installation of the SageMaker specific support code. - -For images of TensorFlow 1.8.0 and before, all "final" Dockerfiles use `base images for building `__. - -These "base" images are specified with the naming convention of -tensorflow-base:--. - -Before building "final" images: - -Build your "base" image. Make sure it is named and tagged in accordance with your "final" -Dockerfile. Skip this step if you want to build image of Tensorflow Version 1.9.0 and above. - -Then prepare the SageMaker TensorFlow Container python package in the image folder like below: - -:: - - # Create the SageMaker TensorFlow Container Python package. - cd sagemaker-tensorflow-containers - python setup.py sdist - - #. Copy your Python package to "final" Dockerfile directory that you are building. - cp dist/sagemaker_tensorflow_container-.tar.gz docker//final/py2 - -If you want to build "final" Docker images, for versions 1.6 and above, you will first need to download the appropriate tensorflow pip wheel, then pass in its location as a build argument. These can be obtained from pypi. For example, the files for 1.6.0 are here: - -https://pypi.org/project/tensorflow/1.6.0/#files -https://pypi.org/project/tensorflow-gpu/1.6.0/#files - -Note that you need to use the tensorflow-gpu wheel when building the GPU image. - -Then run: - -:: - - # All build instructions assumes you're building from the same directory as the Dockerfile. - - # CPU - docker build -t : --build-arg py_version= --build-arg framework_installable= -f Dockerfile.cpu . - - # GPU - docker build -t : --build-arg py_version= --build-arg framework_installable= -f Dockerfile.gpu . - -:: - - # Example - docker build -t preprod-tensorflow:1.6.0-cpu-py2 --build-arg py_version=2 - --build-arg framework_installable=tensorflow-1.6.0-cp27-cp27mu-manylinux1_x86_64.whl -f Dockerfile.cpu . - -The dockerfiles for 1.4 and 1.5 build from source instead, so when building those, you don't need to download the wheel beforehand: - -:: - - # All build instructions assumes you're building from the same directory as the Dockerfile. - - # CPU - docker build -t : -f Dockerfile.cpu . - - # GPU - docker build -t : -f Dockerfile.gpu . 
- -:: - - # Example - - # CPU - docker build -t preprod-tensorflow:1.4.1-cpu-py2 -f Dockerfile.cpu . - - # GPU - docker build -t preprod-tensorflow:1.4.1-gpu-py2 -f Dockerfile.gpu . - - -Running the tests ------------------ - -Running the tests requires installation of the SageMaker TensorFlow Container code and its test -dependencies. - -:: - - git clone https://github.com/aws/sagemaker-tensorflow-containers.git - cd sagemaker-tensorflow-containers - pip install -e .[test] - -Tests are defined in -`test/ `__ -and include unit, integration and functional tests. - -Unit Tests -~~~~~~~~~~ - -If you want to run unit tests, then use: - -:: - - # All test instructions should be run from the top level directory - - pytest test/unit - -Integration Tests -~~~~~~~~~~~~~~~~~ - -Running integration tests require `Docker `__ and `AWS -credentials `__, -as the integration tests make calls to a couple AWS services. The integration and functional -tests require configurations specified within their respective -`conftest.py `__.Make sure to update the account-id and region at a minimum. - -Integration tests on GPU require `Nvidia-Docker `__. - -Before running integration tests: - -#. Build your Docker image. -#. Pass in the correct pytest arguments to run tests against your Docker image. - -If you want to run local integration tests, then use: - -:: - - # Required arguments for integration tests are found in test/integ/conftest.py - - pytest test/integration --docker-base-name \ - --tag \ - --framework-version \ - --processor - -:: - - # Example - pytest test/integration --docker-base-name preprod-tensorflow \ - --tag 1.0 \ - --framework-version 1.4.1 \ - --processor cpu - -Functional Tests -~~~~~~~~~~~~~~~~ - -Functional tests require your Docker image to be within an `Amazon ECR repository `__. - -The Docker-base-name is your `ECR repository namespace `__. - -The instance-type is your specified `Amazon SageMaker Instance Type -`__ that the functional test will run on. - - -Before running functional tests: - -#. Build your Docker image. -#. Push the image to your ECR repository. -#. Pass in the correct pytest arguments to run tests on SageMaker against the image within your ECR repository. - -If you want to run a functional end to end test on `Amazon -SageMaker `__, then use: - -:: - - # Required arguments for integration tests are found in test/functional/conftest.py - - pytest test/functional --aws-id \ - --docker-base-name \ - --instance-type \ - --tag \ - -:: - - # Example - pytest test/functional --aws-id 12345678910 \ - --docker-base-name preprod-tensorflow \ - --instance-type ml.m4.xlarge \ - --tag 1.0 - Contributing ------------ Please read -`CONTRIBUTING.md `__ +`CONTRIBUTING.md `__ for details on our code of conduct, and the process for submitting pull requests to us. License ------- -SageMaker TensorFlow Containers is licensed under the Apache 2.0 License. It is copyright 2018 +SageMaker TensorFlow Training Toolkit is licensed under the Apache 2.0 License. It is copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
The license is available at: http://aws.amazon.com/apache2.0/ diff --git a/VERSION b/VERSION index eb5fc1c6..50106b6d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.0.8.dev0 +10.1.9.dev0 diff --git a/benchmarks/horovod-resnet/execute_horovod_training.py b/benchmarks/horovod-resnet/execute_horovod_training.py index e6ac7609..4b0b9b23 100755 --- a/benchmarks/horovod-resnet/execute_horovod_training.py +++ b/benchmarks/horovod-resnet/execute_horovod_training.py @@ -26,7 +26,7 @@ from sagemaker.tensorflow import TensorFlow dir_path = os.path.dirname(os.path.realpath(__file__)) -benchmark_results_dir = os.path.join('s3://', Session().default_bucket(), 'hvd-benchmarking') +benchmark_results_dir = os.path.join("s3://", Session().default_bucket(), "hvd-benchmarking") @click.group() @@ -35,93 +35,98 @@ def cli(): def generate_report(): - results_dir = os.path.join(dir_path, 'results') + results_dir = os.path.join(dir_path, "results") if os.path.exists(results_dir): shutil.rmtree(results_dir) - subprocess.call(['aws', 's3', 'cp', '--recursive', benchmark_results_dir, results_dir]) + subprocess.call(["aws", "s3", "cp", "--recursive", benchmark_results_dir, results_dir]) jobs = {} for job_name in os.listdir(results_dir): jobs[job_name] = {} - _, instance_type, instance_count, device, py_version, _, _, _, _, _, _, _ = job_name.split('-') + _, instance_type, instance_count, device, py_version, _, _, _, _, _, _, _ = job_name.split( + "-" + ) current_dir = os.path.join(results_dir, job_name) - model_dir = os.path.join(current_dir, 'output', 'model.tar.gz') - subprocess.call(['tar', '-xvzf', model_dir], cwd=current_dir) + model_dir = os.path.join(current_dir, "output", "model.tar.gz") + subprocess.call(["tar", "-xvzf", model_dir], cwd=current_dir) - jobs[job_name]['instance_type'] = instance_type - jobs[job_name]['instance_count'] = instance_count - jobs[job_name]['device'] = device - jobs[job_name]['py_version'] = py_version + jobs[job_name]["instance_type"] = instance_type + jobs[job_name]["instance_count"] = instance_count + jobs[job_name]["device"] = device + jobs[job_name]["py_version"] = py_version - benchmark_log = os.path.join(current_dir, 'benchmark_run.log') + benchmark_log = os.path.join(current_dir, "benchmark_run.log") if os.path.exists(benchmark_log): with open(benchmark_log) as f: data = json.load(f) - - jobs[job_name]['dataset'] = data['dataset']['name'] - jobs[job_name]['num_cores'] = data['machine_config']['cpu_info']['num_cores'] - jobs[job_name]['cpu_info'] = data['machine_config']['cpu_info']['cpu_info'] - jobs[job_name]['mhz_per_cpu'] = data['machine_config']['cpu_info']['mhz_per_cpu'] - jobs[job_name]['gpu_count'] = data['machine_config']['gpu_info']['count'] - jobs[job_name]['gpu_model'] = data['machine_config']['gpu_info']['model'] + jobs[job_name]["dataset"] = data["dataset"]["name"] + jobs[job_name]["num_cores"] = data["machine_config"]["cpu_info"]["num_cores"] + jobs[job_name]["cpu_info"] = data["machine_config"]["cpu_info"]["cpu_info"] + jobs[job_name]["mhz_per_cpu"] = data["machine_config"]["cpu_info"]["mhz_per_cpu"] + jobs[job_name]["gpu_count"] = data["machine_config"]["gpu_info"]["count"] + jobs[job_name]["gpu_model"] = data["machine_config"]["gpu_info"]["model"] def find_value(parameter): - other_key = [k for k in parameter if k != 'name'][0] + other_key = [k for k in parameter if k != "name"][0] return parameter[other_key] - for parameter in data['run_parameters']: - jobs[job_name][parameter['name']] = find_value(parameter) + for parameter in data["run_parameters"]: + 
jobs[job_name][parameter["name"]] = find_value(parameter) - jobs[job_name]['model_name'] = data['model_name'] - jobs[job_name]['run_date'] = data['run_date'] - jobs[job_name]['tensorflow_version'] = data['tensorflow_version']['version'] - jobs[job_name]['tensorflow_version_git_hash'] = data['tensorflow_version']['git_hash'] + jobs[job_name]["model_name"] = data["model_name"] + jobs[job_name]["run_date"] = data["run_date"] + jobs[job_name]["tensorflow_version"] = data["tensorflow_version"]["version"] + jobs[job_name]["tensorflow_version_git_hash"] = data["tensorflow_version"][ + "git_hash" + ] return pd.DataFrame(jobs) -@cli.command('train') -@click.option('--framework-version', required=True, type=click.Choice(['1.11', '1.12'])) -@click.option('--device', required=True, type=click.Choice(['cpu', 'gpu'])) -@click.option('--py-versions', multiple=True, type=str) -@click.option('--training-input-mode', default='File', type=click.Choice(['File', 'Pipe'])) -@click.option('--networking-isolation/--no-networking-isolation', default=False) -@click.option('--wait/--no-wait', default=False) -@click.option('--security-groups', multiple=True, type=str) -@click.option('--subnets', multiple=True, type=str) -@click.option('--role', default='SageMakerRole', type=str) -@click.option('--instance-counts', multiple=True, type=int) -@click.option('--instance-types', multiple=True, type=str) -@click.argument('script_args', nargs=-1, type=str) -def train(framework_version, - device, - py_versions, - training_input_mode, - networking_isolation, - wait, - security_groups, - subnets, - role, - instance_counts, - instance_types, - script_args): +@cli.command("train") +@click.option("--framework-version", required=True, type=click.Choice(["1.11", "1.12"])) +@click.option("--device", required=True, type=click.Choice(["cpu", "gpu"])) +@click.option("--py-versions", multiple=True, type=str) +@click.option("--training-input-mode", default="File", type=click.Choice(["File", "Pipe"])) +@click.option("--networking-isolation/--no-networking-isolation", default=False) +@click.option("--wait/--no-wait", default=False) +@click.option("--security-groups", multiple=True, type=str) +@click.option("--subnets", multiple=True, type=str) +@click.option("--role", default="SageMakerRole", type=str) +@click.option("--instance-counts", multiple=True, type=int) +@click.option("--instance-types", multiple=True, type=str) +@click.argument("script_args", nargs=-1, type=str) +def train( + framework_version, + device, + py_versions, + training_input_mode, + networking_isolation, + wait, + security_groups, + subnets, + role, + instance_counts, + instance_types, + script_args, +): iterator = itertools.product(instance_types, py_versions, instance_counts) for instance_type, py_version, instance_count in iterator: base_name = job_name(instance_type, instance_count, device, py_version) - mpi_options = '-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 -x TF_CPP_MIN_LOG_LEVEL=0 -x HOROVOD_TIMELINE --output-filename /opt/ml/model/hlog' + mpi_options = "-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 -x TF_CPP_MIN_LOG_LEVEL=0 -x HOROVOD_TIMELINE --output-filename /opt/ml/model/hlog" estimator = TensorFlow( - entry_point=os.path.join(dir_path, 'train.sh'), + entry_point=os.path.join(dir_path, "train.sh"), role=role, - dependencies=[os.path.join(dir_path, 'train_imagenet_resnet_hvd.py')], + dependencies=[os.path.join(dir_path, "train_imagenet_resnet_hvd.py")], base_job_name=base_name, 
train_instance_count=instance_count, train_instance_type=instance_type, @@ -129,36 +134,34 @@ def train(framework_version, py_version=py_version, script_mode=True, hyperparameters={ - 'sagemaker_mpi_enabled': True, - 'sagemaker_mpi_num_of_processes_per_host': 8, - 'sagemaker_mpi_custom_mpi_options': mpi_options + "sagemaker_mpi_enabled": True, + "sagemaker_mpi_num_of_processes_per_host": 8, + "sagemaker_mpi_custom_mpi_options": mpi_options, }, output_path=benchmark_results_dir, security_group_ids=security_groups, - subnets=subnets + subnets=subnets, ) estimator.fit(wait=wait) if wait: - artifacts_path = os.path.join(dir_path, 'results', - estimator.latest_training_job.job_name) - model_path = os.path.join(artifacts_path, 'model.tar.gz') + artifacts_path = os.path.join( + dir_path, "results", estimator.latest_training_job.job_name + ) + model_path = os.path.join(artifacts_path, "model.tar.gz") os.makedirs(artifacts_path) - subprocess.call(['aws', 's3', 'cp', estimator.model_data, model_path]) - subprocess.call(['tar', '-xvzf', model_path], cwd=artifacts_path) + subprocess.call(["aws", "s3", "cp", estimator.model_data, model_path]) + subprocess.call(["tar", "-xvzf", model_path], cwd=artifacts_path) + + print("Model downloaded at %s" % model_path) - print('Model downloaded at %s' % model_path) +def job_name(instance_type, instance_count, device, python_version): + instance_typename = instance_type.replace(".", "").replace("ml", "") -def job_name(instance_type, - instance_count, - device, - python_version): - instance_typename = instance_type.replace('.', '').replace('ml', '') + return "hvd-%s-%s-%s-%s" % (instance_typename, instance_count, device, python_version) - return 'hvd-%s-%s-%s-%s' % ( - instance_typename, instance_count, device, python_version) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py b/benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py index d415c62d..cf0e2486 100644 --- a/benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py +++ b/benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py @@ -51,18 +51,26 @@ from operator import itemgetter from tensorflow.python.util import nest + def rank0log(logger, *args, **kwargs): if hvd.rank() == 0: if logger: - logger.info(''.join([str(x) for x in list(args)])) + logger.info("".join([str(x) for x in list(args)])) else: print(*args, **kwargs) class LayerBuilder(object): - def __init__(self, activation=None, data_format='channels_last', - training=False, use_batch_norm=False, batch_norm_config=None, - conv_initializer=None, adv_bn_init=False): + def __init__( + self, + activation=None, + data_format="channels_last", + training=False, + use_batch_norm=False, + batch_norm_config=None, + conv_initializer=None, + adv_bn_init=False, + ): self.activation = activation self.data_format = data_format self.training = training @@ -72,19 +80,22 @@ def __init__(self, activation=None, data_format='channels_last', self.adv_bn_init = adv_bn_init if self.batch_norm_config is None: self.batch_norm_config = { - 'decay': 0.9, - 'epsilon': 1e-4, - 'scale': True, - 'zero_debias_moving_mean': False, + "decay": 0.9, + "epsilon": 1e-4, + "scale": True, + "zero_debias_moving_mean": False, } def _conv2d(self, inputs, activation, *args, **kwargs): x = tf.layers.conv2d( - inputs, data_format=self.data_format, + inputs, + data_format=self.data_format, use_bias=not self.use_batch_norm, kernel_initializer=self.conv_initializer, activation=None if self.use_batch_norm else activation, - 
*args, **kwargs) + *args, + **kwargs + ) if self.use_batch_norm: x = self.batch_norm(x) x = activation(x) if activation is not None else x @@ -92,19 +103,23 @@ def _conv2d(self, inputs, activation, *args, **kwargs): def conv2d_linear_last_bn(self, inputs, *args, **kwargs): x = tf.layers.conv2d( - inputs, data_format=self.data_format, + inputs, + data_format=self.data_format, use_bias=False, kernel_initializer=self.conv_initializer, - activation=None, *args, **kwargs) + activation=None, + *args, + **kwargs + ) param_initializers = { - 'moving_mean': tf.zeros_initializer(), - 'moving_variance': tf.ones_initializer(), - 'beta': tf.zeros_initializer(), + "moving_mean": tf.zeros_initializer(), + "moving_variance": tf.ones_initializer(), + "beta": tf.zeros_initializer(), } if self.adv_bn_init: - param_initializers['gamma'] = tf.zeros_initializer() + param_initializers["gamma"] = tf.zeros_initializer() else: - param_initializers['gamma'] = tf.ones_initializer() + param_initializers["gamma"] = tf.ones_initializer() x = self.batch_norm(x, param_initializers=param_initializers) return x @@ -125,19 +140,17 @@ def pad2d(self, inputs, begin, end=None): _ = end[1] except TypeError: end = [end, end] - if self.data_format == 'channels_last': + if self.data_format == "channels_last": padding = [[0, 0], [begin[0], end[0]], [begin[1], end[1]], [0, 0]] else: padding = [[0, 0], [0, 0], [begin[0], end[0]], [begin[1], end[1]]] return tf.pad(inputs, padding) def max_pooling2d(self, inputs, *args, **kwargs): - return tf.layers.max_pooling2d( - inputs, data_format=self.data_format, *args, **kwargs) + return tf.layers.max_pooling2d(inputs, data_format=self.data_format, *args, **kwargs) def average_pooling2d(self, inputs, *args, **kwargs): - return tf.layers.average_pooling2d( - inputs, data_format=self.data_format, *args, **kwargs) + return tf.layers.average_pooling2d(inputs, data_format=self.data_format, *args, **kwargs) def dense_linear(self, inputs, units, **kwargs): return tf.layers.dense(inputs, units, activation=None) @@ -152,72 +165,72 @@ def activate(self, inputs, activation=None): def batch_norm(self, inputs, **kwargs): all_kwargs = dict(self.batch_norm_config) all_kwargs.update(kwargs) - data_format = 'NHWC' if self.data_format == 'channels_last' else 'NCHW' + data_format = "NHWC" if self.data_format == "channels_last" else "NCHW" return tf.contrib.layers.batch_norm( - inputs, is_training=self.training, data_format=data_format, - fused=True, **all_kwargs) + inputs, is_training=self.training, data_format=data_format, fused=True, **all_kwargs + ) def spatial_average2d(self, inputs): shape = inputs.get_shape().as_list() - if self.data_format == 'channels_last': + if self.data_format == "channels_last": n, h, w, c = shape else: n, c, h, w = shape n = -1 if n is None else n - x = tf.layers.average_pooling2d(inputs, (h, w), (1, 1), - data_format=self.data_format) + x = tf.layers.average_pooling2d(inputs, (h, w), (1, 1), data_format=self.data_format) return tf.reshape(x, [n, c]) def flatten2d(self, inputs): x = inputs - if self.data_format != 'channel_last': + if self.data_format != "channel_last": # Note: This ensures the output order matches that of NHWC networks x = tf.transpose(x, [0, 2, 3, 1]) input_shape = x.get_shape().as_list() num_inputs = 1 for dim in input_shape[1:]: num_inputs *= dim - return tf.reshape(x, [-1, num_inputs], name='flatten') + return tf.reshape(x, [-1, num_inputs], name="flatten") def residual2d(self, inputs, network, units=None, scale=1.0, activate=False): outputs = network(inputs) - 
c_axis = -1 if self.data_format == 'channels_last' else 1 - h_axis = 1 if self.data_format == 'channels_last' else 2 + c_axis = -1 if self.data_format == "channels_last" else 1 + h_axis = 1 if self.data_format == "channels_last" else 2 w_axis = h_axis + 1 ishape, oshape = [y.get_shape().as_list() for y in [inputs, outputs]] ichans, ochans = ishape[c_axis], oshape[c_axis] - strides = ((ishape[h_axis] - 1) // oshape[h_axis] + 1, - (ishape[w_axis] - 1) // oshape[w_axis] + 1) - with tf.name_scope('residual'): - if (ochans != ichans or strides[0] != 1 or strides[1] != 1): - inputs = self.conv2d_linear(inputs, units, 1, strides, 'SAME') + strides = ( + (ishape[h_axis] - 1) // oshape[h_axis] + 1, + (ishape[w_axis] - 1) // oshape[w_axis] + 1, + ) + with tf.name_scope("residual"): + if ochans != ichans or strides[0] != 1 or strides[1] != 1: + inputs = self.conv2d_linear(inputs, units, 1, strides, "SAME") x = inputs + scale * outputs if activate: x = self.activate(x) return x -def resnet_bottleneck_v1(builder, inputs, depth, depth_bottleneck, stride, - basic=False): +def resnet_bottleneck_v1(builder, inputs, depth, depth_bottleneck, stride, basic=False): num_inputs = inputs.get_shape().as_list()[1] x = inputs - with tf.name_scope('resnet_v1'): + with tf.name_scope("resnet_v1"): if depth == num_inputs: if stride == 1: shortcut = x else: shortcut = builder.max_pooling2d(x, 1, stride) else: - shortcut = builder.conv2d_linear(x, depth, 1, stride, 'SAME') + shortcut = builder.conv2d_linear(x, depth, 1, stride, "SAME") if basic: x = builder.pad2d(x, 1) - x = builder.conv2d(x, depth_bottleneck, 3, stride, 'VALID') - x = builder.conv2d_linear(x, depth, 3, 1, 'SAME') + x = builder.conv2d(x, depth_bottleneck, 3, stride, "VALID") + x = builder.conv2d_linear(x, depth, 3, 1, "SAME") else: - x = builder.conv2d(x, depth_bottleneck, 1, 1, 'SAME') - x = builder.conv2d(x, depth_bottleneck, 3, stride, 'SAME') + x = builder.conv2d(x, depth_bottleneck, 1, 1, "SAME") + x = builder.conv2d(x, depth_bottleneck, 3, stride, "SAME") # x = builder.conv2d_linear(x, depth, 1, 1, 'SAME') - x = builder.conv2d_linear_last_bn(x, depth, 1, 1, 'SAME') + x = builder.conv2d_linear_last_bn(x, depth, 1, 1, "SAME") x = tf.nn.relu(x + shortcut) return x @@ -225,8 +238,8 @@ def resnet_bottleneck_v1(builder, inputs, depth, depth_bottleneck, stride, def inference_resnet_v1_impl(builder, inputs, layer_counts, basic=False): x = inputs x = builder.pad2d(x, 3) - x = builder.conv2d(x, 64, 7, 2, 'VALID') - x = builder.max_pooling2d(x, 3, 2, 'SAME') + x = builder.conv2d(x, 64, 7, 2, "VALID") + x = builder.max_pooling2d(x, 3, 2, "SAME") for i in range(layer_counts[0]): x = resnet_bottleneck_v1(builder, x, 256, 64, 1, basic) for i in range(layer_counts[1]): @@ -238,13 +251,25 @@ def inference_resnet_v1_impl(builder, inputs, layer_counts, basic=False): return builder.spatial_average2d(x) -def inference_resnet_v1(inputs, nlayer, data_format='channels_last', - training=False, conv_initializer=None, adv_bn_init=False): +def inference_resnet_v1( + inputs, + nlayer, + data_format="channels_last", + training=False, + conv_initializer=None, + adv_bn_init=False, +): """Deep Residual Networks family of models https://arxiv.org/abs/1512.03385 """ - builder = LayerBuilder(tf.nn.relu, data_format, training, use_batch_norm=True, - conv_initializer=conv_initializer, adv_bn_init=adv_bn_init) + builder = LayerBuilder( + tf.nn.relu, + data_format, + training, + use_batch_norm=True, + conv_initializer=conv_initializer, + adv_bn_init=adv_bn_init, + ) if nlayer == 18: 
return inference_resnet_v1_impl(builder, inputs, [2, 2, 2, 2], basic=True) elif nlayer == 34: @@ -256,83 +281,95 @@ def inference_resnet_v1(inputs, nlayer, data_format='channels_last', elif nlayer == 152: return inference_resnet_v1_impl(builder, inputs, [3, 8, 36, 3]) else: - raise ValueError("Invalid nlayer (%i); must be one of: 18,34,50,101,152" % - nlayer) + raise ValueError("Invalid nlayer (%i); must be one of: 18,34,50,101,152" % nlayer) def get_model_func(model_name): - if model_name.startswith('resnet'): - nlayer = int(model_name[len('resnet'):]) - return lambda images, *args, **kwargs: \ - inference_resnet_v1(images, nlayer, *args, **kwargs) + if model_name.startswith("resnet"): + nlayer = int(model_name[len("resnet") :]) + return lambda images, *args, **kwargs: inference_resnet_v1(images, nlayer, *args, **kwargs) else: raise ValueError("Invalid model type: %s" % model_name) def deserialize_image_record(record): feature_map = { - 'image/encoded': tf.FixedLenFeature([], tf.string, ''), - 'image/class/label': tf.FixedLenFeature([1], tf.int64, -1), - 'image/class/text': tf.FixedLenFeature([], tf.string, ''), - 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32) + "image/encoded": tf.FixedLenFeature([], tf.string, ""), + "image/class/label": tf.FixedLenFeature([1], tf.int64, -1), + "image/class/text": tf.FixedLenFeature([], tf.string, ""), + "image/object/bbox/xmin": tf.VarLenFeature(dtype=tf.float32), + "image/object/bbox/ymin": tf.VarLenFeature(dtype=tf.float32), + "image/object/bbox/xmax": tf.VarLenFeature(dtype=tf.float32), + "image/object/bbox/ymax": tf.VarLenFeature(dtype=tf.float32), } - with tf.name_scope('deserialize_image_record'): + with tf.name_scope("deserialize_image_record"): obj = tf.parse_single_example(record, feature_map) - imgdata = obj['image/encoded'] - label = tf.cast(obj['image/class/label'], tf.int32) - bbox = tf.stack([obj['image/object/bbox/%s' % x].values - for x in ['ymin', 'xmin', 'ymax', 'xmax']]) + imgdata = obj["image/encoded"] + label = tf.cast(obj["image/class/label"], tf.int32) + bbox = tf.stack( + [obj["image/object/bbox/%s" % x].values for x in ["ymin", "xmin", "ymax", "xmax"]] + ) bbox = tf.transpose(tf.expand_dims(bbox, 0), [0, 2, 1]) - text = obj['image/class/text'] + text = obj["image/class/text"] return imgdata, label, bbox, text def decode_jpeg(imgdata, channels=3): - return tf.image.decode_jpeg(imgdata, channels=channels, - fancy_upscaling=False, - dct_method='INTEGER_FAST') + return tf.image.decode_jpeg( + imgdata, channels=channels, fancy_upscaling=False, dct_method="INTEGER_FAST" + ) -def crop_and_resize_image(image, original_bbox, height, width, - distort=False, nsummary=10): - with tf.name_scope('crop_and_resize'): +def crop_and_resize_image(image, original_bbox, height, width, distort=False, nsummary=10): + with tf.name_scope("crop_and_resize"): # Evaluation is done on a center-crop of this ratio eval_crop_ratio = 0.8 if distort: - initial_shape = [int(round(height / eval_crop_ratio)), - int(round(width / eval_crop_ratio)), - 3] - bbox_begin, bbox_size, bbox = \ - tf.image.sample_distorted_bounding_box( - initial_shape, - bounding_boxes=tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]), - # tf.zeros(shape=[1,0,4]), # No bounding boxes - min_object_covered=0.1, - aspect_ratio_range=[3. / 4., 4. 
/ 3.], - area_range=[0.08, 1.0], - max_attempts=100, - seed=11 * hvd.rank(), # Need to set for deterministic results - use_image_if_no_bounding_boxes=True) + initial_shape = [ + int(round(height / eval_crop_ratio)), + int(round(width / eval_crop_ratio)), + 3, + ] + bbox_begin, bbox_size, bbox = tf.image.sample_distorted_bounding_box( + initial_shape, + bounding_boxes=tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]), + # tf.zeros(shape=[1,0,4]), # No bounding boxes + min_object_covered=0.1, + aspect_ratio_range=[3.0 / 4.0, 4.0 / 3.0], + area_range=[0.08, 1.0], + max_attempts=100, + seed=11 * hvd.rank(), # Need to set for deterministic results + use_image_if_no_bounding_boxes=True, + ) bbox = bbox[0, 0] # Remove batch, box_idx dims else: # Central crop ratio_y = ratio_x = eval_crop_ratio - bbox = tf.constant([0.5 * (1 - ratio_y), 0.5 * (1 - ratio_x), - 0.5 * (1 + ratio_y), 0.5 * (1 + ratio_x)]) - image = tf.image.crop_and_resize( - image[None, :, :, :], bbox[None, :], [0], [height, width])[0] + bbox = tf.constant( + [0.5 * (1 - ratio_y), 0.5 * (1 - ratio_x), 0.5 * (1 + ratio_y), 0.5 * (1 + ratio_x)] + ) + image = tf.image.crop_and_resize(image[None, :, :, :], bbox[None, :], [0], [height, width])[ + 0 + ] return image -def parse_and_preprocess_image_record(record, counter, height, width, - brightness, contrast, saturation, hue, - distort=False, nsummary=10, increased_aug=False): +def parse_and_preprocess_image_record( + record, + counter, + height, + width, + brightness, + contrast, + saturation, + hue, + distort=False, + nsummary=10, + increased_aug=False, +): imgdata, label, bbox, text = deserialize_image_record(record) label -= 1 # Change to 0-based (don't use background class) - with tf.name_scope('preprocess_train'): + with tf.name_scope("preprocess_train"): try: image = decode_jpeg(imgdata, channels=3) except: @@ -342,24 +379,44 @@ def parse_and_preprocess_image_record(record, counter, height, width, image = tf.image.random_flip_left_right(image) if increased_aug: image = tf.image.random_brightness(image, max_delta=brightness) - image = distort_image_ops.random_hsv_in_yiq(image, - lower_saturation=saturation, - upper_saturation=2.0 - saturation, - max_delta_hue=hue * math.pi) + image = distort_image_ops.random_hsv_in_yiq( + image, + lower_saturation=saturation, + upper_saturation=2.0 - saturation, + max_delta_hue=hue * math.pi, + ) image = tf.image.random_contrast(image, lower=contrast, upper=2.0 - contrast) - tf.summary.image('distorted_color_image', tf.expand_dims(image, 0)) - image = tf.clip_by_value(image, 0., 255.) 
+ tf.summary.image("distorted_color_image", tf.expand_dims(image, 0)) + image = tf.clip_by_value(image, 0.0, 255.0) image = tf.cast(image, tf.uint8) return image, label -def make_dataset(filenames, take_count, batch_size, height, width, - brightness, contrast, saturation, hue, - training=False, num_threads=10, nsummary=10, shard=False, synthetic=False, - increased_aug=False): + +def make_dataset( + filenames, + take_count, + batch_size, + height, + width, + brightness, + contrast, + saturation, + hue, + training=False, + num_threads=10, + nsummary=10, + shard=False, + synthetic=False, + increased_aug=False, +): if synthetic and training: input_shape = [height, width, 3] - input_element = nest.map_structure(lambda s: tf.constant(0.5, tf.float32, s), tf.TensorShape(input_shape)) - label_element = nest.map_structure(lambda s: tf.constant(1, tf.int32, s), tf.TensorShape([1])) + input_element = nest.map_structure( + lambda s: tf.constant(0.5, tf.float32, s), tf.TensorShape(input_shape) + ) + label_element = nest.map_structure( + lambda s: tf.constant(1, tf.int32, s), tf.TensorShape([1]) + ) element = (input_element, label_element) ds = tf.data.Dataset.from_tensors(element).repeat() else: @@ -380,16 +437,29 @@ def make_dataset(filenames, take_count, batch_size, height, width, if training: ds = ds.shuffle(1000, seed=7 * (1 + hvd.rank())) - ds = ds.interleave( - tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1) + ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1) counter = tf.data.Dataset.range(sys.maxsize) ds = tf.data.Dataset.zip((ds, counter)) preproc_func = lambda record, counter_: parse_and_preprocess_image_record( - record, counter_, height, width, brightness, contrast, saturation, hue, - distort=training, nsummary=nsummary if training else 0, increased_aug=increased_aug) + record, + counter_, + height, + width, + brightness, + contrast, + saturation, + hue, + distort=training, + nsummary=nsummary if training else 0, + increased_aug=increased_aug, + ) ds = ds.map(preproc_func, num_parallel_calls=num_threads) if training: - ds = ds.apply(tf.data.experimental.shuffle_and_repeat(shuffle_buffer_size, seed=5*(1+hvd.rank()))) + ds = ds.apply( + tf.data.experimental.shuffle_and_repeat( + shuffle_buffer_size, seed=5 * (1 + hvd.rank()) + ) + ) ds = ds.batch(batch_size) return ds @@ -399,18 +469,19 @@ def stage(tensors): """ stage_area = data_flow_ops.StagingArea( dtypes=[tensor.dtype for tensor in tensors], - shapes=[tensor.get_shape() for tensor in tensors]) + shapes=[tensor.get_shape() for tensor in tensors], + ) put_op = stage_area.put(tensors) get_tensors = stage_area.get() - tf.add_to_collection('STAGING_AREA_PUTS', put_op) + tf.add_to_collection("STAGING_AREA_PUTS", put_op) return put_op, get_tensors class PrefillStagingAreasHook(tf.train.SessionRunHook): def after_create_session(self, session, coord): - enqueue_ops = tf.get_collection('STAGING_AREA_PUTS') + enqueue_ops = tf.get_collection("STAGING_AREA_PUTS") for i in range(len(enqueue_ops)): - session.run(enqueue_ops[:i + 1]) + session.run(enqueue_ops[: i + 1]) class LogSessionRunHook(tf.train.SessionRunHook): @@ -421,15 +492,15 @@ def __init__(self, global_batch_size, num_records, display_every=10, logger=None self.logger = logger def after_create_session(self, session, coord): - rank0log(self.logger, ' Step Epoch Speed Loss FinLoss LR') - self.elapsed_secs = 0. 
+ rank0log(self.logger, " Step Epoch Speed Loss FinLoss LR") + self.elapsed_secs = 0.0 self.count = 0 def before_run(self, run_context): self.t0 = time.time() return tf.train.SessionRunArgs( - fetches=[tf.train.get_global_step(), - 'loss:0', 'total_loss:0', 'learning_rate:0']) + fetches=[tf.train.get_global_step(), "loss:0", "total_loss:0", "learning_rate:0"] + ) def after_run(self, run_context, run_values): self.elapsed_secs += time.time() - self.t0 @@ -439,25 +510,37 @@ def after_run(self, run_context, run_values): dt = self.elapsed_secs / self.count img_per_sec = self.global_batch_size / dt epoch = global_step * self.global_batch_size / self.num_records - self.logger.info('%6i %5.1f %7.1f %6.3f %6.3f %7.5f' % - (global_step, epoch, img_per_sec, loss, total_loss, lr)) - self.elapsed_secs = 0. + self.logger.info( + "%6i %5.1f %7.1f %6.3f %6.3f %7.5f" + % (global_step, epoch, img_per_sec, loss, total_loss, lr) + ) + self.elapsed_secs = 0.0 self.count = 0 -def _fp32_trainvar_getter(getter, name, shape=None, dtype=None, - trainable=True, regularizer=None, - *args, **kwargs): +def _fp32_trainvar_getter( + getter, name, shape=None, dtype=None, trainable=True, regularizer=None, *args, **kwargs +): storage_dtype = tf.float32 if trainable else dtype - variable = getter(name, shape, dtype=storage_dtype, - trainable=trainable, - regularizer=regularizer if trainable and 'BatchNorm' not in name and 'batchnorm' not in name and 'batch_norm' not in name and 'Batch_Norm' not in name else None, - *args, **kwargs) + variable = getter( + name, + shape, + dtype=storage_dtype, + trainable=trainable, + regularizer=regularizer + if trainable + and "BatchNorm" not in name + and "batchnorm" not in name + and "batch_norm" not in name + and "Batch_Norm" not in name + else None, + *args, + **kwargs + ) if trainable and dtype != tf.float32: - cast_name = name + '/fp16_cast' + cast_name = name + "/fp16_cast" try: - cast_variable = tf.get_default_graph().get_tensor_by_name( - cast_name + ':0') + cast_variable = tf.get_default_graph().get_tensor_by_name(cast_name + ":0") except KeyError: cast_variable = tf.cast(variable, dtype, name=cast_name) cast_variable._ref = variable._ref @@ -465,31 +548,26 @@ def _fp32_trainvar_getter(getter, name, shape=None, dtype=None, return variable -def fp32_trainable_vars(name='fp32_vars', *args, **kwargs): +def fp32_trainable_vars(name="fp32_vars", *args, **kwargs): """A varible scope with custom variable getter to convert fp16 trainable variables with fp32 storage followed by fp16 cast. 
""" - return tf.variable_scope( - name, custom_getter=_fp32_trainvar_getter, *args, **kwargs) + return tf.variable_scope(name, custom_getter=_fp32_trainvar_getter, *args, **kwargs) class MixedPrecisionOptimizer(tf.train.Optimizer): """An optimizer that updates trainable variables in fp32.""" - def __init__(self, optimizer, - scale=None, - name="MixedPrecisionOptimizer", - use_locking=False): - super(MixedPrecisionOptimizer, self).__init__( - name=name, use_locking=use_locking) + def __init__(self, optimizer, scale=None, name="MixedPrecisionOptimizer", use_locking=False): + super(MixedPrecisionOptimizer, self).__init__(name=name, use_locking=use_locking) self._optimizer = optimizer self._scale = float(scale) if scale is not None else 1.0 def compute_gradients(self, loss, var_list=None, *args, **kwargs): if var_list is None: - var_list = ( - tf.trainable_variables() + - tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) + var_list = tf.trainable_variables() + tf.get_collection( + tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES + ) replaced_list = var_list @@ -503,7 +581,7 @@ def compute_gradients(self, loss, var_list=None, *args, **kwargs): if var is not orig_var: grad = tf.cast(grad, orig_var.dtype) if self._scale != 1.0: - grad = tf.scalar_mul(1. / self._scale, grad) + grad = tf.scalar_mul(1.0 / self._scale, grad) final_gradvar.append((grad, orig_var)) return final_gradvar @@ -511,6 +589,7 @@ def compute_gradients(self, loss, var_list=None, *args, **kwargs): def apply_gradients(self, *args, **kwargs): return self._optimizer.apply_gradients(*args, **kwargs) + class LarcOptimizer(tf.train.Optimizer): """ LARC implementation ------------------- @@ -524,10 +603,17 @@ class LarcOptimizer(tf.train.Optimizer): - use_locking """ - def __init__(self, optimizer, learning_rate, eta, clip=True, epsilon=1., - name="LarcOptimizer", use_locking=False): - super(LarcOptimizer, self).__init__( - name=name, use_locking=use_locking) + def __init__( + self, + optimizer, + learning_rate, + eta, + clip=True, + epsilon=1.0, + name="LarcOptimizer", + use_locking=False, + ): + super(LarcOptimizer, self).__init__(name=name, use_locking=use_locking) self._optimizer = optimizer self._learning_rate = learning_rate self._eta = float(eta) @@ -539,16 +625,13 @@ def compute_gradients(self, *args, **kwargs): def apply_gradients(self, gradvars, *args, **kwargs): v_list = [tf.norm(tensor=v, ord=2) for _, v in gradvars] - g_list = [tf.norm(tensor=g, ord=2) if g is not None else 0.0 - for g, _ in gradvars] + g_list = [tf.norm(tensor=g, ord=2) if g is not None else 0.0 for g, _ in gradvars] v_norms = tf.stack(v_list) g_norms = tf.stack(g_list) zeds = tf.zeros_like(v_norms) # assign epsilon if weights or grads = 0, to avoid division by zero # also prevent biases to get stuck at initialization (0.) - cond = tf.logical_and( - tf.not_equal(v_norms, zeds), - tf.not_equal(g_norms, zeds)) + cond = tf.logical_and(tf.not_equal(v_norms, zeds), tf.not_equal(g_norms, zeds)) true_vals = tf.scalar_mul(self._eta, tf.div(v_norms, g_norms)) # true_vals = tf.scalar_mul(tf.cast(self._eta, tf.float32), tf.div(tf.cast(v_norms, tf.float32), tf.cast(g_norms, tf.float32))) false_vals = tf.fill(tf.shape(v_norms), self._epsilon) @@ -561,9 +644,10 @@ def apply_gradients(self, gradvars, *args, **kwargs): # for which learning rate is already fixed # We then have to scale the gradients instead of the learning rate. 
larc_local_lr = tf.minimum(tf.div(larc_local_lr, lr), ones) - gradvars = [(tf.multiply(larc_local_lr[i], g), v) - if g is not None else (None, v) - for i, (g, v) in enumerate(gradvars)] + gradvars = [ + (tf.multiply(larc_local_lr[i], g), v) if g is not None else (None, v) + for i, (g, v) in enumerate(gradvars) + ] return self._optimizer.apply_gradients(gradvars, *args, **kwargs) @@ -571,45 +655,64 @@ def get_with_default(obj, key, default_value): return obj[key] if key in obj and obj[key] is not None else default_value -def get_lr(lr, steps, lr_steps, warmup_it, decay_steps, global_step, lr_decay_mode, - cdr_first_decay_ratio, cdr_t_mul, cdr_m_mul, cdr_alpha, lc_periods, lc_alpha, lc_beta): - if lr_decay_mode == 'steps': - learning_rate = tf.train.piecewise_constant(global_step, - steps, lr_steps) - elif lr_decay_mode == 'poly' or lr_decay_mode == 'poly_cycle': - cycle = lr_decay_mode == 'poly_cycle' - learning_rate = tf.train.polynomial_decay(lr, - global_step - warmup_it, - decay_steps=decay_steps - warmup_it, - end_learning_rate=0.00001, - power=2, - cycle=cycle) - elif lr_decay_mode == 'cosine_decay_restarts': - learning_rate = tf.train.cosine_decay_restarts(lr, - global_step - warmup_it, - (decay_steps - warmup_it) * cdr_first_decay_ratio, - t_mul=cdr_t_mul, - m_mul=cdr_m_mul, - alpha=cdr_alpha) - elif lr_decay_mode == 'cosine': - learning_rate = tf.train.cosine_decay(lr, - global_step - warmup_it, - decay_steps=decay_steps - warmup_it, - alpha=0.0) - elif lr_decay_mode == 'linear_cosine': - learning_rate = tf.train.linear_cosine_decay(lr, - global_step - warmup_it, - decay_steps=decay_steps - warmup_it, - num_periods=lc_periods,#0.47, - alpha=lc_alpha,#0.0, - beta=lc_beta)#0.00001) +def get_lr( + lr, + steps, + lr_steps, + warmup_it, + decay_steps, + global_step, + lr_decay_mode, + cdr_first_decay_ratio, + cdr_t_mul, + cdr_m_mul, + cdr_alpha, + lc_periods, + lc_alpha, + lc_beta, +): + if lr_decay_mode == "steps": + learning_rate = tf.train.piecewise_constant(global_step, steps, lr_steps) + elif lr_decay_mode == "poly" or lr_decay_mode == "poly_cycle": + cycle = lr_decay_mode == "poly_cycle" + learning_rate = tf.train.polynomial_decay( + lr, + global_step - warmup_it, + decay_steps=decay_steps - warmup_it, + end_learning_rate=0.00001, + power=2, + cycle=cycle, + ) + elif lr_decay_mode == "cosine_decay_restarts": + learning_rate = tf.train.cosine_decay_restarts( + lr, + global_step - warmup_it, + (decay_steps - warmup_it) * cdr_first_decay_ratio, + t_mul=cdr_t_mul, + m_mul=cdr_m_mul, + alpha=cdr_alpha, + ) + elif lr_decay_mode == "cosine": + learning_rate = tf.train.cosine_decay( + lr, global_step - warmup_it, decay_steps=decay_steps - warmup_it, alpha=0.0 + ) + elif lr_decay_mode == "linear_cosine": + learning_rate = tf.train.linear_cosine_decay( + lr, + global_step - warmup_it, + decay_steps=decay_steps - warmup_it, + num_periods=lc_periods, # 0.47, + alpha=lc_alpha, # 0.0, + beta=lc_beta, + ) # 0.00001) else: - raise ValueError('Invalid type of lr_decay_mode') + raise ValueError("Invalid type of lr_decay_mode") return learning_rate def warmup_decay(warmup_lr, global_step, warmup_steps, warmup_end_lr): from tensorflow.python.ops import math_ops + p = tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32) diff = math_ops.subtract(warmup_end_lr, warmup_lr) res = math_ops.add(warmup_lr, math_ops.multiply(diff, p)) @@ -618,40 +721,40 @@ def warmup_decay(warmup_lr, global_step, warmup_steps, warmup_end_lr): def cnn_model_function(features, labels, mode, params): labels = 
tf.reshape(labels, (-1,)) # Squash unnecessary unary dim - lr = params['lr'] - lr_steps = params['lr_steps'] - steps = params['steps'] - use_larc = params['use_larc'] - leta = params['leta'] - lr_decay_mode = params['lr_decay_mode'] - decay_steps = params['decay_steps'] - cdr_first_decay_ratio = params['cdr_first_decay_ratio'] - cdr_t_mul = params['cdr_t_mul'] - cdr_m_mul = params['cdr_m_mul'] - cdr_alpha = params['cdr_alpha'] - lc_periods = params['lc_periods'] - lc_alpha = params['lc_alpha'] - lc_beta = params['lc_beta'] - - model_name = params['model'] - num_classes = params['n_classes'] - model_dtype = get_with_default(params, 'dtype', tf.float32) - model_format = get_with_default(params, 'format', 'channels_first') - device = get_with_default(params, 'device', '/gpu:0') + lr = params["lr"] + lr_steps = params["lr_steps"] + steps = params["steps"] + use_larc = params["use_larc"] + leta = params["leta"] + lr_decay_mode = params["lr_decay_mode"] + decay_steps = params["decay_steps"] + cdr_first_decay_ratio = params["cdr_first_decay_ratio"] + cdr_t_mul = params["cdr_t_mul"] + cdr_m_mul = params["cdr_m_mul"] + cdr_alpha = params["cdr_alpha"] + lc_periods = params["lc_periods"] + lc_alpha = params["lc_alpha"] + lc_beta = params["lc_beta"] + + model_name = params["model"] + num_classes = params["n_classes"] + model_dtype = get_with_default(params, "dtype", tf.float32) + model_format = get_with_default(params, "format", "channels_first") + device = get_with_default(params, "device", "/gpu:0") model_func = get_model_func(model_name) inputs = features # TODO: Should be using feature columns? - is_training = (mode == tf.estimator.ModeKeys.TRAIN) - momentum = params['mom'] - weight_decay = params['wdecay'] - warmup_lr = params['warmup_lr'] - warmup_it = params['warmup_it'] - loss_scale = params['loss_scale'] + is_training = mode == tf.estimator.ModeKeys.TRAIN + momentum = params["mom"] + weight_decay = params["wdecay"] + warmup_lr = params["warmup_lr"] + warmup_it = params["warmup_it"] + loss_scale = params["loss_scale"] - adv_bn_init = params['adv_bn_init'] - conv_init = params['conv_init'] + adv_bn_init = params["adv_bn_init"] + conv_init = params["conv_init"] if mode == tf.estimator.ModeKeys.TRAIN: - with tf.device('/cpu:0'): + with tf.device("/cpu:0"): preload_op, (inputs, labels) = stage([inputs, labels]) with tf.device(device): @@ -661,73 +764,87 @@ def cnn_model_function(features, labels, mode, params): imagenet_mean = np.array([121, 115, 100], dtype=np.float32) imagenet_std = np.array([70, 68, 71], dtype=np.float32) inputs = tf.subtract(inputs, imagenet_mean) - inputs = tf.multiply(inputs, 1. 
/ imagenet_std) - if model_format == 'channels_first': + inputs = tf.multiply(inputs, 1.0 / imagenet_std) + if model_format == "channels_first": inputs = tf.transpose(inputs, [0, 3, 1, 2]) - with fp32_trainable_vars( - regularizer=tf.contrib.layers.l2_regularizer(weight_decay)): + with fp32_trainable_vars(regularizer=tf.contrib.layers.l2_regularizer(weight_decay)): top_layer = model_func( - inputs, data_format=model_format, training=is_training, - conv_initializer=conv_init, adv_bn_init=adv_bn_init) - logits = tf.layers.dense(top_layer, num_classes, - kernel_initializer=tf.random_normal_initializer(stddev=0.01)) + inputs, + data_format=model_format, + training=is_training, + conv_initializer=conv_init, + adv_bn_init=adv_bn_init, + ) + logits = tf.layers.dense( + top_layer, num_classes, kernel_initializer=tf.random_normal_initializer(stddev=0.01) + ) predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32) logits = tf.cast(logits, tf.float32) if mode == tf.estimator.ModeKeys.PREDICT: probabilities = tf.softmax(logits) predictions = { - 'class_ids': predicted_classes[:, None], - 'probabilities': probabilities, - 'logits': logits + "class_ids": predicted_classes[:, None], + "probabilities": probabilities, + "logits": logits, } return tf.estimator.EstimatorSpec(mode, predictions=predictions) - loss = tf.losses.sparse_softmax_cross_entropy( - logits=logits, labels=labels) - loss = tf.identity(loss, name='loss') # For access by logger (TODO: Better way to access it?) + loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels) + loss = tf.identity( + loss, name="loss" + ) # For access by logger (TODO: Better way to access it?) if mode == tf.estimator.ModeKeys.EVAL: with tf.device(None): # Allow fallback to CPU if no GPU support for these ops - accuracy = tf.metrics.accuracy( - labels=labels, predictions=predicted_classes) - top5acc = tf.metrics.mean( - tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)) + accuracy = tf.metrics.accuracy(labels=labels, predictions=predicted_classes) + top5acc = tf.metrics.mean(tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)) newaccuracy = (hvd.allreduce(accuracy[0]), accuracy[1]) newtop5acc = (hvd.allreduce(top5acc[0]), top5acc[1]) - metrics = {'val-top1acc': newaccuracy, 'val-top5acc': newtop5acc} - return tf.estimator.EstimatorSpec( - mode, loss=loss, eval_metric_ops=metrics) + metrics = {"val-top1acc": newaccuracy, "val-top5acc": newtop5acc} + return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics) - assert (mode == tf.estimator.ModeKeys.TRAIN) + assert mode == tf.estimator.ModeKeys.TRAIN reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - total_loss = tf.add_n([loss] + reg_losses, name='total_loss') + total_loss = tf.add_n([loss] + reg_losses, name="total_loss") batch_size = tf.shape(inputs)[0] global_step = tf.train.get_global_step() - with tf.device('/cpu:0'): # Allow fallback to CPU if no GPU support for these ops - learning_rate = tf.cond(global_step < warmup_it, - lambda: warmup_decay(warmup_lr, global_step, warmup_it, - lr), - lambda: get_lr(lr, steps, lr_steps, warmup_it, decay_steps, global_step, - lr_decay_mode, - cdr_first_decay_ratio, cdr_t_mul, cdr_m_mul, cdr_alpha, - lc_periods, lc_alpha, lc_beta)) - learning_rate = tf.identity(learning_rate, 'learning_rate') - tf.summary.scalar('learning_rate', learning_rate) - - opt = tf.train.MomentumOptimizer( - learning_rate, momentum, use_nesterov=True) + with tf.device("/cpu:0"): # Allow fallback to CPU if no GPU support for these 
ops + learning_rate = tf.cond( + global_step < warmup_it, + lambda: warmup_decay(warmup_lr, global_step, warmup_it, lr), + lambda: get_lr( + lr, + steps, + lr_steps, + warmup_it, + decay_steps, + global_step, + lr_decay_mode, + cdr_first_decay_ratio, + cdr_t_mul, + cdr_m_mul, + cdr_alpha, + lc_periods, + lc_alpha, + lc_beta, + ), + ) + learning_rate = tf.identity(learning_rate, "learning_rate") + tf.summary.scalar("learning_rate", learning_rate) + + opt = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True) opt = hvd.DistributedOptimizer(opt) if use_larc: opt = LarcOptimizer(opt, learning_rate, leta, clip=True) opt = MixedPrecisionOptimizer(opt, scale=loss_scale) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or [] with tf.control_dependencies(update_ops): - gate_gradients = (tf.train.Optimizer.GATE_NONE) + gate_gradients = tf.train.Optimizer.GATE_NONE train_op = opt.minimize( - total_loss, global_step=tf.train.get_global_step(), - gate_gradients=gate_gradients) + total_loss, global_step=tf.train.get_global_step(), gate_gradients=gate_gradients + ) train_op = tf.group(preload_op, gpucopy_op, train_op) # , update_ops) return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op) @@ -741,158 +858,234 @@ def count_records(tf_record_filename): return count nfile = len(filenames) - return (count_records(filenames[0]) * (nfile - 1) + - count_records(filenames[-1])) + return count_records(filenames[0]) * (nfile - 1) + count_records(filenames[-1]) def add_bool_argument(cmdline, shortname, longname=None, default=False, help=None): if longname is None: shortname, longname = None, shortname elif default == True: - raise ValueError("""Boolean arguments that are True by default should not have short names.""") + raise ValueError( + """Boolean arguments that are True by default should not have short names.""" + ) name = longname[2:] feature_parser = cmdline.add_mutually_exclusive_group(required=False) if shortname is not None: - feature_parser.add_argument(shortname, '--' + name, dest=name, action='store_true', help=help, default=default) + feature_parser.add_argument( + shortname, "--" + name, dest=name, action="store_true", help=help, default=default + ) else: - feature_parser.add_argument('--' + name, dest=name, action='store_true', help=help, default=default) - feature_parser.add_argument('--no' + name, dest=name, action='store_false') + feature_parser.add_argument( + "--" + name, dest=name, action="store_true", help=help, default=default + ) + feature_parser.add_argument("--no" + name, dest=name, action="store_false") return cmdline def add_cli_args(): - cmdline = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + cmdline = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Basic options - cmdline.add_argument('-m', '--model', default='resnet50', - help="""Name of model to run: resnet[18,34,50,101,152]""") - cmdline.add_argument('--data_dir', - help="""Path to dataset in TFRecord format + cmdline.add_argument( + "-m", + "--model", + default="resnet50", + help="""Name of model to run: resnet[18,34,50,101,152]""", + ) + cmdline.add_argument( + "--data_dir", + help="""Path to dataset in TFRecord format (aka Example protobufs). 
Files should be - named 'train-*' and 'validation-*'.""") - add_bool_argument(cmdline, '--synthetic', help="""Whether to use synthetic data for training""") - cmdline.add_argument('-b', '--batch_size', default=256, type=int, - help="""Size of each minibatch per GPU""") - cmdline.add_argument('--num_batches', type=int, - help="""Number of batches to run. - Ignored during eval or if num epochs given""") - cmdline.add_argument('--num_epochs', type=int, - help="""Number of epochs to run. - Overrides --num_batches. Ignored during eval.""") - cmdline.add_argument('--log_dir', default='imagenet_resnet', - help="""Directory in which to write training + named 'train-*' and 'validation-*'.""", + ) + add_bool_argument(cmdline, "--synthetic", help="""Whether to use synthetic data for training""") + cmdline.add_argument( + "-b", "--batch_size", default=256, type=int, help="""Size of each minibatch per GPU""" + ) + cmdline.add_argument( + "--num_batches", + type=int, + help="""Number of batches to run. + Ignored during eval or if num epochs given""", + ) + cmdline.add_argument( + "--num_epochs", + type=int, + help="""Number of epochs to run. + Overrides --num_batches. Ignored during eval.""", + ) + cmdline.add_argument( + "--log_dir", + default="imagenet_resnet", + help="""Directory in which to write training summaries and checkpoints. If the log directory already contains some checkpoints, it tries to resume training from the last saved checkpoint. Pass --clear_log if you - want to clear all checkpoints and start a fresh run""") - add_bool_argument(cmdline, '--clear_log', default=False, - help="""Clear the log folder passed so a fresh run can be started""") - cmdline.add_argument('--log_name', type=str, default='hvd_train.log') - add_bool_argument(cmdline, '--local_ckpt', - help="""Performs local checkpoints (i.e. one per node)""") - cmdline.add_argument('--display_every', default=50, type=int, - help="""How often (in iterations) to print out - running information.""") - add_bool_argument(cmdline, '--eval', - help="""Evaluate the top-1 and top-5 accuracy of + want to clear all checkpoints and start a fresh run""", + ) + add_bool_argument( + cmdline, + "--clear_log", + default=False, + help="""Clear the log folder passed so a fresh run can be started""", + ) + cmdline.add_argument("--log_name", type=str, default="hvd_train.log") + add_bool_argument( + cmdline, "--local_ckpt", help="""Performs local checkpoints (i.e. one per node)""" + ) + cmdline.add_argument( + "--display_every", + default=50, + type=int, + help="""How often (in iterations) to print out + running information.""", + ) + add_bool_argument( + cmdline, + "--eval", + help="""Evaluate the top-1 and top-5 accuracy of the latest checkpointed model. If you want to evaluate using multiple GPUs ensure that all processes have access to all checkpoints. Either if checkpoints were saved using --local_ckpt or they were saved to a shared directory which all processes - can access.""") - cmdline.add_argument('--eval_interval', type=int, - help="""Evaluate accuracy per eval_interval number of epochs""") - add_bool_argument(cmdline, '--fp16', default=True, - help="""Train using float16 (half) precision instead - of float32.""") - cmdline.add_argument('--num_gpus', default=1, type=int, - help="""Specify total number of GPUS used to train a checkpointed model during eval. 
- Used only to calculate epoch number to print during evaluation""") - - cmdline.add_argument('--save_checkpoints_steps', type=int, default=1000) - cmdline.add_argument('--save_summary_steps', type=int, default=0) - add_bool_argument(cmdline, '--adv_bn_init', default=True, - help="""init gamme of the last BN of each ResMod at 0.""") - add_bool_argument(cmdline, '--adv_conv_init', default=True, - help="""init conv with MSRA initializer""") - - cmdline.add_argument('--lr', type=float, - help="""Start learning rate""") - cmdline.add_argument('--mom', default=0.90, type=float, - help="""Momentum""") - cmdline.add_argument('--wdecay', default=0.0001, type=float, - help="""Weight decay""") - cmdline.add_argument('--loss_scale', default=1024., type=float, - help="""loss scale""") - cmdline.add_argument('--warmup_lr', default=0.001, type=float, - help="""Warmup starting from this learning rate""") - cmdline.add_argument('--warmup_epochs', default=0, type=int, - help="""Number of epochs in which to warmup to given lr""") - cmdline.add_argument('--lr_decay_steps', default='30,60,80', type=str, - help="""epoch numbers at which lr is decayed by lr_decay_lrs. - Used when lr_decay_mode is steps""") - cmdline.add_argument('--lr_decay_lrs', default='', type=str, - help="""learning rates at specific epochs""") - cmdline.add_argument('--lr_decay_mode', default='poly', - help="""Takes either `steps` (decay by a factor at specified steps) - or `poly`(polynomial_decay with degree 2)""") - - add_bool_argument(cmdline, '--use_larc', default=False, - help="""Use Layer wise Adaptive Rate Control which helps convergence at really large batch sizes""") - cmdline.add_argument('--leta', default=0.013, type=float, - help="""The trust coefficient for LARC optimization, LARC Eta""") - - cmdline.add_argument('--cdr_first_decay_ratio', default=0.33, type=float, - help="""Cosine Decay Restart First Deacy Steps ratio""") - cmdline.add_argument('--cdr_t_mul', default=2.0, type=float, - help="""Cosine Decay Restart t_mul""") - cmdline.add_argument('--cdr_m_mul', default=0.1, type=float, - help="""Cosine Decay Restart m_mul""") - cmdline.add_argument('--cdr_alpha', default=0.0, type=float, - help="""Cosine Decay Restart alpha""") - cmdline.add_argument('--lc_periods', default=0.47, type=float, - help="""Linear Cosine num of periods""") - cmdline.add_argument('--lc_alpha', default=0.0, type=float, - help="""linear Cosine alpha""") - cmdline.add_argument('--lc_beta', default=0.00001, type=float, - help="""Liner Cosine Beta""") - - add_bool_argument(cmdline, '--increased_aug', default=False, - help="""Increase augmentations helpful when training with large number of GPUs such as 128 or 256""") - cmdline.add_argument('--contrast', default=0.6, type=float, - help="""contrast factor""") - cmdline.add_argument('--saturation', default=0.6, type=float, - help="""saturation factor""") - cmdline.add_argument('--hue', default=0.13, type=float, - help="""hue max delta factor, hue delta = hue * math.pi""") - cmdline.add_argument('--brightness', default=0.3, type=float, - help="""Brightness factor""") + can access.""", + ) + cmdline.add_argument( + "--eval_interval", type=int, help="""Evaluate accuracy per eval_interval number of epochs""" + ) + add_bool_argument( + cmdline, + "--fp16", + default=True, + help="""Train using float16 (half) precision instead + of float32.""", + ) + cmdline.add_argument( + "--num_gpus", + default=1, + type=int, + help="""Specify total number of GPUS used to train a checkpointed model during eval. 
+ Used only to calculate epoch number to print during evaluation""", + ) + + cmdline.add_argument("--save_checkpoints_steps", type=int, default=1000) + cmdline.add_argument("--save_summary_steps", type=int, default=0) + add_bool_argument( + cmdline, + "--adv_bn_init", + default=True, + help="""init gamma of the last BN of each ResMod at 0.""", + ) + add_bool_argument( + cmdline, "--adv_conv_init", default=True, help="""init conv with MSRA initializer""" + ) + + cmdline.add_argument("--lr", type=float, help="""Start learning rate""") + cmdline.add_argument("--mom", default=0.90, type=float, help="""Momentum""") + cmdline.add_argument("--wdecay", default=0.0001, type=float, help="""Weight decay""") + cmdline.add_argument("--loss_scale", default=1024.0, type=float, help="""loss scale""") + cmdline.add_argument( + "--warmup_lr", default=0.001, type=float, help="""Warmup starting from this learning rate""" + ) + cmdline.add_argument( + "--warmup_epochs", + default=0, + type=int, + help="""Number of epochs in which to warm up to the given lr""", + ) + cmdline.add_argument( + "--lr_decay_steps", + default="30,60,80", + type=str, + help="""epoch numbers at which lr is decayed by lr_decay_lrs. + Used when lr_decay_mode is steps""", + ) + cmdline.add_argument( + "--lr_decay_lrs", default="", type=str, help="""learning rates at specific epochs""" + ) + cmdline.add_argument( + "--lr_decay_mode", + default="poly", + help="""Takes either `steps` (decay by a factor at specified steps) + or `poly` (polynomial_decay with degree 2)""", + ) + + add_bool_argument( + cmdline, + "--use_larc", + default=False, + help="""Use Layer-wise Adaptive Rate Control, which helps convergence at very large batch sizes""", + ) + cmdline.add_argument( + "--leta", + default=0.013, + type=float, + help="""The trust coefficient for LARC optimization, LARC Eta""", + ) + + cmdline.add_argument( + "--cdr_first_decay_ratio", + default=0.33, + type=float, + help="""Cosine Decay Restart First Decay Steps ratio""", + ) + cmdline.add_argument( + "--cdr_t_mul", default=2.0, type=float, help="""Cosine Decay Restart t_mul""" + ) + cmdline.add_argument( + "--cdr_m_mul", default=0.1, type=float, help="""Cosine Decay Restart m_mul""" + ) + cmdline.add_argument( + "--cdr_alpha", default=0.0, type=float, help="""Cosine Decay Restart alpha""" + ) + cmdline.add_argument( + "--lc_periods", default=0.47, type=float, help="""Linear Cosine num of periods""" + ) + cmdline.add_argument("--lc_alpha", default=0.0, type=float, help="""Linear Cosine alpha""") + cmdline.add_argument("--lc_beta", default=0.00001, type=float, help="""Linear Cosine Beta""") + + add_bool_argument( + cmdline, + "--increased_aug", + default=False, + help="""Increase augmentations; helpful when training with a large number of GPUs, such as 128 or 256""", + ) + cmdline.add_argument("--contrast", default=0.6, type=float, help="""contrast factor""") + cmdline.add_argument("--saturation", default=0.6, type=float, help="""saturation factor""") + cmdline.add_argument( + "--hue", + default=0.13, + type=float, + help="""hue max delta factor, hue delta = hue * math.pi""", + ) + cmdline.add_argument("--brightness", default=0.3, type=float, help="""Brightness factor""") return cmdline def sort_and_load_ckpts(log_dir): ckpts = [] for f in os.listdir(log_dir): - m = re.match(r'model.ckpt-([0-9]+).index', f) + m = re.match(r"model.ckpt-([0-9]+).index", f) if m is None: continue fullpath = os.path.join(log_dir, f) - ckpts.append({'step': int(m.group(1)), - 'path': os.path.splitext(fullpath)[0], - 'mtime':
os.stat(fullpath).st_mtime, - }) - ckpts.sort(key=itemgetter('step')) + ckpts.append( + { + "step": int(m.group(1)), + "path": os.path.splitext(fullpath)[0], + "mtime": os.stat(fullpath).st_mtime, + } + ) + ckpts.sort(key=itemgetter("step")) return ckpts def main(): gpu_thread_count = 2 - os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private' - os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count) - os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' - os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' + os.environ["TF_GPU_THREAD_MODE"] = "gpu_private" + os.environ["TF_GPU_THREAD_COUNT"] = str(gpu_thread_count) + os.environ["TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT"] = "1" + os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1" hvd.init() - config = tf.ConfigProto() config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.force_gpu_compatible = True # Force pinned memory @@ -914,7 +1107,7 @@ def main(): FLAGS.log_dir = None if FLAGS.log_dir == "" else FLAGS.log_dir if FLAGS.eval: - FLAGS.log_name = 'eval_' + FLAGS.log_name + FLAGS.log_name = "eval_" + FLAGS.log_name if hvd.rank() != 0: return if FLAGS.local_ckpt: @@ -930,7 +1123,7 @@ def main(): os.makedirs(FLAGS.log_dir) barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32)) tf.Session(config=config).run(barrier) - + logger = logging.getLogger(FLAGS.log_name) logger.setLevel(logging.INFO) # INFO, ERROR # file handler which logs debug messages @@ -939,7 +1132,7 @@ def main(): ch.setLevel(logging.INFO) # add formatter to the handlers # formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - formatter = logging.Formatter('%(message)s') + formatter = logging.Formatter("%(message)s") ch.setFormatter(formatter) logger.addHandler(ch) if not hvd.rank(): @@ -948,23 +1141,25 @@ def main(): fh.setFormatter(formatter) # add handlers to logger logger.addHandler(fh) - + height, width = 224, 224 global_batch_size = FLAGS.batch_size * hvd.size() - rank0log(logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__)) + rank0log(logger, "PY" + str(sys.version) + "TF" + str(tf.__version__)) rank0log(logger, "Horovod size: ", hvd.size()) if FLAGS.data_dir: - filename_pattern = os.path.join(FLAGS.data_dir, '%s-*') - train_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train')) - eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation')) + filename_pattern = os.path.join(FLAGS.data_dir, "%s-*") + train_filenames = sorted(tf.gfile.Glob(filename_pattern % "train")) + eval_filenames = sorted(tf.gfile.Glob(filename_pattern % "validation")) num_training_samples = get_num_records(train_filenames) rank0log(logger, "Using data from: ", FLAGS.data_dir) if not FLAGS.eval: - rank0log(logger, 'Found ', num_training_samples, ' training samples') + rank0log(logger, "Found ", num_training_samples, " training samples") else: if not FLAGS.synthetic: - raise ValueError('data_dir missing. Please pass --synthetic if you want to run on synthetic data. Else please pass --data_dir') + raise ValueError( + "data_dir missing. Please pass --synthetic if you want to run on synthetic data. 
Else please pass --data_dir" + ) train_filenames = eval_filenames = [] num_training_samples = 1281167 training_samples_per_rank = num_training_samples // hvd.size() @@ -979,9 +1174,9 @@ def main(): nstep_per_epoch = num_training_samples // global_batch_size decay_steps = nstep - if FLAGS.lr_decay_mode == 'steps': - steps = [int(x) * nstep_per_epoch for x in FLAGS.lr_decay_steps.split(',')] - lr_steps = [float(x) for x in FLAGS.lr_decay_lrs.split(',')] + if FLAGS.lr_decay_mode == "steps": + steps = [int(x) * nstep_per_epoch for x in FLAGS.lr_decay_steps.split(",")] + lr_steps = [float(x) for x in FLAGS.lr_decay_lrs.split(",")] else: steps = [] lr_steps = [] @@ -997,11 +1192,11 @@ def main(): if not FLAGS.save_summary_steps: # default to save one checkpoint per epoch FLAGS.save_summary_steps = nstep_per_epoch - + if not FLAGS.eval: - rank0log(logger, 'Using a learning rate of ', FLAGS.lr) - rank0log(logger, 'Checkpointing every ' + str(FLAGS.save_checkpoints_steps) + ' steps') - rank0log(logger, 'Saving summary every ' + str(FLAGS.save_summary_steps) + ' steps') + rank0log(logger, "Using a learning rate of ", FLAGS.lr) + rank0log(logger, "Checkpointing every " + str(FLAGS.save_checkpoints_steps) + " steps") + rank0log(logger, "Saving summary every " + str(FLAGS.save_summary_steps) + " steps") warmup_it = nstep_per_epoch * FLAGS.warmup_epochs @@ -1009,62 +1204,74 @@ def main(): model_fn=cnn_model_function, model_dir=FLAGS.log_dir, params={ - 'model': FLAGS.model, - 'decay_steps': decay_steps, - 'n_classes': 1000, - 'dtype': tf.float16 if FLAGS.fp16 else tf.float32, - 'format': 'channels_first', - 'device': '/gpu:0', - 'lr': FLAGS.lr, - 'mom': FLAGS.mom, - 'wdecay': FLAGS.wdecay, - 'use_larc': FLAGS.use_larc, - 'leta': FLAGS.leta, - 'steps': steps, - 'lr_steps': lr_steps, - 'lr_decay_mode': FLAGS.lr_decay_mode, - 'warmup_it': warmup_it, - 'warmup_lr': FLAGS.warmup_lr, - 'cdr_first_decay_ratio': FLAGS.cdr_first_decay_ratio, - 'cdr_t_mul': FLAGS.cdr_t_mul, - 'cdr_m_mul': FLAGS.cdr_m_mul, - 'cdr_alpha': FLAGS.cdr_alpha, - 'lc_periods': FLAGS.lc_periods, - 'lc_alpha': FLAGS.lc_alpha, - 'lc_beta': FLAGS.lc_beta, - 'loss_scale': FLAGS.loss_scale, - 'adv_bn_init': FLAGS.adv_bn_init, - 'conv_init': tf.variance_scaling_initializer() if FLAGS.adv_conv_init else None + "model": FLAGS.model, + "decay_steps": decay_steps, + "n_classes": 1000, + "dtype": tf.float16 if FLAGS.fp16 else tf.float32, + "format": "channels_first", + "device": "/gpu:0", + "lr": FLAGS.lr, + "mom": FLAGS.mom, + "wdecay": FLAGS.wdecay, + "use_larc": FLAGS.use_larc, + "leta": FLAGS.leta, + "steps": steps, + "lr_steps": lr_steps, + "lr_decay_mode": FLAGS.lr_decay_mode, + "warmup_it": warmup_it, + "warmup_lr": FLAGS.warmup_lr, + "cdr_first_decay_ratio": FLAGS.cdr_first_decay_ratio, + "cdr_t_mul": FLAGS.cdr_t_mul, + "cdr_m_mul": FLAGS.cdr_m_mul, + "cdr_alpha": FLAGS.cdr_alpha, + "lc_periods": FLAGS.lc_periods, + "lc_alpha": FLAGS.lc_alpha, + "lc_beta": FLAGS.lc_beta, + "loss_scale": FLAGS.loss_scale, + "adv_bn_init": FLAGS.adv_bn_init, + "conv_init": tf.variance_scaling_initializer() if FLAGS.adv_conv_init else None, }, config=tf.estimator.RunConfig( # tf_random_seed=31 * (1 + hvd.rank()), session_config=config, save_summary_steps=FLAGS.save_summary_steps if do_checkpoint else None, save_checkpoints_steps=FLAGS.save_checkpoints_steps if do_checkpoint else None, - keep_checkpoint_max=None)) + keep_checkpoint_max=None, + ), + ) if not FLAGS.eval: num_preproc_threads = 5 rank0log(logger, "Using preprocessing threads per GPU: ", 
num_preproc_threads) - training_hooks = [hvd.BroadcastGlobalVariablesHook(0), - PrefillStagingAreasHook()] + training_hooks = [hvd.BroadcastGlobalVariablesHook(0), PrefillStagingAreasHook()] if hvd.rank() == 0: training_hooks.append( - LogSessionRunHook(global_batch_size, - num_training_samples, - FLAGS.display_every, logger)) + LogSessionRunHook( + global_batch_size, num_training_samples, FLAGS.display_every, logger + ) + ) try: start_time = time.time() classifier.train( input_fn=lambda: make_dataset( train_filenames, training_samples_per_rank, - FLAGS.batch_size, height, width, - FLAGS.brightness, FLAGS.contrast, FLAGS.saturation, FLAGS.hue, - training=True, num_threads=num_preproc_threads, - shard=True, synthetic=FLAGS.synthetic, increased_aug=FLAGS.increased_aug), + FLAGS.batch_size, + height, + width, + FLAGS.brightness, + FLAGS.contrast, + FLAGS.saturation, + FLAGS.hue, + training=True, + num_threads=num_preproc_threads, + shard=True, + synthetic=FLAGS.synthetic, + increased_aug=FLAGS.increased_aug, + ), max_steps=nstep, - hooks=training_hooks) + hooks=training_hooks, + ) rank0log(logger, "Finished in ", time.time() - start_time) except KeyboardInterrupt: print("Keyboard interrupt") @@ -1075,45 +1282,62 @@ def main(): tf.Session(config=config).run(barrier) time.sleep(5) # a little extra margin... if FLAGS.num_gpus == 1: - rank0log(logger, """If you are evaluating checkpoints of a multi-GPU run on a single GPU, + rank0log( + logger, + """If you are evaluating checkpoints of a multi-GPU run on a single GPU, ensure you set --num_gpus to the number of GPUs it was trained on. - This will ensure that the epoch number is accurately displayed in the below logs.""") + This will ensure that the epoch number is accurately displayed in the below logs.""", + ) try: ckpts = sort_and_load_ckpts(FLAGS.log_dir) for i, c in enumerate(ckpts): if i < len(ckpts) - 1: - if (not FLAGS.eval_interval) or \ - (i % FLAGS.eval_interval != 0): + if (not FLAGS.eval_interval) or (i % FLAGS.eval_interval != 0): continue eval_result = classifier.evaluate( input_fn=lambda: make_dataset( eval_filenames, - get_num_records(eval_filenames), FLAGS.batch_size, - height, width, - FLAGS.brightness, FLAGS.contrast, FLAGS.saturation, FLAGS.hue, - training=False, shard=True, increased_aug=False), - checkpoint_path=c['path']) - c['epoch'] = math.ceil(c['step'] / (num_training_samples / (FLAGS.batch_size * FLAGS.num_gpus))) - c['top1'] = eval_result['val-top1acc'] - c['top5'] = eval_result['val-top5acc'] - c['loss'] = eval_result['loss'] - rank0log(logger, ' step epoch top1 top5 loss checkpoint_time(UTC)') + get_num_records(eval_filenames), + FLAGS.batch_size, + height, + width, + FLAGS.brightness, + FLAGS.contrast, + FLAGS.saturation, + FLAGS.hue, + training=False, + shard=True, + increased_aug=False, + ), + checkpoint_path=c["path"], + ) + c["epoch"] = math.ceil( + c["step"] / (num_training_samples / (FLAGS.batch_size * FLAGS.num_gpus)) + ) + c["top1"] = eval_result["val-top1acc"] + c["top5"] = eval_result["val-top5acc"] + c["loss"] = eval_result["loss"] + rank0log(logger, " step epoch top1 top5 loss checkpoint_time(UTC)") barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32)) for i, c in enumerate(ckpts): tf.Session(config=config).run(barrier) - if 'top1' not in c: + if "top1" not in c: continue - rank0log(logger,'{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {time}' - .format(c['step'], - c['epoch'], - c['top1'] * 100, - c['top5'] * 100, - c['loss'], - time=time.strftime('%Y-%m-%d %H:%M:%S', - time.localtime(c['mtime'])))) + 
rank0log( + logger, + "{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {time}".format( + c["step"], + c["epoch"], + c["top1"] * 100, + c["top5"] * 100, + c["loss"], + time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(c["mtime"])), + ), + ) rank0log(logger, "Finished evaluation") except KeyboardInterrupt: logger.error("Keyboard interrupt") -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/benchmarks/tf_benchmarks/README.md b/benchmarks/tf_benchmarks/README.md index e1aecba4..badee3ba 100644 --- a/benchmarks/tf_benchmarks/README.md +++ b/benchmarks/tf_benchmarks/README.md @@ -1,9 +1,9 @@ # TensorFlow benchmarking scripts -This folder contains the TF training scripts https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks. +This folder contains a copy of [TensorFlow's `tf_cnn_benchmarks.py` script](https://github.com/tensorflow/benchmarks/blob/e3bd1370ba21b02c4d34340934ffb4941977d96f/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py). ## Basic usage -**execute_tensorflow_training.py train** uses SageMaker python sdk to start a training job. +**execute_tensorflow_training.py train** uses SageMaker python sdk to start a training job. ```bash ./execute_tensorflow_training.py train --help @@ -26,7 +26,7 @@ Options: --help Show this message and exit. ``` -**execute_tensorflow_training.py generate_reports** generate benchmark reports. +**execute_tensorflow_training.py generate_reports** generate benchmark reports. ## Examples: diff --git a/benchmarks/tf_benchmarks/benchmarks b/benchmarks/tf_benchmarks/benchmarks deleted file mode 160000 index ec056be5..00000000 --- a/benchmarks/tf_benchmarks/benchmarks +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ec056be57f189ec96611a58e8dc5562a6d620139 diff --git a/benchmarks/tf_benchmarks/execute_tensorflow_training.py b/benchmarks/tf_benchmarks/execute_tensorflow_training.py index b4f15304..e424638c 100755 --- a/benchmarks/tf_benchmarks/execute_tensorflow_training.py +++ b/benchmarks/tf_benchmarks/execute_tensorflow_training.py @@ -11,7 +11,6 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. 
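The hunks that follow reformat execute_tensorflow_training.py, the benchmark driver described in the README above; it assembles a hyperparameter dict and hands it to a SageMaker estimator. As a standalone reference for how the leftover command-line tokens become hyperparameters in create_hyperparameters (shown further below), here is a minimal sketch; the sample tokens are hypothetical:

```python
# Standalone sketch of the key/value pairing done in create_hyperparameters below;
# the example tokens are made up for illustration.
script_args = ["--model", "resnet50", "--batch_size", "64"]

# even-indexed tokens are flag names (leading dashes stripped), odd-indexed tokens are values
keys = [arg[2:] if arg.startswith("--") else arg[1:] for arg in script_args[::2]]
values = script_args[1::2]

hyperparameters = dict(zip(keys, values))
print(hyperparameters)  # {'model': 'resnet50', 'batch_size': '64'}
```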
- from __future__ import absolute_import import argparse @@ -26,13 +25,13 @@ dir_path = os.path.dirname(os.path.realpath(__file__)) _DEFAULT_HYPERPARAMETERS = { - 'batch_size': 32, - 'model': 'resnet32', - 'num_epochs': 10, - 'data_format': 'NHWC', - 'summary_verbosity': 1, - 'save_summaries_steps': 10, - 'data_name': 'cifar10' + "batch_size": 32, + "model": "resnet32", + "num_epochs": 10, + "data_format": "NHWC", + "summary_verbosity": 1, + "save_summaries_steps": 10, + "data_name": "cifar10", } @@ -44,67 +43,73 @@ class ScriptModeTensorFlow(Framework): create_model = TensorFlow.create_model - def __init__(self, py_version='py3', **kwargs): + def __init__(self, py_version="py3", **kwargs): super(ScriptModeTensorFlow, self).__init__(**kwargs) self.py_version = py_version self.image_name = None - self.framework_version = '1.10.0' + self.framework_version = "1.10.0" def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('-t', '--instance-types', nargs='+', help=' Set flag', required=True) - parser.add_argument('-r', '--role', required=True) - parser.add_argument('-w', '--wait', action='store_true') - parser.add_argument('--region', default='us-west-2') - parser.add_argument('--py-versions', nargs='+', help=' Set flag', default=['py3']) - parser.add_argument('--checkpoint-path', - default=os.path.join(default_bucket(), 'benchmarks', 'checkpoints'), - help='The S3 location where the model checkpoints and tensorboard events are saved after training') + parser.add_argument( + "-t", "--instance-types", nargs="+", help=" Set flag", required=True + ) + parser.add_argument("-r", "--role", required=True) + parser.add_argument("-w", "--wait", action="store_true") + parser.add_argument("--region", default="us-west-2") + parser.add_argument("--py-versions", nargs="+", help=" Set flag", default=["py3"]) + parser.add_argument( + "--checkpoint-path", + default=os.path.join(default_bucket(), "benchmarks", "checkpoints"), + help="The S3 location where the model checkpoints and tensorboard events are saved after training", + ) return parser.parse_known_args() def main(args, script_args): for instance_type, py_version in itertools.product(args.instance_types, args.py_versions): - base_name = '%s-%s-%s' % (py_version, instance_type[3:5], instance_type[6:]) + base_name = "%s-%s-%s" % (py_version, instance_type[3:5], instance_type[6:]) model_dir = os.path.join(args.checkpoint_path, base_name) job_hps = create_hyperparameters(model_dir, script_args) - print('hyperparameters:') + print("hyperparameters:") print(job_hps) estimator = ScriptModeTensorFlow( - entry_point='tf_cnn_benchmarks.py', - role='SageMakerRole', - source_dir=os.path.join(dir_path, 'tf_cnn_benchmarks'), + entry_point="tf_cnn_benchmarks.py", + role="SageMakerRole", + source_dir=os.path.join(dir_path, "tf_cnn_benchmarks"), base_job_name=base_name, train_instance_count=1, hyperparameters=job_hps, train_instance_type=instance_type, ) - input_dir = 's3://sagemaker-sample-data-%s/spark/mnist/train/' % args.region - estimator.fit({'train': input_dir}, wait=args.wait) + input_dir = "s3://sagemaker-sample-data-%s/spark/mnist/train/" % args.region + estimator.fit({"train": input_dir}, wait=args.wait) print("To use TensorBoard, execute the following command:") - cmd = 'S3_USE_HTTPS=0 S3_VERIFY_SSL=0 AWS_REGION=%s tensorboard --host localhost --port 6006 --logdir %s' + cmd = "S3_USE_HTTPS=0 S3_VERIFY_SSL=0 AWS_REGION=%s tensorboard --host localhost --port 6006 --logdir %s" print(cmd % (args.region, args.checkpoint_path)) def 
create_hyperparameters(model_dir, script_args): job_hps = _DEFAULT_HYPERPARAMETERS.copy() - job_hps.update({'train_dir': model_dir, 'eval_dir': model_dir}) + job_hps.update({"train_dir": model_dir, "eval_dir": model_dir}) - script_arg_keys_without_dashes = [key[2:] if key.startswith('--') else key[1:] for key in script_args[::2]] + script_arg_keys_without_dashes = [ + key[2:] if key.startswith("--") else key[1:] for key in script_args[::2] + ] script_arg_values = script_args[1::2] job_hps.update(dict(zip(script_arg_keys_without_dashes, script_arg_values))) return job_hps -if __name__ == '__main__': +if __name__ == "__main__": args, script_args = get_args() - main(args, script_args) \ No newline at end of file + main(args, script_args) diff --git a/benchmarks/tf_benchmarks/models b/benchmarks/tf_benchmarks/models deleted file mode 160000 index bd835e57..00000000 --- a/benchmarks/tf_benchmarks/models +++ /dev/null @@ -1 +0,0 @@ -Subproject commit bd835e5794e0833705a645ce74d4fdf8fbac6214 diff --git a/benchmarks/tf_benchmarks/tf_cnn_benchmarks/tf_cnn_benchmarks.py b/benchmarks/tf_benchmarks/tf_cnn_benchmarks/tf_cnn_benchmarks.py new file mode 100644 index 00000000..c24f5e77 --- /dev/null +++ b/benchmarks/tf_benchmarks/tf_cnn_benchmarks/tf_cnn_benchmarks.py @@ -0,0 +1,68 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +"""Benchmark script for TensorFlow. + +Originally copied from: +https://github.com/tensorflow/benchmarks/blob/e3bd1370ba21b02c4d34340934ffb4941977d96f/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py +""" +from __future__ import absolute_import, division, print_function + +from absl import app +from absl import flags as absl_flags +import tensorflow.compat.v1 as tf + +import benchmark_cnn +import cnn_util +import flags +import mlperf +from cnn_util import log_fn + + +flags.define_flags() +for name in flags.param_specs.keys(): + absl_flags.declare_key_flag(name) + +absl_flags.DEFINE_boolean( + "ml_perf_compliance_logging", + False, + "Print logs required to be compliant with MLPerf. If set, must clone the " + "MLPerf training repo https://github.com/mlperf/training and add " + "https://github.com/mlperf/training/tree/master/compliance to the " + "PYTHONPATH", +) + + +def main(positional_arguments): + # Command-line arguments like '--distortions False' are equivalent to + # '--distortions=True False', where False is a positional argument. To prevent + # this from silently running with distortions, we do not allow positional + # arguments. 
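To make the pitfall in the comment above concrete, here is a self-contained illustration. The benchmark script defines its flags through absl.flags; argparse is used below only to keep the sketch dependency-free, but the failure mode is the same:

```python
# Self-contained illustration of the boolean-flag pitfall described in the comment above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--distortions", action="store_true")

args, leftover = parser.parse_known_args(["--distortions", "False"])
# The flag is switched ON and "False" is left dangling as a positional token,
# which is why the entry point below rejects any positional arguments.
print(args.distortions, leftover)  # True ['False']
```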
+ assert len(positional_arguments) >= 1 + if len(positional_arguments) > 1: + raise ValueError("Received unknown positional arguments: %s" % positional_arguments[1:]) + + params = benchmark_cnn.make_params_from_flags() + with mlperf.mlperf_logger(absl_flags.FLAGS.ml_perf_compliance_logging, params.model): + params = benchmark_cnn.setup(params) + bench = benchmark_cnn.BenchmarkCNN(params) + + tfversion = cnn_util.tensorflow_version_tuple() + log_fn("TensorFlow: %i.%i" % (tfversion[0], tfversion[1])) + + bench.print_info() + bench.run() + + +if __name__ == "__main__": + tf.disable_v2_behavior() + app.run(main) # Raises error on invalid flags, unlike tf.app.run() diff --git a/buildspec-container-pr.yml b/buildspec-container-pr.yml new file mode 100644 index 00000000..c43cb34f --- /dev/null +++ b/buildspec-container-pr.yml @@ -0,0 +1,13 @@ +version: 0.2 + +phases: + pre_build: + commands: + - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+') + - echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.' + + build: + commands: + + - error_cmd="echo 'In order to make changes to the docker files, please, use https://github.com/aws/deep-learning-containers repository.' && exit 1" + - execute-command-if-has-matching-changes "$error_cmd" "docker/" diff --git a/buildspec-release.yml b/buildspec-release.yml index e2ff7068..f2bd20c6 100644 --- a/buildspec-release.yml +++ b/buildspec-release.yml @@ -12,14 +12,14 @@ phases: # run unit tests - AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN= AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION= - tox -e py27,py36 -- test/unit + tox -e py27,py36,py37 --parallel all -- test/unit # run local integ tests #- $(aws ecr get-login --no-include-email --region us-west-2) - #- IGNORE_COVERAGE=- tox -e py27,py36 -- test/integ/local + #- IGNORE_COVERAGE=- tox -e py27,py37 -- test/integ/local # run sagemaker integ tests - #- IGNORE_COVERAGE=- tox -e py27,py36 -- test/integ/sagemaker + #- IGNORE_COVERAGE=- tox -e py27,py37 -- test/integ/sagemaker # generate the distribution package - python3 setup.py sdist diff --git a/buildspec-unit.yml b/buildspec-unit.yml deleted file mode 100644 index c3412df7..00000000 --- a/buildspec-unit.yml +++ /dev/null @@ -1,8 +0,0 @@ -version: 0.2 - -phases: - build: - commands: - - pip install --upgrade pip --quiet - - pip install tox --quiet - - tox -e ${TOX_ENVLIST} -- test/unit \ No newline at end of file diff --git a/buildspec.yml b/buildspec.yml index cf0e3e16..f4c4da8a 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -2,9 +2,12 @@ version: 0.2 env: variables: - FRAMEWORK_VERSION: '1.15.0' + FRAMEWORK_VERSION: '1.15.2' + CPU_INSTANCE_TYPE: 'ml.c4.xlarge' + GPU_INSTANCE_TYPE: 'ml.p2.xlarge' ECR_REPO: 'sagemaker-test' GITHUB_REPO: 'sagemaker-tensorflow-container' + DLC_ACCOUNT: '763104351884' SETUP_FILE: 'setup_cmds.sh' SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]' @@ -15,110 +18,76 @@ phases: - ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text) - PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO" - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+') + - BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')" - echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.' 
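For reference, the grep pipeline in the pre_build phase above derives the pull-request number from CODEBUILD_SOURCE_VERSION; a rough Python equivalent, with an assumed sample value, would be:

```python
# Rough Python equivalent of: PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
# The sample value is an assumption; CodeBuild sets the real variable at build time.
import os
import re

source_version = os.environ.get("CODEBUILD_SOURCE_VERSION", "pr/123")
match = re.search(r"[0-9]+", source_version)
pr_num = match.group(0) if match else ""
print("Pull request number:", pr_num or "<not a pull-request build>")
```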
build: commands: + - TOX_PARALLEL_NO_SPINNER=1 + - PY_COLORS=0 + # install - - pip3 install -U -e . - pip3 install -U -e .[test] - # run flake8 + # run linters - tox -e flake8,twine # run unit tests - - tox -e py36,py27 test/unit + - tox -e py27,py36,py37 --parallel all test/unit + + # define tags + - GENERIC_TAG="$FRAMEWORK_VERSION-tensorflow-$BUILD_ID" + - DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID" + - DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID" + + # run local CPU integration tests (build and push the image to ECR repo) + - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" + - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" + + # launch remote GPU instance + - prefix='ml.' + - instance_type=${GPU_INSTANCE_TYPE#"$prefix"} + - create-key-pair + - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest - # Create pip archive - - root_dir=$(pwd) - - build_id="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')" + # build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test - python3 setup.py sdist - - tar_name=$(ls dist) - - # Find build artifacts - - build_artifacts=$root_dir/docker/artifacts - - # build py2 images - - # prepare build context - - build_dir="$root_dir/docker/$FRAMEWORK_VERSION/py2" - - cp $root_dir/dist/$tar_name $build_dir - - cp $build_artifacts/* $build_dir/ - - cd $build_dir - - # build cpu image - - cpu_dockerfile="Dockerfile.cpu" - - CPU_TAG_PY2="$FRAMEWORK_VERSION-cpu-py2-$build_id" - - docker build -f $cpu_dockerfile -t $PREPROD_IMAGE:$CPU_TAG_PY2 . - - # build gpu image - - gpu_dockerfile="Dockerfile.gpu" - - GPU_TAG_PY2="$FRAMEWORK_VERSION-gpu-py2-$build_id" - - docker build -f $gpu_dockerfile -t $PREPROD_IMAGE:$GPU_TAG_PY2 . - - # build py3 images - - # prepare build context - - build_dir="$root_dir/docker/$FRAMEWORK_VERSION/py3" - - cp $root_dir/dist/$tar_name $build_dir - - cp $build_artifacts/* $build_dir/ - - cd $build_dir - - # build cpu image - - cpu_dockerfile="Dockerfile.cpu" - - CPU_TAG_PY3="$FRAMEWORK_VERSION-cpu-py3-$build_id" - - docker build -f $cpu_dockerfile -t $PREPROD_IMAGE:$CPU_TAG_PY3 . - - # build gpu image - - gpu_dockerfile="Dockerfile.gpu" - - GPU_TAG_PY3="$FRAMEWORK_VERSION-gpu-py3-$build_id" - - docker build -f $gpu_dockerfile -t $PREPROD_IMAGE:$GPU_TAG_PY3 . - - # push images to ecr + - build_dir="test/container/$FRAMEWORK_VERSION" + - $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) + - docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION . 
+ # push DLC GPU image to ECR - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) - - docker push $PREPROD_IMAGE:$CPU_TAG_PY2 - - docker push $PREPROD_IMAGE:$GPU_TAG_PY2 - - docker push $PREPROD_IMAGE:$CPU_TAG_PY3 - - docker push $PREPROD_IMAGE:$GPU_TAG_PY3 - - # launch remote gpu instance - - instance_type='p2.xlarge' - - create-key-pair - - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu + - docker push $PREPROD_IMAGE:$DLC_GPU_TAG - # run cpu integration tests - - py3_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor cpu" - - py2_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $CPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor cpu" - - execute-command-if-has-matching-changes "$py3_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" - - execute-command-if-has-matching-changes "$py2_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" - - # run gpu integration tests + # run GPU local integration tests - printf "$SETUP_CMDS" > $SETUP_FILE - - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY2 --framework-version $FRAMEWORK_VERSION --py-version 2 --processor gpu" - - py3_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\"" - - execute-command-if-has-matching-changes "$py3_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" - - - cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $GPU_TAG_PY3 --framework-version $FRAMEWORK_VERSION --py-version 3 --processor gpu" - - py2_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\"" - - execute-command-if-has-matching-changes "$py2_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" - - # run sagemaker tests - - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY2 --py-version 2 --processor cpu" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" - - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY2 --py-version 2 --processor gpu" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" - - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $CPU_TAG_PY3 --py-version 3 --processor cpu" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" - - test_cmd="pytest test/integration/sagemaker -n 8 --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --tag $GPU_TAG_PY3 --py-version 3 --processor gpu" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "docker/*" "buildspec.yml" - + # no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests + - generic_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id 
$ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG" + - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\"" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" + - dlc_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG" + - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" + + # run CPU sagemaker integration tests + - test_cmd="pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" + - test_cmd="pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" + + # run GPU sagemaker integration tests + - test_cmd="pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" + - test_cmd="pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml" finally: - # shut down remote gpu instance + # shut down remote GPU instance - cleanup-gpu-instances - cleanup-key-pairs - # remove ecr image - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG_PY2 - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG_PY2 - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$CPU_TAG_PY3 - - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_TAG_PY3 + # remove ECR image + - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG + - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG + - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG diff --git a/docker/1.15.2/py2/Dockerfile.cpu b/docker/1.15.2/py2/Dockerfile.cpu new file mode 100644 index 00000000..7bb9acaa --- /dev/null +++ b/docker/1.15.2/py2/Dockerfile.cpu @@ -0,0 +1,118 @@ +FROM ubuntu:18.04 + +LABEL maintainer="Amazon AI" + +# Prevent 
docker build get stopped by requesting user interaction +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true +# Set environment variables for MKL +# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn +ENV KMP_AFFINITY=granularity=fine,compact,1,0 +ENV KMP_BLOCKTIME=1 +ENV KMP_SETTINGS=0 +# Python won’t try to write .pyc or .pyo files on the import of source modules +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +# See http://bugs.python.org/issue19846 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +# Specify the location of module that contains the training logic for SageMaker +# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main + +# Define framework-related package sources +ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/cpu/final/tensorflow-1.15.2-cp27-cp27mu-manylinux2010_x86_64.whl + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + software-properties-common \ + build-essential \ + openssh-client \ + openssh-server \ + ca-certificates \ + curl \ + git \ + wget \ + vim \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install Open MPI +RUN mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf + +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:$PATH + +# SSH login fix. Otherwise user is kicked off after login +RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +# Create SSH key. 
+RUN mkdir -p /root/.ssh/ \ + && mkdir -p /var/run/sshd \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +WORKDIR / + +RUN apt-get update \ + && apt-get install -y \ + python \ + python-pip + +RUN pip --no-cache-dir install --upgrade \ + pip \ + setuptools + +# Some TF tools expect a "python" binary +RUN ln -s $(which python) /usr/local/bin/python + +RUN pip install --no-cache-dir -U \ + numpy==1.16.5 \ + scipy==1.2.2 \ + scikit-learn==0.20.3 \ + pandas==0.24.2 \ + Pillow==6.2.2 \ + h5py==2.9.0 \ + keras_applications==1.0.8 \ + keras_preprocessing==1.1.0 \ + requests==2.22.0 \ + keras==2.3.1 \ + mpi4py==3.0.2 \ + "cryptography>=2.3" \ + "sagemaker-tensorflow>=1.15,<1.16" \ + "sagemaker-tensorflow-training>=2,<3" \ + # Let's install TensorFlow separately in the end to avoid the library version to be overwritten + && pip install --force-reinstall --no-cache-dir -U \ + ${TF_URL} \ + && pip install --no-cache-dir -U \ + awscli \ + && pip install --no-cache-dir -U \ + horovod==0.18.2 + +ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt + +CMD ["bin/bash"] diff --git a/docker/1.15.2/py2/Dockerfile.gpu b/docker/1.15.2/py2/Dockerfile.gpu new file mode 100644 index 00000000..35686af5 --- /dev/null +++ b/docker/1.15.2/py2/Dockerfile.gpu @@ -0,0 +1,160 @@ +# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0. 
+# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/ +FROM nvidia/cuda:10.0-base-ubuntu18.04 + +LABEL maintainer="Amazon AI" + +# Prevent docker build get stopped by requesting user interaction +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true +# Python won’t try to write .pyc or .pyo files on the import of source modules +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +# See http://bugs.python.org/issue19846 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +# Specify the location of module that contains the training logic for SageMaker +# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main + +# Define framework-related package sources +ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/gpu/final/tensorflow_gpu-1.15.2-cp27-cp27mu-manylinux2010_x86_64.whl + +RUN apt-get update \ + && apt-get install -y --no-install-recommends --allow-unauthenticated \ + ca-certificates \ + cuda-command-line-tools-10-0 \ + cuda-cublas-dev-10-0 \ + cuda-cudart-dev-10-0 \ + cuda-cufft-dev-10-0 \ + cuda-curand-dev-10-0 \ + cuda-cusolver-dev-10-0 \ + cuda-cusparse-dev-10-0 \ + curl \ + libcudnn7=7.5.1.10-1+cuda10.0 \ + # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it + libnccl2=2.4.7-1+cuda10.0 \ + libgomp1 \ + libnccl-dev=2.4.7-1+cuda10.0 \ + libfreetype6-dev \ + libhdf5-serial-dev \ + libpng-dev \ + libzmq3-dev \ + git \ + wget \ + vim \ + build-essential \ + openssh-client \ + openssh-server \ + zlib1g-dev \ + # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 + # adds a new list which contains libnvinfer library, so it needs another + # 'apt-get update' to retrieve that list before it can actually install the library. + # We don't install libnvinfer-dev since we don't need to build against TensorRT, + # and libnvinfer4 doesn't contain libnvinfer.a static library. 
+ && apt-get update \ + && apt-get install -y --no-install-recommends --allow-unauthenticated \ + nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ + && apt-get update \ + && apt-get install -y --no-install-recommends --allow-unauthenticated \ + libnvinfer5=5.0.2-1+cuda10.0 \ + && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ + && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ + && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ + && rm -rf /var/lib/apt/lists/* \ + && mkdir -p /var/run/sshd + +# Install Open MPI +RUN mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +RUN apt-get update \ + && apt-get install -y \ + python \ + python-pip + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +# Configure OpenMPI to run good defaults: +# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf + +# Set default NCCL parameters +RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf + +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH /usr/local/openmpi/bin/:$PATH +ENV PATH=/usr/local/nvidia/bin:$PATH + +# SSH login fix. Otherwise user is kicked off after login +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +# Create SSH key. 
+RUN mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +WORKDIR / + +RUN pip --no-cache-dir install --upgrade \ + pip \ + setuptools + +# Some TF tools expect a "python" binary +RUN ln -s $(which python) /usr/local/bin/python + +RUN pip install --no-cache-dir -U \ + numpy==1.16.5 \ + scipy==1.2.2 \ + scikit-learn==0.20.3 \ + pandas==0.24.2 \ + Pillow==6.2.2 \ + h5py==2.9.0 \ + keras_applications==1.0.8 \ + keras_preprocessing==1.1.0 \ + requests==2.22.0 \ + keras==2.3.1 \ + mpi4py==3.0.2 \ + "cryptography>=2.3" \ + "sagemaker-tensorflow>=1.15,<1.16" \ + "sagemaker-tensorflow-training>=2,<3" \ + # Let's install TensorFlow separately in the end to avoid the library version to be overwritten + && pip install --force-reinstall --no-cache-dir -U \ + ${TF_URL} \ + && pip install --no-cache-dir -U \ + awscli + +# Install Horovod, temporarily using CUDA stubs +RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ + && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \ + horovod==0.18.2 \ + && ldconfig + +# Allow OpenSSH to talk to containers without asking for confirmation +RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt + +CMD ["bin/bash"] diff --git a/docker/1.15.2/py3/Dockerfile.cpu b/docker/1.15.2/py3/Dockerfile.cpu new file mode 100644 index 00000000..667a3edf --- /dev/null +++ b/docker/1.15.2/py3/Dockerfile.cpu @@ -0,0 +1,121 @@ +FROM ubuntu:18.04 + +LABEL maintainer="Amazon AI" + +# Prevent docker build get stopped by requesting user interaction +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true +# Set environment variables for MKL +# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn +ENV KMP_AFFINITY=granularity=fine,compact,1,0 +ENV KMP_BLOCKTIME=1 +ENV KMP_SETTINGS=0 +# Python won’t try to write .pyc or .pyo files on the import of source modules +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +# See http://bugs.python.org/issue19846 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +# Specify the location of module that contains the training logic for SageMaker +# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main + +# Define framework-related package sources +ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/cpu/final/tensorflow-1.15.2-cp36-cp36m-manylinux2010_x86_64.whl + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + python3-setuptools \ + software-properties-common \ + build-essential \ + openssh-client \ + openssh-server \ + ca-certificates \ + curl \ + git \ + wget \ + vim \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install Open MPI +RUN mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O 
https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf + +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:$PATH + +# SSH login fix. Otherwise user is kicked off after login +RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +# Create SSH key. +RUN mkdir -p /root/.ssh/ \ + && mkdir -p /var/run/sshd \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +WORKDIR / + +RUN pip3 --no-cache-dir install --upgrade \ + pip \ + setuptools + +# Some TF tools expect a "python" binary +RUN ln -s $(which python3) /usr/local/bin/python \ + && ln -s $(which pip3) /usr/bin/pip + +RUN pip install --no-cache-dir -U \ + numpy==1.17.4 \ + scipy==1.2.2 \ + scikit-learn==0.20.3 \ + pandas==0.24.2 \ + Pillow==7.0.0 \ + h5py==2.9.0 \ + keras_applications==1.0.8 \ + keras_preprocessing==1.1.0 \ + keras==2.3.1 \ + requests==2.22.0 \ + smdebug==0.7.2 \ + sagemaker==1.50.17 \ + sagemaker-experiments==0.1.7 \ + mpi4py==3.0.2 \ + "cryptography>=2.3" \ + "sagemaker-tensorflow>=1.15,<1.16" \ + "sagemaker-tensorflow-training>=2,<3" \ + # Let's install TensorFlow separately in the end to avoid + # the library version to be overwritten + && pip install --force-reinstall --no-cache-dir -U \ + ${TF_URL} \ + && pip install --force-reinstall --no-cache-dir -U \ + horovod==0.18.2 \ + && pip install --no-cache-dir -U \ + awscli + +ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt + +CMD ["bin/bash"] diff --git a/docker/1.15.2/py3/Dockerfile.gpu b/docker/1.15.2/py3/Dockerfile.gpu new file mode 100644 index 00000000..56b5df5b --- /dev/null +++ b/docker/1.15.2/py3/Dockerfile.gpu @@ -0,0 +1,167 @@ +# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0. 
+# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/ +FROM nvidia/cuda:10.0-base-ubuntu18.04 + +LABEL maintainer="Amazon AI" + +# Prevent docker build get stopped by requesting user interaction +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true +# Python won’t try to write .pyc or .pyo files on the import of source modules +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +# See http://bugs.python.org/issue19846 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +# Specify the location of module that contains the training logic for SageMaker +# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main + +# Define framework-related package sources +ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/gpu/final/tensorflow_gpu-1.15.2-cp36-cp36m-manylinux2010_x86_64.whl + +RUN apt-get update \ + && apt-get install -y --no-install-recommends --allow-unauthenticated \ + python3-dev \ + python3-pip \ + python3-setuptools \ + python3-dev \ + ca-certificates \ + cuda-command-line-tools-10-0 \ + cuda-cublas-dev-10-0 \ + cuda-cudart-dev-10-0 \ + cuda-cufft-dev-10-0 \ + cuda-curand-dev-10-0 \ + cuda-cusolver-dev-10-0 \ + cuda-cusparse-dev-10-0 \ + curl \ + libcudnn7=7.5.1.10-1+cuda10.0 \ + # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it + libnccl2=2.4.7-1+cuda10.0 \ + libgomp1 \ + libnccl-dev=2.4.7-1+cuda10.0 \ + libfreetype6-dev \ + libhdf5-serial-dev \ + libpng-dev \ + libzmq3-dev \ + git \ + wget \ + vim \ + build-essential \ + openssh-client \ + openssh-server \ + zlib1g-dev \ + # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 + # adds a new list which contains libnvinfer library, so it needs another + # 'apt-get update' to retrieve that list before it can actually install the + # library. + # We don't install libnvinfer-dev since we don't need to build against TensorRT, + # and libnvinfer4 doesn't contain libnvinfer.a static library. 
+ && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ + nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ + && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ + libnvinfer5=5.0.2-1+cuda10.0 \ + && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ + && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ + && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ + && rm -rf /var/lib/apt/lists/* \ + && mkdir -p /var/run/sshd + +########################################################################### +# Horovod & its dependencies +########################################################################### + +# Install Open MPI +RUN mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +# Configure OpenMPI to run good defaults: +# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf + +# Set default NCCL parameters +RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf + +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:$PATH +ENV PATH=/usr/local/nvidia/bin:$PATH + +# SSH login fix. Otherwise user is kicked off after login +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +# Create SSH key. 
+RUN mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +WORKDIR / + +RUN pip3 --no-cache-dir install --upgrade \ + pip \ + setuptools + +# Some TF tools expect a "python" binary +RUN ln -s $(which python3) /usr/local/bin/python \ + && ln -s $(which pip3) /usr/bin/pip + +RUN pip install --no-cache-dir -U \ + numpy==1.17.4 \ + scipy==1.2.2 \ + scikit-learn==0.20.3 \ + pandas==0.24.2 \ + Pillow==7.0.0 \ + h5py==2.9.0 \ + keras_applications==1.0.8 \ + keras_preprocessing==1.1.0 \ + requests==2.22.0 \ + keras==2.3.1 \ + smdebug==0.7.2 \ + sagemaker==1.50.17 \ + sagemaker-experiments==0.1.7 \ + mpi4py==3.0.2 \ + "cryptography>=2.3" \ + "sagemaker-tensorflow>=1.15,<1.16" \ + "sagemaker-tensorflow-training>=2,<3" \ + # Let's install TensorFlow separately in the end to avoid + # the library version to be overwritten + && pip install --force-reinstall --no-cache-dir -U \ + ${TF_URL} \ + && pip install --no-cache-dir -U \ + awscli + +# Install Horovod, temporarily using CUDA stubs +RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs \ + && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \ + horovod==0.18.2 \ + && ldconfig + +# Allow OpenSSH to talk to containers without asking for confirmation +RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt + +CMD ["bin/bash"] diff --git a/docker/1.15.2/py37/Dockerfile.cpu b/docker/1.15.2/py37/Dockerfile.cpu new file mode 100644 index 00000000..e46ea361 --- /dev/null +++ b/docker/1.15.2/py37/Dockerfile.cpu @@ -0,0 +1,138 @@ +FROM ubuntu:18.04 + +LABEL maintainer="Amazon AI" + +# Prevent docker build get stopped by requesting user interaction +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true +# Set environment variables for MKL +# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn +ENV KMP_AFFINITY=granularity=fine,compact,1,0 +ENV KMP_BLOCKTIME=1 +ENV KMP_SETTINGS=0 +# Python won’t try to write .pyc or .pyo files on the import of source modules +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +# See http://bugs.python.org/issue19846 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +# Specify the location of module that contains the training logic for SageMaker +# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main + +# Define framework-related package sources +ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/cpu/final/tensorflow_cpu-1.15.2-cp37-cp37m-manylinux2010_x86_64.whl +ARG PYTHON=python3 +ARG PYTHON_PIP=python3-pip +ARG PIP=pip3 +ARG PYTHON_VERSION=3.7.7 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + curl \ + git \ + openssh-client \ + openssh-server \ + vim \ + wget \ + zlib1g-dev \ + && 
rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Install Open MPI +RUN mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf + +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:$PATH + +# SSH login fix. Otherwise user is kicked off after login +RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +# Create SSH key. +RUN mkdir -p /root/.ssh/ \ + && mkdir -p /var/run/sshd \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +WORKDIR / + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libbz2-dev \ + libc6-dev \ + libffi-dev \ + libgdbm-dev \ + libncursesw5-dev \ + libreadline-gplv2-dev \ + libsqlite3-dev \ + libssl-dev \ + tk-dev \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \ + && tar -xvf Python-$PYTHON_VERSION.tgz \ + && cd Python-$PYTHON_VERSION \ + && ./configure && make && make install \ + && make && make install && rm -rf ../Python-$PYTHON_VERSION* + +RUN ${PIP} --no-cache-dir install --upgrade \ + pip \ + setuptools + +# Some TF tools expect a "python" binary +RUN ln -s $(which python3) /usr/local/bin/python \ + && ln -s $(which pip3) /usr/bin/pip + +RUN ${PIP} install --no-cache-dir -U \ + numpy==1.17.4 \ + scipy==1.2.2 \ + scikit-learn==0.20.3 \ + pandas==0.24.2 \ + Pillow==7.0.0 \ + h5py==2.10.0 \ + requests==2.22.0 \ + smdebug==0.7.2 \ + sagemaker-experiments==0.1.7 \ + mpi4py==3.0.2 \ + "cryptography>=2.3" \ + "sagemaker-tensorflow>=1.15,<1.16" \ + sagemaker-tensorflow-training==10.1.0 \ + # Let's install TensorFlow separately in the end to avoid + # the library version to be overwritten + && ${PIP} install --force-reinstall --no-cache-dir -U \ + ${TF_URL} \ + && ${PIP} install --force-reinstall --no-cache-dir -U \ + horovod==0.18.2 \ + && ${PIP} install --no-cache-dir -U \ + awscli + +ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt + +CMD ["bin/bash"] diff --git a/docker/1.15.2/py37/Dockerfile.gpu b/docker/1.15.2/py37/Dockerfile.gpu new file mode 100644 index 00000000..aefc97ab --- /dev/null +++ b/docker/1.15.2/py37/Dockerfile.gpu @@ -0,0 +1,184 @@ +# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0. 
+# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/ +FROM nvidia/cuda:10.0-base-ubuntu18.04 + +LABEL maintainer="Amazon AI" + +# Prevent docker build get stopped by requesting user interaction +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true +# Python won’t try to write .pyc or .pyo files on the import of source modules +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +# See http://bugs.python.org/issue19846 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +# Specify the location of module that contains the training logic for SageMaker +# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main + +# Define framework-related package sources +ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/gpu/final/tensorflow_gpu-1.15.2-cp37-cp37m-manylinux2010_x86_64.whl +ARG PYTHON=python3 +ARG PYTHON_PIP=python3-pip +ARG PIP=pip3 +ARG PYTHON_VERSION=3.7.7 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends --allow-unauthenticated \ + build-essential \ + ca-certificates \ + cuda-command-line-tools-10-0 \ + cuda-cublas-dev-10-0 \ + cuda-cudart-dev-10-0 \ + cuda-cufft-dev-10-0 \ + cuda-curand-dev-10-0 \ + cuda-cusolver-dev-10-0 \ + cuda-cusparse-dev-10-0 \ + curl \ + libcudnn7=7.5.1.10-1+cuda10.0 \ + # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it + libnccl2=2.4.7-1+cuda10.0 \ + libgomp1 \ + libnccl-dev=2.4.7-1+cuda10.0 \ + libfreetype6-dev \ + libhdf5-serial-dev \ + libpng-dev \ + libzmq3-dev \ + git \ + wget \ + vim \ + openssh-client \ + openssh-server \ + zlib1g-dev \ + # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 + # adds a new list which contains libnvinfer library, so it needs another + # 'apt-get update' to retrieve that list before it can actually install the + # library. + # We don't install libnvinfer-dev since we don't need to build against TensorRT, + # and libnvinfer4 doesn't contain libnvinfer.a static library. 
+ && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ + nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ + && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ + libnvinfer5=5.0.2-1+cuda10.0 \ + && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ + && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ + && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ + && rm -rf /var/lib/apt/lists/* \ + && mkdir -p /var/run/sshd + +########################################################################### +# Horovod & its dependencies +########################################################################### + +# Install Open MPI +RUN mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ + && tar zxf openmpi-4.0.1.tar.gz \ + && cd openmpi-4.0.1 \ + && ./configure --enable-orterun-prefix-by-default \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ + && echo '#!/bin/bash' > /usr/local/bin/mpirun \ + && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ + && chmod a+x /usr/local/bin/mpirun + +# Configure OpenMPI to run good defaults: +# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf + +# Set default NCCL parameters +RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf + +ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH +ENV PATH=/usr/local/openmpi/bin/:$PATH +ENV PATH=/usr/local/nvidia/bin:$PATH + + +# SSH login fix. Otherwise user is kicked off after login +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +# Create SSH key. 
+RUN mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +WORKDIR / + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libbz2-dev \ + libc6-dev \ + libffi-dev \ + libgdbm-dev \ + libncursesw5-dev \ + libreadline-gplv2-dev \ + libsqlite3-dev \ + libssl-dev \ + tk-dev \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \ + && tar -xvf Python-$PYTHON_VERSION.tgz \ + && cd Python-$PYTHON_VERSION \ + && ./configure && make && make install \ + && make && make install && rm -rf ../Python-$PYTHON_VERSION* + +RUN ${PIP} --no-cache-dir install --upgrade \ + pip \ + setuptools + +# Some TF tools expect a "python" binary +RUN ln -s $(which python3) /usr/local/bin/python \ + && ln -s $(which pip3) /usr/bin/pip + +RUN ${PIP} install --no-cache-dir -U \ + numpy==1.17.4 \ + scipy==1.2.2 \ + scikit-learn==0.20.3 \ + pandas==0.24.2 \ + Pillow==7.0.0 \ + h5py==2.10.0 \ + requests==2.22.0 \ + smdebug==0.7.2 \ + sagemaker-experiments==0.1.7 \ + mpi4py==3.0.2 \ + "cryptography>=2.3" \ + "sagemaker-tensorflow>=1.15,<1.16" \ + sagemaker-tensorflow-training==10.1.0 \ + # Let's install TensorFlow separately in the end to avoid + # the library version to be overwritten + && ${PIP} install --force-reinstall --no-cache-dir -U \ + ${TF_URL} \ + && ${PIP} install --no-cache-dir -U \ + awscli + +# Install Horovod, temporarily using CUDA stubs +RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs \ + && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \ + horovod==0.18.2 \ + && ldconfig + +# Allow OpenSSH to talk to containers without asking for confirmation +RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt + +CMD ["bin/bash"] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..4c5649dc --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +markers = + deploy_test + skip_cpu + skip_gpu diff --git a/scripts/build_all.py b/scripts/build_all.py deleted file mode 100644 index 9f340d5d..00000000 --- a/scripts/build_all.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import - -import argparse -import os -import subprocess - -VERSION = '1.13.1' -REPO = 'sagemaker-tensorflow-scriptmode' -PY2_CPU_BINARY = 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp27-cp27mu-linux_x86_64.whl' # noqa -PY3_CPU_BINARY = 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl' # noqa -PY2_GPU_BINARY = 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp27-cp27mu-linux_x86_64.whl' # noqa -PY3_GPU_BINARY = 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl' # noqa -DEV_ACCOUNT = '142577830533' -REGION = 'us-west-2' - - -def _parse_args(): - - parser = argparse.ArgumentParser() - - parser.add_argument('--account', type=str, default=DEV_ACCOUNT) - parser.add_argument('--region', type=str, default=REGION) - parser.add_argument('--version', type=str, default=VERSION) - parser.add_argument('--py2-cpu-binary', type=str, default=PY2_CPU_BINARY) - parser.add_argument('--py3-cpu-binary', type=str, default=PY3_CPU_BINARY) - parser.add_argument('--py2-gpu-binary', type=str, default=PY2_GPU_BINARY) - parser.add_argument('--py3-gpu-binary', type=str, default=PY3_GPU_BINARY) - parser.add_argument('--repo', type=str, default=REPO) - - return parser.parse_args() - - -args = _parse_args() -binaries = { - 'py2-cpu': args.py2_cpu_binary, - 'py3-cpu': args.py3_cpu_binary, - 'py2-gpu': args.py2_gpu_binary, - 'py3-gpu': args.py3_gpu_binary -} -build_dir = os.path.join('docker', args.version) - -# Run docker-login so we can pull the cached image -login_cmd = subprocess.check_output( - 'aws ecr get-login --no-include-email --registry-id {}'.format(args.account).split()) -print('Executing docker login command: '.format(login_cmd)) -subprocess.check_call(login_cmd.split()) - -for arch in ['cpu', 'gpu']: - for py_version in ['2', '3']: - - binary_url = binaries['py{}-{}'.format(py_version, arch)] - binary_file = os.path.basename(binary_url) - cmd = 'wget -O {}/{} {}'.format(build_dir, binary_file, binary_url) - print('Downloading binary file: {}'.format(cmd)) - subprocess.check_call(cmd.split()) - - tag = '{}-{}-py{}'.format(args.version, arch, py_version) - prev_image_uri = '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format(args.account, args.region, args.repo, tag) - dockerfile = os.path.join(build_dir, 'Dockerfile.{}'.format(arch)) - - tar_file_name = subprocess.check_output('ls {}/sagemaker_tensorflow_container*'.format(build_dir), - shell=True).strip().decode('ascii') - print('framework_support_installable is {}'.format(os.path.basename(tar_file_name))) - - build_cmd = 'docker build -f {} --cache-from {} --build-arg framework_support_installable={} ' \ - '--build-arg py_version={} --build-arg framework_installable={} ' \ - '-t {}:{} {}'.format(dockerfile, prev_image_uri, os.path.basename(tar_file_name), py_version, - binary_file, args.repo, tag, build_dir) - print('Building docker image: {}'.format(build_cmd)) - subprocess.check_call(build_cmd.split()) - - print('Deleting binary file {}'.format(binary_file)) - subprocess.check_call('rm {}'.format(os.path.join(build_dir, binary_file)).split()) diff --git a/scripts/publish_all.py b/scripts/publish_all.py deleted file mode 100644 index 2c78e8a7..00000000 --- a/scripts/publish_all.py +++ /dev/null @@ -1,52 +0,0 @@ 
-# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import - -import argparse -import subprocess - -DEV_ACCOUNT = '142577830533' -VERSION = '1.13.1' -REGION = 'us-west-2' -REPO = 'sagemaker-tensorflow-scriptmode' - - -def _parse_args(): - - parser = argparse.ArgumentParser() - - parser.add_argument('--account', type=str, default=DEV_ACCOUNT) - parser.add_argument('--version', type=str, default=VERSION) - parser.add_argument('--repo', type=str, default=REPO) - parser.add_argument('--region', type=str, default=REGION) - - return parser.parse_args() - - -args = _parse_args() - -for arch in ['cpu', 'gpu']: - for py_version in ['2', '3']: - source = '{}:{}-{}-py{}'.format(args.repo, args.version, arch, py_version) - dest = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(args.account, args.region, source) - tag_cmd = 'docker tag {} {}'.format(source, dest) - print('Tagging image: {}'.format(tag_cmd)) - subprocess.check_call(tag_cmd.split()) - login_cmd = subprocess.check_output( - 'aws ecr get-login --no-include-email --registry-id {} --region {}' - .format(args.account, args.region).split()) - print('Executing docker login command: {}'.format(login_cmd)) - subprocess.check_call(login_cmd.split()) - push_cmd = 'docker push {}'.format(dest) - print('Pushing image: {}'.format(push_cmd)) - subprocess.check_call(push_cmd.split()) diff --git a/setup.py b/setup.py index 983ebd13..67cfbe56 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ import os from os.path import basename from os.path import splitext +import sys from setuptools import find_packages, setup @@ -25,41 +26,60 @@ def read(fname): def read_version(): - return read('VERSION').strip() + return read("VERSION").strip() -setup( - name='sagemaker_tensorflow_training', - version=read_version(), - description='Open source library for creating ' - 'TensorFlow containers to run on Amazon SageMaker.', +test_dependencies = [ + "tox", + "flake8", + "pytest", + "pytest-cov", + "pytest-xdist", + "mock", + "sagemaker==1.50.1", + "tensorflow<2.0", + "docker-compose", + "boto3==1.10.50", + "six==1.13.0", + "python-dateutil>=2.1,<2.8.1", + "botocore==1.13.50", + "requests-mock", + "awscli>=1.16.314", +] - packages=find_packages(where='src', exclude=('test',)), - package_dir={'': 'src'}, - py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')], - - long_description=read('README.rst'), - author='Amazon Web Services', - url='https://github.com/aws/sagemaker-tensorflow-containers', - license='Apache License 2.0', +if sys.version_info.major > 2: + test_dependencies.append("sagemaker-experiments==0.1.7") +setup( + name="sagemaker_tensorflow_training", + version=read_version(), + description="Open source library for creating " + "TensorFlow containers to run on Amazon SageMaker.", + packages=find_packages(where="src", exclude=("test",)), + package_dir={"": "src"}, + py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], + long_description=read("README.rst"), + author="Amazon Web 
Services", + url="https://github.com/aws/sagemaker-tensorflow-containers", + license="Apache License 2.0", classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Natural Language :: English", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.6', + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", ], - - install_requires=['sagemaker-containers>=2.6.2', 'numpy', 'scipy', 'sklearn', - 'pandas', 'Pillow', 'h5py'], - extras_require={ - 'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock', - 'sagemaker==1.50.1', 'tensorflow<2.0', 'docker-compose', 'boto3==1.10.50', - 'six==1.13.0', 'python-dateutil>=2.1,<2.8.1', 'botocore==1.13.50', - 'requests-mock', 'awscli==1.16.314'], - 'benchmark': ['click'] - }, + install_requires=[ + "sagemaker-training>=4.3.0,<5.2.0", + "numpy", + "scipy", + "sklearn", + "pandas", + "Pillow", + "h5py", + ], + extras_require={"test": test_dependencies, "benchmark": ["click"]}, ) diff --git a/src/sagemaker_tensorflow_container/s3_utils.py b/src/sagemaker_tensorflow_container/s3_utils.py index 0137ef25..15902c55 100644 --- a/src/sagemaker_tensorflow_container/s3_utils.py +++ b/src/sagemaker_tensorflow_container/s3_utils.py @@ -20,23 +20,23 @@ def configure(model_dir, job_region): - os.environ['S3_REGION'] = _s3_region(job_region, model_dir) + os.environ["S3_REGION"] = _s3_region(job_region, model_dir) # setting log level to WARNING - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' - os.environ['S3_USE_HTTPS'] = '1' + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" + os.environ["S3_USE_HTTPS"] = "1" def _s3_region(job_region, model_dir): - if model_dir and model_dir.startswith('s3://'): - s3 = boto3.client('s3', region_name=job_region) + if model_dir and model_dir.startswith("s3://"): + s3 = boto3.client("s3", region_name=job_region) # We get the AWS region of the checkpoint bucket, which may be different from # the region this container is currently running in. 
parsed_url = urlparse(model_dir) bucket_name = parsed_url.netloc - bucket_location = s3.get_bucket_location(Bucket=bucket_name)['LocationConstraint'] + bucket_location = s3.get_bucket_location(Bucket=bucket_name)["LocationConstraint"] return bucket_location or job_region else: diff --git a/src/sagemaker_tensorflow_container/training.py b/src/sagemaker_tensorflow_container/training.py index bce6a69c..f71db52a 100644 --- a/src/sagemaker_tensorflow_container/training.py +++ b/src/sagemaker_tensorflow_container/training.py @@ -19,15 +19,15 @@ import subprocess import time -import sagemaker_containers.beta.framework as framework +from sagemaker_training import entry_point, environment, mapping, runner import tensorflow as tf from sagemaker_tensorflow_container import s3_utils logger = logging.getLogger(__name__) -SAGEMAKER_PARAMETER_SERVER_ENABLED = 'sagemaker_parameter_server_enabled' -MODEL_DIR = '/opt/ml/model' +SAGEMAKER_PARAMETER_SERVER_ENABLED = "sagemaker_parameter_server_enabled" +MODEL_DIR = "/opt/ml/model" def _is_host_master(hosts, current_host): @@ -56,50 +56,46 @@ def _build_tf_config(hosts, current_host, ps_task=False): ps = hosts if len(hosts) > 1 else None def host_addresses(hosts, port=2222): - return ['{}:{}'.format(host, port) for host in hosts] + return ["{}:{}".format(host, port) for host in hosts] - tf_config = { - 'cluster': { - 'master': host_addresses(masters) - }, - 'environment': 'cloud' - } + tf_config = {"cluster": {"master": host_addresses(masters)}, "environment": "cloud"} if ps: - tf_config['cluster']['ps'] = host_addresses(ps, port='2223') + tf_config["cluster"]["ps"] = host_addresses(ps, port="2223") if workers: - tf_config['cluster']['worker'] = host_addresses(workers) + tf_config["cluster"]["worker"] = host_addresses(workers) if ps_task: if ps is None: raise ValueError( - 'Cannot have a ps task if there are no parameter servers in the cluster') - task_type = 'ps' + "Cannot have a ps task if there are no parameter servers in the cluster" + ) + task_type = "ps" task_index = ps.index(current_host) elif _is_host_master(hosts, current_host): - task_type = 'master' + task_type = "master" task_index = 0 else: - task_type = 'worker' + task_type = "worker" task_index = workers.index(current_host) - tf_config['task'] = {'index': task_index, 'type': task_type} + tf_config["task"] = {"index": task_index, "type": task_type} return tf_config def _run_ps(env, cluster): - logger.info('Running distributed training job with parameter servers') + logger.info("Running distributed training job with parameter servers") cluster_spec = tf.train.ClusterSpec(cluster) task_index = env.hosts.index(env.current_host) # Force parameter server to run on cpu. 
Running multiple TensorFlow processes on the same # GPU is not safe: # https://stackoverflow.com/questions/46145100/is-it-unsafe-to-run-multiple-tensorflow-processes-on-the-same-gpu - no_gpu_config = tf.ConfigProto(device_count={'GPU': 0}) + no_gpu_config = tf.ConfigProto(device_count={"GPU": 0}) server = tf.train.Server( - cluster_spec, job_name='ps', task_index=task_index, config=no_gpu_config + cluster_spec, job_name="ps", task_index=task_index, config=no_gpu_config ) multiprocessing.Process(target=lambda: server.join()).start() @@ -107,20 +103,27 @@ def _run_ps(env, cluster): def _run_worker(env, cmd_args, tf_config): env_vars = env.to_env_vars() - env_vars['TF_CONFIG'] = json.dumps(tf_config) - - framework.entry_point.run(env.module_dir, env.user_entry_point, cmd_args, env_vars) + env_vars["TF_CONFIG"] = json.dumps(tf_config) + + entry_point.run( + uri=env.module_dir, + user_entry_point=env.user_entry_point, + args=cmd_args, + env_vars=env_vars, + capture_error=True, + ) def _wait_until_master_is_down(master): while True: try: subprocess.check_call( - ['curl', '{}:2222'.format(master)], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - logger.info('master {} is still up, waiting for it to exit'.format(master)) + ["curl", "{}:2222".format(master)], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + logger.info("master {} is still up, waiting for it to exit".format(master)) time.sleep(10) except subprocess.CalledProcessError: - logger.info('master {} is down, stopping parameter server'.format(master)) + logger.info("master {} is down, stopping parameter server".format(master)) return @@ -128,18 +131,19 @@ def train(env, cmd_args): """Get training job environment from env and run the training job. Args: - env (sagemaker_containers.beta.framework.env.TrainingEnv): Instance of TrainingEnv class + env (sagemaker_training.env.TrainingEnv): Instance of TrainingEnv class """ parameter_server_enabled = env.additional_framework_parameters.get( - SAGEMAKER_PARAMETER_SERVER_ENABLED, False) + SAGEMAKER_PARAMETER_SERVER_ENABLED, False + ) if len(env.hosts) > 1 and parameter_server_enabled: tf_config = _build_tf_config(hosts=env.hosts, current_host=env.current_host) - logger.info('Running distributed training job with parameter servers') - logger.info('Launching parameter server process') - _run_ps(env, tf_config['cluster']) - logger.info('Launching worker process') + logger.info("Running distributed training job with parameter servers") + logger.info("Launching parameter server process") + _run_ps(env, tf_config["cluster"]) + logger.info("Launching worker process") _run_worker(env, cmd_args, tf_config) if not _is_host_master(env.hosts, env.current_host): @@ -147,15 +151,21 @@ def train(env, cmd_args): else: - mpi_enabled = env.additional_framework_parameters.get('sagemaker_mpi_enabled') + mpi_enabled = env.additional_framework_parameters.get("sagemaker_mpi_enabled") if mpi_enabled: - runner_type = framework.runner.MPIRunnerType + runner_type = runner.MPIRunnerType else: - runner_type = framework.runner.ProcessRunnerType + runner_type = runner.ProcessRunnerType - framework.entry_point.run(env.module_dir, env.user_entry_point, cmd_args, env.to_env_vars(), - runner=runner_type) + entry_point.run( + uri=env.module_dir, + user_entry_point=env.user_entry_point, + args=cmd_args, + env_vars=env.to_env_vars(), + capture_error=True, + runner_type=runner_type, + ) def _log_model_missing_warning(model_dir): @@ -165,48 +175,56 @@ def _log_model_missing_warning(model_dir): if filenames: file_exists = True for f 
in filenames: - if 'saved_model.pb' in f or 'saved_model.pbtxt' in f: + if "saved_model.pb" in f or "saved_model.pbtxt" in f: pb_file_exists = True path, direct_parent_dir = os.path.split(dirpath) if not str.isdigit(direct_parent_dir): - logger.warn('Your model will NOT be servable with SageMaker TensorFlow Serving containers. ' - 'The SavedModel bundle is under directory \"{}\", not a numeric name.' - .format(direct_parent_dir)) + logger.warn( + "Your model will NOT be servable with SageMaker TensorFlow Serving containers. " + 'The SavedModel bundle is under directory "{}", not a numeric name.'.format( + direct_parent_dir + ) + ) if not file_exists: - logger.warn('No model artifact is saved under path {}.' - ' Your training job will not save any model files to S3.\n' - 'For details of how to construct your training script see:\n' - 'https://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script' - .format(model_dir)) + logger.warn( + "No model artifact is saved under path {}." + " Your training job will not save any model files to S3.\n" + "For details of how to construct your training script see:\n" + "https://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script".format( + model_dir + ) + ) elif not pb_file_exists: - logger.warn('Your model will NOT be servable with SageMaker TensorFlow Serving container. ' - 'The model artifact was not saved in the TensorFlow SavedModel directory structure:\n' - 'https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory') + logger.warn( + "Your model will NOT be servable with SageMaker TensorFlow Serving container. " + "The model artifact was not saved in the TensorFlow SavedModel directory structure:\n" + "https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory" + ) def _model_dir_with_training_job(model_dir, job_name): - if model_dir.startswith('/opt/ml'): + if model_dir and model_dir.startswith("/opt/ml"): return model_dir else: - return '{}/{}/model'.format(model_dir, job_name) + return "{}/{}/model".format(model_dir, job_name) def main(): """Training entry point """ - hyperparameters = framework.env.read_hyperparameters() - env = framework.training_env(hyperparameters=hyperparameters) + hyperparameters = environment.read_hyperparameters() + env = environment.Environment(hyperparameters=hyperparameters) user_hyperparameters = env.hyperparameters # If the training job is part of the multiple training jobs for tuning, we need to append the training job name to # model_dir in case they read from/write to the same object - if '_tuning_objective_metric' in hyperparameters: - model_dir = _model_dir_with_training_job(hyperparameters.get('model_dir'), env.job_name) - logger.info('Appending the training job name to model_dir: {}'.format(model_dir)) - user_hyperparameters['model_dir'] = model_dir + if "_tuning_objective_metric" in hyperparameters: + model_dir = _model_dir_with_training_job(hyperparameters.get("model_dir"), env.job_name) + logger.info("Appending the training job name to model_dir: {}".format(model_dir)) + user_hyperparameters["model_dir"] = model_dir - s3_utils.configure(user_hyperparameters.get('model_dir'), os.environ.get('SAGEMAKER_REGION')) - train(env, framework.mapping.to_cmd_args(user_hyperparameters)) + s3_utils.configure(user_hyperparameters.get("model_dir"), os.environ.get("SAGEMAKER_REGION")) + train(env, mapping.to_cmd_args(user_hyperparameters)) _log_model_missing_warning(MODEL_DIR) diff --git a/test/__init__.py 
b/test/__init__.py deleted file mode 100644 index 57862f92..00000000 --- a/test/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# or in the "license" file accompanying this file. This file is distributed -# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -# express or implied. See the License for the specific language governing -# permissions and limitations under the License. -from __future__ import absolute_import diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 00000000..56d58673 --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,197 @@ +# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. +from __future__ import absolute_import + +import logging +import os + +import boto3 +import pytest +from sagemaker import LocalSession, Session + +from utils import image_utils + +# these regions have some p2 and p3 instances, but not enough for automated testing +NO_P2_REGIONS = [ + "ca-central-1", + "eu-central-1", + "eu-west-2", + "us-west-1", + "eu-west-3", + "eu-north-1", + "sa-east-1", + "ap-east-1", + "me-south-1", +] +NO_P3_REGIONS = [ + "ap-southeast-1", + "ap-southeast-2", + "ap-south-1", + "ca-central-1", + "eu-central-1", + "eu-west-2", + "us-west-1", "eu-west-3", + "eu-north-1", + "sa-east-1", + "ap-east-1", + "me-south-1", +] + + +logger = logging.getLogger(__name__) +logging.getLogger("boto").setLevel(logging.INFO) +logging.getLogger("botocore").setLevel(logging.INFO) +logging.getLogger("factory.py").setLevel(logging.INFO) +logging.getLogger("auth.py").setLevel(logging.INFO) +logging.getLogger("connectionpool.py").setLevel(logging.INFO) + +DIR_PATH = os.path.dirname(os.path.realpath(__file__)) + + +def pytest_addoption(parser): + parser.addoption("--build-image", "-B", action="store_true") + parser.addoption("--push-image", "-P", action="store_true") + parser.addoption("--dockerfile-type", "-T", choices=["dlc.cpu", "dlc.gpu", "tf"], default="tf") + parser.addoption("--dockerfile", "-D", default=None) + parser.addoption("--docker-base-name", default="sagemaker-tensorflow-training") + parser.addoption("--tag", default=None) + parser.addoption("--region", default="us-west-2") + parser.addoption("--framework-version", default="1.15.2") + parser.addoption("--processor", default="cpu", choices=["cpu", "gpu", "cpu,gpu"]) + parser.addoption("--py-version", default="3", choices=["2", "3", "2,3"]) + parser.addoption("--account-id", default="142577830533") + parser.addoption("--instance-type", default=None) + + +def pytest_generate_tests(metafunc): + if "py_version" in metafunc.fixturenames: + py_version_params = ["py" + v for v in metafunc.config.getoption("--py-version").split(",")] + metafunc.parametrize("py_version", 
py_version_params, scope="session") + + if "processor" in metafunc.fixturenames: + processor_params = metafunc.config.getoption("--processor").split(",") + metafunc.parametrize("processor", processor_params, scope="session") + + +@pytest.fixture(scope="session", name="dockerfile_type") +def fixture_dockerfile_type(request): + return request.config.getoption("--dockerfile-type") + + +@pytest.fixture(scope="session", name="dockerfile") +def fixture_dockerfile(request, dockerfile_type): + dockerfile = request.config.getoption("--dockerfile") + return dockerfile if dockerfile else "Dockerfile.{}".format(dockerfile_type) + + +@pytest.fixture(scope="session", name="build_image", autouse=True) +def fixture_build_image(request, framework_version, dockerfile, image_uri, region): + build_image = request.config.getoption("--build-image") + if build_image: + return image_utils.build_image( + framework_version=framework_version, + dockerfile=dockerfile, + image_uri=image_uri, + region=region, + cwd=os.path.join(DIR_PATH, ".."), + ) + + return image_uri + + +@pytest.fixture(scope="session", name="push_image", autouse=True) +def fixture_push_image(request, image_uri, region, account_id): + push_image = request.config.getoption("--push-image") + if push_image: + return image_utils.push_image(image_uri, region, account_id) + return None + + +@pytest.fixture(scope="session") +def docker_base_name(request): + return request.config.getoption("--docker-base-name") + + +@pytest.fixture(scope="session") +def region(request): + return request.config.getoption("--region") + + +@pytest.fixture(scope="session") +def framework_version(request): + return request.config.getoption("--framework-version") + + +@pytest.fixture(scope="session") +def tag(request, framework_version, processor, py_version): + provided_tag = request.config.getoption("--tag") + default_tag = "{}-{}-py{}".format(framework_version, processor, py_version) + return provided_tag if provided_tag is not None else default_tag + + +@pytest.fixture(scope="session") +def sagemaker_session(region): + return Session(boto_session=boto3.Session(region_name=region)) + + +@pytest.fixture(scope="session") +def sagemaker_local_session(region): + return LocalSession(boto_session=boto3.Session(region_name=region)) + + +@pytest.fixture(scope="session") +def account_id(request): + return request.config.getoption("--account-id") + + +@pytest.fixture +def instance_type(request, processor): + provided_instance_type = request.config.getoption("--instance-type") + default_instance_type = "ml.c4.xlarge" if processor == "cpu" else "ml.p2.xlarge" + return provided_instance_type if provided_instance_type is not None else default_instance_type + + +@pytest.fixture(autouse=True) +def skip_by_device_type(request, processor): + is_gpu = processor == "gpu" + if (request.node.get_closest_marker("skip_gpu") and is_gpu) or ( + request.node.get_closest_marker("skip_cpu") and not is_gpu + ): + pytest.skip("Skipping because running on '{}' instance".format(processor)) + + +@pytest.fixture(autouse=True) +def skip_gpu_instance_restricted_regions(region, instance_type): + if (region in NO_P2_REGIONS and instance_type.startswith("ml.p2")) or ( + region in NO_P3_REGIONS and instance_type.startswith("ml.p3") + ): + pytest.skip("Skipping GPU test in region {}".format(region)) + + +@pytest.fixture(autouse=True) +def skip_by_dockerfile_type(request, dockerfile_type): + is_generic = dockerfile_type == "tf" + if request.node.get_closest_marker("skip_generic") and is_generic: + pytest.skip("Skipping 
because running generic image without mpi and horovod") + + +@pytest.fixture(name="docker_registry", scope="session") +def fixture_docker_registry(account_id, region): + return "{}.dkr.ecr.{}.amazonaws.com".format(account_id, region) if account_id else None + + +@pytest.fixture(name="image_uri", scope="session") +def fixture_image_uri(docker_registry, docker_base_name, tag): + if docker_registry: + return "{}/{}:{}".format(docker_registry, docker_base_name, tag) + return "{}:{}".format(docker_base_name, tag) diff --git a/test/container/1.15.2/Dockerfile.dlc.cpu b/test/container/1.15.2/Dockerfile.dlc.cpu new file mode 100644 index 00000000..98764974 --- /dev/null +++ b/test/container/1.15.2/Dockerfile.dlc.cpu @@ -0,0 +1,6 @@ +ARG region +FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:1.15.2-cpu-py2 + +COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz +RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ + rm /sagemaker_tensorflow_training.tar.gz diff --git a/test/container/1.15.2/Dockerfile.dlc.gpu b/test/container/1.15.2/Dockerfile.dlc.gpu new file mode 100644 index 00000000..15344f6e --- /dev/null +++ b/test/container/1.15.2/Dockerfile.dlc.gpu @@ -0,0 +1,6 @@ +ARG region +FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:1.15.2-gpu-py3 + +COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz +RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ + rm /sagemaker_tensorflow_training.tar.gz diff --git a/test/container/1.15.2/Dockerfile.tf b/test/container/1.15.2/Dockerfile.tf new file mode 100644 index 00000000..b1a62168 --- /dev/null +++ b/test/container/1.15.2/Dockerfile.tf @@ -0,0 +1,7 @@ +FROM tensorflow/tensorflow:1.15.2-gpu-py3 + +ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main + +COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz +RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ + rm /sagemaker_tensorflow_training.tar.gz diff --git a/test/integration/__init__.py b/test/integration/__init__.py index 966dd7d4..a2e25d25 100644 --- a/test/integration/__init__.py +++ b/test/integration/__init__.py @@ -14,35 +14,18 @@ import logging import os +import random +import time -logging.getLogger('boto3').setLevel(logging.INFO) -logging.getLogger('botocore').setLevel(logging.INFO) +logging.getLogger("boto3").setLevel(logging.INFO) +logging.getLogger("botocore").setLevel(logging.INFO) -RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', 'resources') +RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "resources") -# these regions have some p2 and p3 instances, but not enough for automated testing -NO_P2_REGIONS = [ - 'ca-central-1', - 'eu-central-1', - 'eu-west-2', - 'us-west-1', - 'eu-west-3', - 'eu-north-1', - 'sa-east-1', - 'ap-east-1', - 'me-south-1' -] -NO_P3_REGIONS = [ - 'ap-southeast-1', - 'ap-southeast-2', - 'ap-south-1', - 'ca-central-1', - 'eu-central-1', - 'eu-west-2', - 'us-west-1' - 'eu-west-3', - 'eu-north-1', - 'sa-east-1', - 'ap-east-1', - 'me-south-1' -] + +def unique_name_from_base(base, max_length=63): + unique = "%04x" % random.randrange(16 ** 4) # 4-digit hex + ts = str(int(time.time())) + available_length = max_length - 2 - len(ts) - len(unique) + trimmed = base[:available_length] + return "{}-{}-{}".format(trimmed, ts, unique) diff --git a/test/integration/conftest.py 
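For reference, a minimal self-contained sketch of the unique_name_from_base helper added to test/integration/__init__.py above, showing how the base is trimmed so the generated job name stays within max_length; the sample name printed below is hypothetical.

# Sketch reproducing the helper added above; the example output and assert are illustrative only.
import random
import time


def unique_name_from_base(base, max_length=63):
    unique = "%04x" % random.randrange(16 ** 4)  # 4-digit hex suffix
    ts = str(int(time.time()))                   # unix-timestamp segment
    available_length = max_length - 2 - len(ts) - len(unique)  # 2 joining hyphens
    trimmed = base[:available_length]
    return "{}-{}-{}".format(trimmed, ts, unique)


name = unique_name_from_base("test-tf-sm-tuning", max_length=32)
print(name)             # e.g. "test-tf-sm-tunin-1607452800-a1b2" (hypothetical timestamp/suffix)
assert len(name) <= 32  # trimming the base keeps the full name within the bound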
b/test/integration/conftest.py deleted file mode 100644 index 4b599675..00000000 --- a/test/integration/conftest.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# or in the "license" file accompanying this file. This file is distributed -# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -# express or implied. See the License for the specific language governing -# permissions and limitations under the License. -from __future__ import absolute_import - -import logging -import os - -import boto3 -import pytest -from sagemaker import LocalSession, Session -from sagemaker.tensorflow import TensorFlow - -from test.integration import NO_P2_REGIONS, NO_P3_REGIONS - -logger = logging.getLogger(__name__) -logging.getLogger('boto').setLevel(logging.INFO) -logging.getLogger('botocore').setLevel(logging.INFO) -logging.getLogger('factory.py').setLevel(logging.INFO) -logging.getLogger('auth.py').setLevel(logging.INFO) -logging.getLogger('connectionpool.py').setLevel(logging.INFO) - -SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__)) - - -def pytest_addoption(parser): - parser.addoption('--docker-base-name', default='sagemaker-tensorflow-scriptmode') - parser.addoption('--tag', default=None) - parser.addoption('--region', default='us-west-2') - parser.addoption('--framework-version', default=TensorFlow.LATEST_VERSION) - parser.addoption('--processor', default='cpu', choices=['cpu', 'gpu', 'cpu,gpu']) - parser.addoption('--py-version', default='3', choices=['2', '3', '2,3']) - parser.addoption('--account-id', default='142577830533') - parser.addoption('--instance-type', default=None) - - -def pytest_configure(config): - os.environ['TEST_PY_VERSIONS'] = config.getoption('--py-version') - os.environ['TEST_PROCESSORS'] = config.getoption('--processor') - - -@pytest.fixture(scope='session') -def docker_base_name(request): - return request.config.getoption('--docker-base-name') - - -@pytest.fixture(scope='session') -def region(request): - return request.config.getoption('--region') - - -@pytest.fixture(scope='session') -def framework_version(request): - return request.config.getoption('--framework-version') - - -@pytest.fixture -def tag(request, framework_version, processor, py_version): - provided_tag = request.config.getoption('--tag') - default_tag = '{}-{}-py{}'.format(framework_version, processor, py_version) - return provided_tag if provided_tag is not None else default_tag - - -@pytest.fixture(scope='session') -def sagemaker_session(region): - return Session(boto_session=boto3.Session(region_name=region)) - - -@pytest.fixture(scope='session') -def sagemaker_local_session(region): - return LocalSession(boto_session=boto3.Session(region_name=region)) - - -@pytest.fixture(scope='session') -def account_id(request): - return request.config.getoption('--account-id') - - -@pytest.fixture -def instance_type(request, processor): - provided_instance_type = request.config.getoption('--instance-type') - default_instance_type = 'ml.c4.xlarge' if processor == 'cpu' else 'ml.p2.xlarge' - return provided_instance_type if provided_instance_type is not None else default_instance_type - - -@pytest.fixture(autouse=True) -def skip_by_device_type(request, processor): - is_gpu = (processor == 'gpu') - 
if (request.node.get_closest_marker('skip_gpu') and is_gpu) or \ - (request.node.get_closest_marker('skip_cpu') and not is_gpu): - pytest.skip('Skipping because running on \'{}\' instance'.format(processor)) - - -@pytest.fixture(autouse=True) -def skip_gpu_instance_restricted_regions(region, instance_type): - if (region in NO_P2_REGIONS and instance_type.startswith('ml.p2')) or \ - (region in NO_P3_REGIONS and instance_type.startswith('ml.p3')): - pytest.skip('Skipping GPU test in region {}'.format(region)) - - -@pytest.fixture -def docker_image(docker_base_name, tag): - return '{}:{}'.format(docker_base_name, tag) - - -@pytest.fixture -def ecr_image(account_id, docker_base_name, tag, region): - return '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format( - account_id, region, docker_base_name, tag) diff --git a/test/integration/local/test_horovod.py b/test/integration/local/test_horovod.py index f35ba03a..2137f4ab 100644 --- a/test/integration/local/test_horovod.py +++ b/test/integration/local/test_horovod.py @@ -19,48 +19,64 @@ import pytest from sagemaker.tensorflow import TensorFlow -from test.integration.utils import processor, py_version # noqa: F401 +RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") -RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') + +@pytest.mark.skip_cpu +@pytest.mark.skip_generic +def test_distributed_training_horovod_gpu( + sagemaker_local_session, image_uri, tmpdir, framework_version +): + _test_distributed_training_horovod( + 1, 2, sagemaker_local_session, image_uri, tmpdir, framework_version, "local_gpu" + ) @pytest.mark.skip_gpu -@pytest.mark.parametrize('instances, processes', [ - [1, 2], - (2, 1), - (2, 2), - (5, 2)]) -def test_distributed_training_horovod_basic(instances, - processes, - sagemaker_local_session, - docker_image, - tmpdir, - framework_version): - output_path = 'file://%s' % tmpdir +@pytest.mark.skip_generic +@pytest.mark.parametrize("instances, processes", [(1, 2), (2, 1), (2, 2), (5, 2)]) +def test_distributed_training_horovod_cpu( + instances, processes, sagemaker_local_session, image_uri, tmpdir, framework_version +): + _test_distributed_training_horovod( + instances, processes, sagemaker_local_session, image_uri, tmpdir, framework_version, "local" + ) + + +def _test_distributed_training_horovod( + instances, processes, session, image_uri, tmpdir, framework_version, instance_type +): + output_path = "file://%s" % tmpdir estimator = TensorFlow( - entry_point=os.path.join(RESOURCE_PATH, 'hvdbasic', 'train_hvd_basic.py'), - role='SageMakerRole', - train_instance_type='local', - sagemaker_session=sagemaker_local_session, + entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_basic.py"), + role="SageMakerRole", + train_instance_type=instance_type, + sagemaker_session=session, train_instance_count=instances, - image_name=docker_image, + image_name=image_uri, output_path=output_path, framework_version=framework_version, - hyperparameters={'sagemaker_mpi_enabled': True, - 'sagemaker_network_interface_name': 'eth0', - 'sagemaker_mpi_num_of_processes_per_host': processes}) + hyperparameters={ + "sagemaker_mpi_enabled": True, + "sagemaker_network_interface_name": "eth0", + "sagemaker_mpi_num_of_processes_per_host": processes, + }, + ) - estimator.fit('file://{}'.format(os.path.join(RESOURCE_PATH, 'mnist', 'data-distributed'))) + estimator.fit("file://{}".format(os.path.join(RESOURCE_PATH, "mnist", "data-distributed"))) tmp = str(tmpdir) - extract_files(output_path.replace('file://', ''), 
tmp) + extract_files(output_path.replace("file://", ""), tmp) size = instances * processes for rank in range(size): local_rank = rank % processes - assert read_json('local-rank-%s-rank-%s' % (local_rank, rank), tmp) == { - 'local-rank': local_rank, 'rank': rank, 'size': size} + assert read_json("local-rank-%s-rank-%s" % (local_rank, rank), tmp) == { + "local-rank": local_rank, + "rank": rank, + "size": size, + } def read_json(file, tmp): @@ -69,14 +85,14 @@ def read_json(file, tmp): def assert_files_exist_in_tar(output_path, files): - if output_path.startswith('file://'): + if output_path.startswith("file://"): output_path = output_path[7:] - model_file = os.path.join(output_path, 'model.tar.gz') + model_file = os.path.join(output_path, "model.tar.gz") with tarfile.open(model_file) as tar: for f in files: tar.getmember(f) def extract_files(output_path, tmpdir): - with tarfile.open(os.path.join(output_path, 'model.tar.gz')) as tar: + with tarfile.open(os.path.join(output_path, "model.tar.gz")) as tar: tar.extractall(tmpdir) diff --git a/test/integration/local/test_keras.py b/test/integration/local/test_keras.py deleted file mode 100644 index 1eca0c2a..00000000 --- a/test/integration/local/test_keras.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import - -import logging -import os - -import numpy as np -import pytest -from sagemaker.tensorflow import serving, TensorFlow - -from test.integration import RESOURCE_PATH -from test.integration.utils import processor, py_version # noqa: F401 - - -logging.basicConfig(level=logging.DEBUG) - - -@pytest.mark.skip(reason="Serving part fails because of version mismatch.") -def test_keras_training(sagemaker_local_session, docker_image, tmpdir, framework_version): - entry_point = os.path.join(RESOURCE_PATH, 'keras_inception.py') - output_path = 'file://{}'.format(tmpdir) - - estimator = TensorFlow( - entry_point=entry_point, - role='SageMakerRole', - train_instance_count=1, - train_instance_type='local', - image_name=docker_image, - sagemaker_session=sagemaker_local_session, - model_dir='/opt/ml/model', - output_path=output_path, - framework_version=framework_version, - py_version='py3') - - estimator.fit() - - model = serving.Model(model_data=output_path, - role='SageMakerRole', - framework_version=framework_version, - sagemaker_session=sagemaker_local_session) - - predictor = model.deploy(initial_instance_count=1, instance_type='local') - - assert predictor.predict(np.random.randn(4, 4, 4, 2) * 255) - - predictor.delete_endpoint() diff --git a/test/integration/local/test_training.py b/test/integration/local/test_training.py index bd1641b0..35a676a6 100644 --- a/test/integration/local/test_training.py +++ b/test/integration/local/test_training.py @@ -18,136 +18,109 @@ import pytest from sagemaker.tensorflow import TensorFlow -from test.integration.utils import processor, py_version # noqa: F401 - -RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') -TF_CHECKPOINT_FILES = ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta'] +RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") +TF_CHECKPOINT_FILES = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"] @pytest.fixture # noqa: F811 def py_full_version(py_version): # noqa: F811 - if py_version == '2': - return '2.7' + if py_version == "2": + return "2.7" else: - return '3.6' - - -@pytest.mark.skip_gpu -def test_py_versions(sagemaker_local_session, docker_image, py_full_version, framework_version, tmpdir): - output_path = 'file://{}'.format(tmpdir) - run_tf_training(script=os.path.join(RESOURCE_PATH, 'test_py_version', 'entry.py'), - instance_type='local', - instance_count=1, - sagemaker_local_session=sagemaker_local_session, - docker_image=docker_image, - framework_version=framework_version, - output_path=output_path, - training_data_path=None) - - with tarfile.open(os.path.join(str(tmpdir), 'output.tar.gz')) as tar: - output_file = tar.getmember('py_version') - tar.extractall(path=str(tmpdir), members=[output_file]) - - with open(os.path.join(str(tmpdir), 'py_version')) as f: - assert f.read().strip() == py_full_version + return "3.6" @pytest.mark.skip_gpu -def test_mnist_cpu(sagemaker_local_session, docker_image, tmpdir, framework_version): - output_path = 'file://{}'.format(tmpdir) - run_tf_training(script=os.path.join(RESOURCE_PATH, 'mnist', 'mnist.py'), - instance_type='local', - instance_count=1, - sagemaker_local_session=sagemaker_local_session, - docker_image=docker_image, - framework_version=framework_version, - output_path=output_path, - training_data_path='file://{}'.format( - os.path.join(RESOURCE_PATH, 'mnist', 'data'))) - _assert_files_exist_in_tar(output_path, ['my_model.h5']) - - -@pytest.mark.skip_cpu -def 
test_gpu(sagemaker_local_session, docker_image, framework_version): - run_tf_training(script=os.path.join(RESOURCE_PATH, 'gpu_device_placement.py'), - instance_type='local_gpu', - instance_count=1, - sagemaker_local_session=sagemaker_local_session, - docker_image=docker_image, - framework_version=framework_version, - training_data_path='file://{}'.format( - os.path.join(RESOURCE_PATH, 'mnist', 'data'))) +def test_mnist_cpu(sagemaker_local_session, image_uri, tmpdir, framework_version): + output_path = "file://{}".format(tmpdir) + run_tf_training( + script=os.path.join(RESOURCE_PATH, "mnist", "mnist.py"), + instance_type="local", + instance_count=1, + sagemaker_local_session=sagemaker_local_session, + image_uri=image_uri, + framework_version=framework_version, + output_path=output_path, + training_data_path="file://{}".format(os.path.join(RESOURCE_PATH, "mnist", "data")), + ) + _assert_files_exist_in_tar(output_path, ["my_model.h5"]) @pytest.mark.skip_gpu -def test_distributed_training_cpu_no_ps(sagemaker_local_session, - docker_image, - tmpdir, - framework_version): - output_path = 'file://{}'.format(tmpdir) - run_tf_training(script=os.path.join(RESOURCE_PATH, 'mnist', 'mnist_estimator.py'), - instance_type='local', - instance_count=2, - sagemaker_local_session=sagemaker_local_session, - docker_image=docker_image, - framework_version=framework_version, - output_path=output_path, - training_data_path='file://{}'.format( - os.path.join(RESOURCE_PATH, 'mnist', 'data-distributed'))) +def test_distributed_training_cpu_no_ps( + sagemaker_local_session, image_uri, tmpdir, framework_version +): + output_path = "file://{}".format(tmpdir) + run_tf_training( + script=os.path.join(RESOURCE_PATH, "mnist", "mnist_estimator.py"), + instance_type="local", + instance_count=2, + sagemaker_local_session=sagemaker_local_session, + image_uri=image_uri, + framework_version=framework_version, + output_path=output_path, + training_data_path="file://{}".format( + os.path.join(RESOURCE_PATH, "mnist", "data-distributed") + ), + ) _assert_files_exist_in_tar(output_path, TF_CHECKPOINT_FILES) @pytest.mark.skip_gpu -def test_distributed_training_cpu_ps(sagemaker_local_session, - docker_image, - tmpdir, - framework_version): - output_path = 'file://{}'.format(tmpdir) - run_tf_training(script=os.path.join(RESOURCE_PATH, 'mnist', 'mnist_estimator.py'), - instance_type='local', - instance_count=2, - sagemaker_local_session=sagemaker_local_session, - docker_image=docker_image, - framework_version=framework_version, - output_path=output_path, - hyperparameters={'sagemaker_parameter_server_enabled': True}, - training_data_path='file://{}'.format( - os.path.join(RESOURCE_PATH, 'mnist', 'data-distributed'))) +def test_distributed_training_cpu_ps(sagemaker_local_session, image_uri, tmpdir, framework_version): + output_path = "file://{}".format(tmpdir) + run_tf_training( + script=os.path.join(RESOURCE_PATH, "mnist", "mnist_estimator.py"), + instance_type="local", + instance_count=2, + sagemaker_local_session=sagemaker_local_session, + image_uri=image_uri, + framework_version=framework_version, + output_path=output_path, + hyperparameters={"sagemaker_parameter_server_enabled": True}, + training_data_path="file://{}".format( + os.path.join(RESOURCE_PATH, "mnist", "data-distributed") + ), + ) _assert_files_exist_in_tar(output_path, TF_CHECKPOINT_FILES) -def run_tf_training(script, - instance_type, - instance_count, - sagemaker_local_session, - docker_image, - framework_version, - training_data_path, - output_path=None, - 
hyperparameters=None): +def run_tf_training( + script, + instance_type, + instance_count, + sagemaker_local_session, + image_uri, + framework_version, + training_data_path, + output_path=None, + hyperparameters=None, +): hyperparameters = hyperparameters or {} - estimator = TensorFlow(entry_point=script, - role='SageMakerRole', - train_instance_count=instance_count, - train_instance_type=instance_type, - sagemaker_session=sagemaker_local_session, - image_name=docker_image, - model_dir='/opt/ml/model', - output_path=output_path, - hyperparameters=hyperparameters, - base_job_name='test-tf', - framework_version=framework_version, - py_version='py3') + estimator = TensorFlow( + entry_point=script, + role="SageMakerRole", + train_instance_count=instance_count, + train_instance_type=instance_type, + sagemaker_session=sagemaker_local_session, + image_name=image_uri, + model_dir="/opt/ml/model", + output_path=output_path, + hyperparameters=hyperparameters, + base_job_name="test-tf", + framework_version=framework_version, + py_version="py3", + ) estimator.fit(training_data_path) def _assert_files_exist_in_tar(output_path, files): - if output_path.startswith('file://'): + if output_path.startswith("file://"): output_path = output_path[7:] - model_file = os.path.join(output_path, 'model.tar.gz') + model_file = os.path.join(output_path, "model.tar.gz") with tarfile.open(model_file) as tar: for f in files: tar.getmember(f) diff --git a/test/integration/sagemaker/test_horovod.py b/test/integration/sagemaker/test_horovod.py index 1d2bd8ac..de7c3ff1 100644 --- a/test/integration/sagemaker/test_horovod.py +++ b/test/integration/sagemaker/test_horovod.py @@ -14,39 +14,68 @@ import os +import pytest import sagemaker from sagemaker.tensorflow import TensorFlow +from sagemaker.utils import unique_name_from_base -from test.integration.utils import processor, py_version, unique_name_from_base # noqa: F401 +RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") -RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') +@pytest.mark.skip_generic +def test_distributed_training_horovod( + sagemaker_session, instance_type, image_uri, tmpdir, framework_version +): -def test_distributed_training_horovod(sagemaker_session, - instance_type, - ecr_image, - tmpdir, - framework_version): - - mpi_options = '-verbose -x orte_base_help_aggregate=0' + mpi_options = "-verbose -x orte_base_help_aggregate=0" estimator = TensorFlow( - entry_point=os.path.join(RESOURCE_PATH, 'mnist', 'horovod_mnist.py'), - role='SageMakerRole', + entry_point=os.path.join(RESOURCE_PATH, "mnist", "horovod_mnist.py"), + role="SageMakerRole", train_instance_type=instance_type, train_instance_count=2, - image_name=ecr_image, + image_name=image_uri, framework_version=framework_version, - py_version='py3', + py_version="py3", script_mode=True, - hyperparameters={'sagemaker_mpi_enabled': True, - 'sagemaker_mpi_custom_mpi_options': mpi_options, - 'sagemaker_mpi_num_of_processes_per_host': 1}, - sagemaker_session=sagemaker_session) + hyperparameters={ + "sagemaker_mpi_enabled": True, + "sagemaker_mpi_custom_mpi_options": mpi_options, + "sagemaker_mpi_num_of_processes_per_host": 1, + }, + sagemaker_session=sagemaker_session, + ) - estimator.fit(job_name=unique_name_from_base('test-tf-horovod')) + estimator.fit(job_name=unique_name_from_base("test-tf-horovod")) model_data_source = sagemaker.local.data.get_data_source_instance( - estimator.model_data, sagemaker_session) + estimator.model_data, sagemaker_session + 
) for filename in model_data_source.get_file_list(): - assert os.path.basename(filename) == 'model.tar.gz' + assert os.path.basename(filename) == "model.tar.gz" + + +@pytest.mark.skip_generic +def test_distributed_training_horovod_with_env_vars( + sagemaker_session, instance_type, image_uri, tmpdir, framework_version +): + + mpi_options = "-verbose -x orte_base_help_aggregate=0" + estimator = TensorFlow( + entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_env_vars.py"), + role="SageMakerRole", + train_instance_type=instance_type, + train_instance_count=2, + image_name=image_uri, + framework_version=framework_version, + py_version="py3", + script_mode=True, + hyperparameters={ + "sagemaker_mpi_enabled": True, + "sagemaker_mpi_custom_mpi_options": mpi_options, + "sagemaker_mpi_num_of_processes_per_host": 2, + }, + sagemaker_session=sagemaker_session, + ) + + estimator.fit(job_name=unique_name_from_base("test-tf-horovod-env-vars")) diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py index 25c8db3e..c466f573 100644 --- a/test/integration/sagemaker/test_mnist.py +++ b/test/integration/sagemaker/test_mnist.py @@ -18,143 +18,125 @@ import pytest from sagemaker.tensorflow import TensorFlow from sagemaker.tuner import HyperparameterTuner, IntegerParameter +from sagemaker.utils import unique_name_from_base from six.moves.urllib.parse import urlparse -from test.integration.utils import processor, py_version, unique_name_from_base # noqa: F401 from timeout import timeout @pytest.mark.deploy_test -def test_mnist(sagemaker_session, ecr_image, instance_type, framework_version): - resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') - script = os.path.join(resource_path, 'mnist', 'mnist.py') - estimator = TensorFlow(entry_point=script, - role='SageMakerRole', - train_instance_type=instance_type, - train_instance_count=1, - sagemaker_session=sagemaker_session, - image_name=ecr_image, - framework_version=framework_version, - script_mode=True) +def test_mnist(sagemaker_session, image_uri, instance_type, framework_version): + resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") + script = os.path.join(resource_path, "mnist", "mnist.py") + estimator = TensorFlow( + entry_point=script, + role="SageMakerRole", + train_instance_type=instance_type, + train_instance_count=1, + sagemaker_session=sagemaker_session, + image_name=image_uri, + framework_version=framework_version, + script_mode=True, + ) inputs = estimator.sagemaker_session.upload_data( - path=os.path.join(resource_path, 'mnist', 'data'), - key_prefix='scriptmode/mnist') - estimator.fit(inputs, job_name=unique_name_from_base('test-sagemaker-mnist')) + path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist" + ) + estimator.fit(inputs, job_name=unique_name_from_base("test-sagemaker-mnist")) _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data) -def test_distributed_mnist_no_ps(sagemaker_session, ecr_image, instance_type, framework_version): - resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') - script = os.path.join(resource_path, 'mnist', 'mnist.py') - estimator = TensorFlow(entry_point=script, - role='SageMakerRole', - train_instance_count=2, - train_instance_type=instance_type, - sagemaker_session=sagemaker_session, - image_name=ecr_image, - framework_version=framework_version, - script_mode=True) +def test_distributed_mnist_no_ps(sagemaker_session, image_uri, 
instance_type, framework_version): + resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") + script = os.path.join(resource_path, "mnist", "mnist.py") + estimator = TensorFlow( + entry_point=script, + role="SageMakerRole", + train_instance_count=2, + train_instance_type=instance_type, + sagemaker_session=sagemaker_session, + image_name=image_uri, + framework_version=framework_version, + script_mode=True, + ) inputs = estimator.sagemaker_session.upload_data( - path=os.path.join(resource_path, 'mnist', 'data'), - key_prefix='scriptmode/mnist') - estimator.fit(inputs, job_name=unique_name_from_base('test-tf-sm-distributed-mnist')) + path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist" + ) + estimator.fit(inputs, job_name=unique_name_from_base("test-tf-sm-distributed-mnist")) _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data) -def test_distributed_mnist_ps(sagemaker_session, ecr_image, instance_type, framework_version): - resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') - script = os.path.join(resource_path, 'mnist', 'mnist_estimator.py') - estimator = TensorFlow(entry_point=script, - role='SageMakerRole', - hyperparameters={'sagemaker_parameter_server_enabled': True}, - train_instance_count=2, - train_instance_type=instance_type, - sagemaker_session=sagemaker_session, - image_name=ecr_image, - framework_version=framework_version, - script_mode=True) +def test_distributed_mnist_ps(sagemaker_session, image_uri, instance_type, framework_version): + resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") + script = os.path.join(resource_path, "mnist", "mnist_estimator.py") + estimator = TensorFlow( + entry_point=script, + role="SageMakerRole", + hyperparameters={"sagemaker_parameter_server_enabled": True}, + train_instance_count=2, + train_instance_type=instance_type, + sagemaker_session=sagemaker_session, + image_name=image_uri, + framework_version=framework_version, + script_mode=True, + ) inputs = estimator.sagemaker_session.upload_data( - path=os.path.join(resource_path, 'mnist', 'data-distributed'), - key_prefix='scriptmode/mnist-distributed') - estimator.fit(inputs, job_name=unique_name_from_base('test-tf-sm-distributed-mnist')) + path=os.path.join(resource_path, "mnist", "data-distributed"), + key_prefix="scriptmode/mnist-distributed", + ) + estimator.fit(inputs, job_name=unique_name_from_base("test-tf-sm-distributed-mnist")) _assert_checkpoint_exists(sagemaker_session.boto_region_name, estimator.model_dir, 0) _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data) -def test_s3_plugin(sagemaker_session, ecr_image, instance_type, region, framework_version): - resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') - script = os.path.join(resource_path, 'mnist', 'mnist_estimator.py') - estimator = TensorFlow(entry_point=script, - role='SageMakerRole', - hyperparameters={ - # Saving a checkpoint after every 5 steps to hammer the S3 plugin - 'save-checkpoint-steps': 10, - # Disable throttling for checkpoint and model saving - 'throttle-secs': 0, - # Without the patch training jobs would fail around 100th to - # 150th step - 'max-steps': 200, - # Large batch size would result in a larger checkpoint file - 'batch-size': 1024, - # This makes the training job exporting model during training. - # Stale model garbage collection will also be performed. 
- 'export-model-during-training': True - }, - train_instance_count=1, - train_instance_type=instance_type, - sagemaker_session=sagemaker_session, - image_name=ecr_image, - framework_version=framework_version, - script_mode=True) - estimator.fit('s3://sagemaker-sample-data-{}/tensorflow/mnist'.format(region), - job_name=unique_name_from_base('test-tf-sm-s3-mnist')) - _assert_s3_file_exists(region, estimator.model_data) - _assert_checkpoint_exists(region, estimator.model_dir, 200) - - -def test_tuning(sagemaker_session, ecr_image, instance_type, framework_version): - resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') - script = os.path.join(resource_path, 'mnist', 'mnist.py') - - estimator = TensorFlow(entry_point=script, - role='SageMakerRole', - train_instance_type=instance_type, - train_instance_count=1, - sagemaker_session=sagemaker_session, - image_name=ecr_image, - framework_version=framework_version, - script_mode=True) - - hyperparameter_ranges = {'epochs': IntegerParameter(1, 2)} - objective_metric_name = 'accuracy' - metric_definitions = [{'Name': objective_metric_name, 'Regex': 'accuracy = ([0-9\\.]+)'}] - - tuner = HyperparameterTuner(estimator, - objective_metric_name, - hyperparameter_ranges, - metric_definitions, - max_jobs=2, - max_parallel_jobs=2) +def test_tuning(sagemaker_session, image_uri, instance_type, framework_version): + resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") + script = os.path.join(resource_path, "mnist", "mnist.py") + + estimator = TensorFlow( + entry_point=script, + role="SageMakerRole", + train_instance_type=instance_type, + train_instance_count=1, + sagemaker_session=sagemaker_session, + image_name=image_uri, + framework_version=framework_version, + script_mode=True, + ) + + hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)} + objective_metric_name = "accuracy" + metric_definitions = [{"Name": objective_metric_name, "Regex": "accuracy = ([0-9\\.]+)"}] + + tuner = HyperparameterTuner( + estimator, + objective_metric_name, + hyperparameter_ranges, + metric_definitions, + max_jobs=2, + max_parallel_jobs=2, + ) with timeout(minutes=20): inputs = estimator.sagemaker_session.upload_data( - path=os.path.join(resource_path, 'mnist', 'data'), - key_prefix='scriptmode/mnist') + path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist" + ) - tuning_job_name = unique_name_from_base('test-tf-sm-tuning', max_length=32) + tuning_job_name = unique_name_from_base("test-tf-sm-tuning", max_length=32) tuner.fit(inputs, job_name=tuning_job_name) tuner.wait() def _assert_checkpoint_exists(region, model_dir, checkpoint_number): - _assert_s3_file_exists(region, os.path.join(model_dir, 'graph.pbtxt')) - _assert_s3_file_exists(region, - os.path.join(model_dir, 'model.ckpt-{}.index'.format(checkpoint_number))) - _assert_s3_file_exists(region, - os.path.join(model_dir, 'model.ckpt-{}.meta'.format(checkpoint_number))) + _assert_s3_file_exists(region, os.path.join(model_dir, "graph.pbtxt")) + _assert_s3_file_exists( + region, os.path.join(model_dir, "model.ckpt-{}.index".format(checkpoint_number)) + ) + _assert_s3_file_exists( + region, os.path.join(model_dir, "model.ckpt-{}.meta".format(checkpoint_number)) + ) def _assert_s3_file_exists(region, s3_url): parsed_url = urlparse(s3_url) - s3 = boto3.resource('s3', region_name=region) - s3.Object(parsed_url.netloc, parsed_url.path.lstrip('/')).load() + s3 = boto3.resource("s3", region_name=region) + s3.Object(parsed_url.netloc, 
parsed_url.path.lstrip("/")).load() diff --git a/test/integration/sagemaker/test_tuning_model_dir.py b/test/integration/sagemaker/test_tuning_model_dir.py index e833c3a4..c113c1cb 100644 --- a/test/integration/sagemaker/test_tuning_model_dir.py +++ b/test/integration/sagemaker/test_tuning_model_dir.py @@ -16,30 +16,35 @@ from sagemaker.tensorflow import TensorFlow from sagemaker.tuner import HyperparameterTuner, IntegerParameter - -from test.integration.utils import processor, py_version, unique_name_from_base # noqa: F401 - - -def test_model_dir_with_training_job_name(sagemaker_session, ecr_image, instance_type, framework_version): - resource_path = os.path.join(os.path.dirname(__file__), '../..', 'resources') - script = os.path.join(resource_path, 'tuning_model_dir', 'entry.py') - - estimator = TensorFlow(entry_point=script, - role='SageMakerRole', - train_instance_type=instance_type, - train_instance_count=1, - image_name=ecr_image, - framework_version=framework_version, - py_version='py3', - sagemaker_session=sagemaker_session) - - tuner = HyperparameterTuner(estimator=estimator, - objective_metric_name='accuracy', - hyperparameter_ranges={'arbitrary_value': IntegerParameter(0, 1)}, - metric_definitions=[{'Name': 'accuracy', 'Regex': 'accuracy=([01])'}], - max_jobs=1, - max_parallel_jobs=1) +from sagemaker.utils import unique_name_from_base + + +def test_model_dir_with_training_job_name( + sagemaker_session, image_uri, instance_type, framework_version +): + resource_path = os.path.join(os.path.dirname(__file__), "../..", "resources") + script = os.path.join(resource_path, "tuning_model_dir", "entry.py") + + estimator = TensorFlow( + entry_point=script, + role="SageMakerRole", + train_instance_type=instance_type, + train_instance_count=1, + image_name=image_uri, + framework_version=framework_version, + py_version="py3", + sagemaker_session=sagemaker_session, + ) + + tuner = HyperparameterTuner( + estimator=estimator, + objective_metric_name="accuracy", + hyperparameter_ranges={"arbitrary_value": IntegerParameter(0, 1)}, + metric_definitions=[{"Name": "accuracy", "Regex": "accuracy=([01])"}], + max_jobs=1, + max_parallel_jobs=1, + ) # User script has logic to check for the correct model_dir - tuner.fit(job_name=unique_name_from_base('test-tf-model-dir', max_length=32)) + tuner.fit(job_name=unique_name_from_base("test-tf-model-dir", max_length=32)) tuner.wait() diff --git a/test/integration/sagemaker/timeout.py b/test/integration/sagemaker/timeout.py index d4738d32..1ff4278c 100644 --- a/test/integration/sagemaker/timeout.py +++ b/test/integration/sagemaker/timeout.py @@ -16,7 +16,7 @@ import logging import signal -LOGGER = logging.getLogger('timeout') +LOGGER = logging.getLogger("timeout") class TimeoutError(Exception): @@ -39,7 +39,7 @@ def timeout(seconds=0, minutes=0, hours=0): limit = seconds + 60 * minutes + 3600 * hours def handler(signum, frame): - raise TimeoutError('timed out after {} seconds'.format(limit)) + raise TimeoutError("timed out after {} seconds".format(limit)) try: signal.signal(signal.SIGALRM, handler) diff --git a/test/integration/utils.py b/test/integration/utils.py deleted file mode 100644 index 4944eb20..00000000 --- a/test/integration/utils.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. 
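As a quick usage sketch of the timeout helper patched above (assumptions: it runs alongside test/integration/sagemaker/timeout.py, and long_running_job is a placeholder for any slow SageMaker call): the context manager installs a SIGALRM handler and is intended to raise TimeoutError once the combined limit elapses.

# Usage sketch for the timeout context manager; long_running_job is a hypothetical stand-in.
from timeout import timeout, TimeoutError


def long_running_job():
    pass  # e.g. estimator.fit(...) or tuner.wait()


try:
    with timeout(minutes=20):   # limit = 20 * 60 = 1200 seconds
        long_running_job()
except TimeoutError as e:
    print(e)                    # "timed out after 1200 seconds"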
A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import - -import os -import random -import time - -import pytest - - -def unique_name_from_base(base, max_length=63): - unique = '%04x' % random.randrange(16**4) # 4-digit hex - ts = str(int(time.time())) - available_length = max_length - 2 - len(ts) - len(unique) - trimmed = base[:available_length] - return '{}-{}-{}'.format(trimmed, ts, unique) - - -@pytest.fixture(params=os.environ['TEST_PY_VERSIONS'].split(',')) -def py_version(request): - return request.param - - -@pytest.fixture(params=os.environ['TEST_PROCESSORS'].split(',')) -def processor(request): - return request.param diff --git a/test/resources/gpu_device_placement.py b/test/resources/gpu_device_placement.py deleted file mode 100644 index 11bbcdff..00000000 --- a/test/resources/gpu_device_placement.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# or in the "license" file accompanying this file. This file is distributed -# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -# express or implied. See the License for the specific language governing -# permissions and limitations under the License. -import tensorflow as tf - -# https://www.tensorflow.org/programmers_guide/using_gpu -print('-' * 87) -print('Run GPU test.') -with tf.device('/gpu:0'): - a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') - b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') -c = tf.matmul(a, b) -sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) -# Runs the op. 
-print(sess.run(c)) -print('-' * 87) -print('') diff --git a/test/resources/hvdbasic/train_hvd_basic.py b/test/resources/hvdbasic/train_hvd_basic.py index cc068678..24a35a8b 100644 --- a/test/resources/hvdbasic/train_hvd_basic.py +++ b/test/resources/hvdbasic/train_hvd_basic.py @@ -4,8 +4,10 @@ hvd.init() -with open(os.path.join('/opt/ml/model/local-rank-%s-rank-%s' % (hvd.local_rank(), hvd.rank())), 'w+') as f: - basic_info = {'local-rank': hvd.local_rank(), 'rank': hvd.rank(), 'size': hvd.size()} +with open( + os.path.join("/opt/ml/model/local-rank-%s-rank-%s" % (hvd.local_rank(), hvd.rank())), "w+" +) as f: + basic_info = {"local-rank": hvd.local_rank(), "rank": hvd.rank(), "size": hvd.size()} print(basic_info) json.dump(basic_info, f) diff --git a/test/resources/hvdbasic/train_hvd_env_vars.py b/test/resources/hvdbasic/train_hvd_env_vars.py new file mode 100644 index 00000000..da67367c --- /dev/null +++ b/test/resources/hvdbasic/train_hvd_env_vars.py @@ -0,0 +1,19 @@ +import json +import os +import horovod.tensorflow as hvd + +hvd.init() + +with open("/opt/ml/model/local-rank-%s-rank-%s" % (hvd.local_rank(), hvd.rank()), "w+") as f: + basic_info = {"local-rank": hvd.local_rank(), "rank": hvd.rank(), "size": hvd.size()} + + print(basic_info) + json.dump(basic_info, f) + +val = os.environ.get("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") +host = os.environ.get("SM_CURRENT_HOST") + +assert val is not None +assert host is not None + +print("host {}: AWS_CONTAINER_CREDENTIALS_RELATIVE_URI={}".format(host, val)) diff --git a/test/resources/keras_inception.py b/test/resources/keras_inception.py deleted file mode 100644 index ebfd1a0e..00000000 --- a/test/resources/keras_inception.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# or in the "license" file accompanying this file. This file is distributed -# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -# express or implied. See the License for the specific language governing -# permissions and limitations under the License. -import argparse -import os - -import keras -import tensorflow as tf - -parser = argparse.ArgumentParser() -parser.add_argument('--model_dir', type=str) - -args = parser.parse_args() - - -# Loading pre-trained Keras model -model = keras.applications.inception_v3.InceptionV3(weights='imagenet') - -# Exports the keras model as TensorFlow Serving Saved Model -with tf.Session() as session: - - init = tf.global_variables_initializer() - session.run(init) - - tf.saved_model.simple_save( - session, - os.path.join(args.model_dir, 'inception-model/1'), - inputs={'input_image': model.input}, - outputs={t.name: t for t in model.outputs}) diff --git a/test/resources/mnist/horovod_mnist.py b/test/resources/mnist/horovod_mnist.py index 1014f2bb..f2bf4e8f 100644 --- a/test/resources/mnist/horovod_mnist.py +++ b/test/resources/mnist/horovod_mnist.py @@ -10,120 +10,84 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. 
-from __future__ import absolute_import, print_function - import os -import subprocess - -import keras -from keras.datasets import mnist -from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras import backend as K import tensorflow as tf -import horovod.keras as hvd - +import horovod.tensorflow as hvd # Horovod: initialize Horovod. hvd.init() # Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -config.gpu_options.allow_growth = True -config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.Session(config=config)) - -batch_size = 128 -num_classes = 10 - -epochs = 1 - -# Input image dimensions -img_rows, img_cols = 28, 28 - -# The data, shuffled and split between train and test sets -(x_train, y_train), (x_test, y_test) = mnist.load_data() - -x_train = x_train[:600] -y_train = y_train[:600] -x_test = x_test[:100] -y_test = y_test[:100] - -if K.image_data_format() == 'channels_first': - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) - x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) - input_shape = (img_rows, img_cols, 1) - -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') -x_train /= 255 -x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') - -# Convert class vectors to binary class matrices -y_train = keras.utils.to_categorical(y_train, num_classes) -y_test = keras.utils.to_categorical(y_test, num_classes) - -model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) -model.add(MaxPooling2D(pool_size=(2, 2))) -model.add(Dropout(0.25)) -model.add(Flatten()) -model.add(Dense(128, activation='relu')) -model.add(Dropout(0.5)) -model.add(Dense(num_classes, activation='softmax')) +gpus = tf.config.experimental.list_physical_devices("GPU") +for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU") + +(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data( + path="mnist-%d.npz" % hvd.rank() +) + +dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64)) +) +dataset = dataset.repeat().shuffle(10000).batch(128) + +mnist_model = tf.keras.Sequential( + [ + tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), + tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation="softmax"), + ] +) +loss = tf.losses.SparseCategoricalCrossentropy() # Horovod: adjust learning rate based on number of GPUs. 
-opt = keras.optimizers.Adadelta(1.0 * hvd.size()) +opt = tf.optimizers.Adam(0.001 * hvd.size()) + +checkpoint_dir = "./checkpoints" +checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) + + +@tf.function +def training_step(images, labels, first_batch): + with tf.GradientTape() as tape: + probs = mnist_model(images, training=True) + loss_value = loss(labels, probs) -# Horovod: add Horovod Distributed Optimizer. -opt = hvd.DistributedOptimizer(opt) + # Horovod: add Horovod Distributed GradientTape. + tape = hvd.DistributedGradientTape(tape) -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=opt, - metrics=['accuracy']) + grads = tape.gradient(loss_value, mnist_model.trainable_variables) + opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) -callbacks = [ # Horovod: broadcast initial variable states from rank 0 to all other processes. # This is necessary to ensure consistent initialization of all workers when # training is started with random weights or restored from a checkpoint. - hvd.callbacks.BroadcastGlobalVariablesCallback(0), -] + # + # Note: broadcast should be done after the first gradient step to ensure optimizer + # initialization. + if first_batch: + hvd.broadcast_variables(mnist_model.variables, root_rank=0) + hvd.broadcast_variables(opt.variables(), root_rank=0) -# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. -if hvd.rank() == 0: - callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5')) - -model.fit(x_train, y_train, - batch_size=batch_size, - callbacks=callbacks, - epochs=epochs, - verbose=1, - validation_data=(x_test, y_test)) -score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) + return loss_value -if hvd.rank() == 0: - # Exports the keras model as TensorFlow Serving Saved Model - with K.get_session() as session: +# Horovod: adjust number of steps based on number of GPUs. +for batch, (images, labels) in enumerate(dataset.take(600 // hvd.size())): + loss_value = training_step(images, labels, batch == 0) - init = tf.global_variables_initializer() - session.run(init) + if batch % 10 == 0 and hvd.local_rank() == 0: + print("Step #%d\tLoss: %.6f" % (batch, loss_value)) - tf.saved_model.simple_save( - session, - os.path.join('/opt/ml/model/mnist/1'), - inputs={'input_image': model.input}, - outputs={t.name: t for t in model.outputs}) +# Horovod: save checkpoints only on worker 0 to prevent other workers from +# corrupting it. +if hvd.rank() == 0: + # Export the keras model as Tensorflow SavedModelBundle + mnist_model.save(os.path.join("/opt/ml/model/mnist/1"), save_format="tf") diff --git a/test/resources/mnist/mnist.py b/test/resources/mnist/mnist.py index e4349ce2..e1c2b275 100644 --- a/test/resources/mnist/mnist.py +++ b/test/resources/mnist/mnist.py @@ -7,63 +7,49 @@ import tensorflow as tf - def _parse_args(): parser = argparse.ArgumentParser() # hyperparameters sent by the client are passed as command-line arguments to the script. 
- parser.add_argument('--epochs', type=int, default=1) + parser.add_argument("--epochs", type=int, default=1) # Data, model, and output directories - parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) - parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING']) - parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS'])) - parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST']) + parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) + parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAINING"]) + parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) + parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"]) return parser.parse_known_args() def _load_training_data(base_dir): - x_train = np.load(os.path.join(base_dir, 'train', 'x_train.npy')) - y_train = np.load(os.path.join(base_dir, 'train', 'y_train.npy')) + x_train = np.load(os.path.join(base_dir, "train", "x_train.npy")) + y_train = np.load(os.path.join(base_dir, "train", "y_train.npy")) return x_train, y_train def _load_testing_data(base_dir): - x_test = np.load(os.path.join(base_dir, 'test', 'x_test.npy')) - y_test = np.load(os.path.join(base_dir, 'test', 'y_test.npy')) + x_test = np.load(os.path.join(base_dir, "test", "x_test.npy")) + y_test = np.load(os.path.join(base_dir, "test", "y_test.npy")) return x_test, y_test -def assert_can_track_sagemaker_experiments(): - in_sagemaker_training = 'TRAINING_JOB_ARN' in os.environ - in_python_three = sys.version_info[0] == 3 - - if in_sagemaker_training and in_python_three: - import smexperiments.tracker - - with smexperiments.tracker.Tracker.load() as tracker: - tracker.log_parameter('param', 1) - tracker.log_metric('metric', 1.0) - - args, unknown = _parse_args() -model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(512, activation=tf.nn.relu), - tf.keras.layers.Dropout(0.2), - tf.keras.layers.Dense(10, activation=tf.nn.softmax) -]) +model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(512, activation=tf.nn.relu), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation=tf.nn.softmax), + ] +) -model.compile(optimizer='adam', - loss='sparse_categorical_crossentropy', - metrics=['accuracy']) +model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]) x_train, y_train = _load_training_data(args.train) x_test, y_test = _load_testing_data(args.train) model.fit(x_train, y_train, epochs=args.epochs) model.evaluate(x_test, y_test) if args.current_host == args.hosts[0]: - model.save(os.path.join('/opt/ml/model', 'my_model.h5')) - assert_can_track_sagemaker_experiments() + model.save(os.path.join("/opt/ml/model", "my_model.h5")) diff --git a/test/resources/mnist/mnist_estimator.py b/test/resources/mnist/mnist_estimator.py index d0b991f2..82fb75ac 100644 --- a/test/resources/mnist/mnist_estimator.py +++ b/test/resources/mnist/mnist_estimator.py @@ -4,171 +4,176 @@ from __future__ import division from __future__ import print_function +import logging + import numpy as np import tensorflow as tf import os import argparse import json + def cnn_model_fn(features, labels, mode): - """Model function for CNN.""" - # Input Layer - # Reshape X to 4-D tensor: [batch_size, width, height, channels] - # MNIST images are 28x28 
pixels, and have one color channel - input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) - - # Convolutional Layer #1 - # Computes 32 features using a 5x5 filter with ReLU activation. - # Padding is added to preserve width and height. - # Input Tensor Shape: [batch_size, 28, 28, 1] - # Output Tensor Shape: [batch_size, 28, 28, 32] - conv1 = tf.layers.conv2d( - inputs=input_layer, - filters=32, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #1 - # First max pooling layer with a 2x2 filter and stride of 2 - # Input Tensor Shape: [batch_size, 28, 28, 32] - # Output Tensor Shape: [batch_size, 14, 14, 32] - pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) - - # Convolutional Layer #2 - # Computes 64 features using a 5x5 filter. - # Padding is added to preserve width and height. - # Input Tensor Shape: [batch_size, 14, 14, 32] - # Output Tensor Shape: [batch_size, 14, 14, 64] - conv2 = tf.layers.conv2d( - inputs=pool1, - filters=64, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #2 - # Second max pooling layer with a 2x2 filter and stride of 2 - # Input Tensor Shape: [batch_size, 14, 14, 64] - # Output Tensor Shape: [batch_size, 7, 7, 64] - pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) - - # Flatten tensor into a batch of vectors - # Input Tensor Shape: [batch_size, 7, 7, 64] - # Output Tensor Shape: [batch_size, 7 * 7 * 64] - pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) - - # Dense Layer - # Densely connected layer with 1024 neurons - # Input Tensor Shape: [batch_size, 7 * 7 * 64] - # Output Tensor Shape: [batch_size, 1024] - dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu) - - # Add dropout operation; 0.6 probability that element will be kept - dropout = tf.layers.dropout( - inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN) - - # Logits layer - # Input Tensor Shape: [batch_size, 1024] - # Output Tensor Shape: [batch_size, 10] - logits = tf.layers.dense(inputs=dropout, units=10) - - predictions = { - # Generate predictions (for PREDICT and EVAL mode) - "classes": tf.argmax(input=logits, axis=1), - # Add `softmax_tensor` to the graph. It is used for PREDICT and by the - # `logging_hook`. - "probabilities": tf.nn.softmax(logits, name="softmax_tensor") - } - if mode == tf.estimator.ModeKeys.PREDICT: - return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) - - # Calculate Loss (for both TRAIN and EVAL modes) - loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) - - # Configure the Training Op (for TRAIN mode) - if mode == tf.estimator.ModeKeys.TRAIN: - optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001) - train_op = optimizer.minimize( - loss=loss, - global_step=tf.train.get_global_step()) - return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) - - # Add evaluation metrics (for EVAL mode) - eval_metric_ops = { - "accuracy": tf.metrics.accuracy( - labels=labels, predictions=predictions["classes"])} - return tf.estimator.EstimatorSpec( - mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) + """Model function for CNN.""" + # Input Layer + # Reshape X to 4-D tensor: [batch_size, width, height, channels] + # MNIST images are 28x28 pixels, and have one color channel + input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) + + # Convolutional Layer #1 + # Computes 32 features using a 5x5 filter with ReLU activation. 
+ # Padding is added to preserve width and height. + # Input Tensor Shape: [batch_size, 28, 28, 1] + # Output Tensor Shape: [batch_size, 28, 28, 32] + conv1 = tf.compat.v1.layers.conv2d( + inputs=input_layer, filters=32, kernel_size=[5, 5], padding="same", activation=tf.nn.relu + ) + + # Pooling Layer #1 + # First max pooling layer with a 2x2 filter and stride of 2 + # Input Tensor Shape: [batch_size, 28, 28, 32] + # Output Tensor Shape: [batch_size, 14, 14, 32] + pool1 = tf.compat.v1.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) + + # Convolutional Layer #2 + # Computes 64 features using a 5x5 filter. + # Padding is added to preserve width and height. + # Input Tensor Shape: [batch_size, 14, 14, 32] + # Output Tensor Shape: [batch_size, 14, 14, 64] + conv2 = tf.compat.v1.layers.conv2d( + inputs=pool1, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu + ) + + # Pooling Layer #2 + # Second max pooling layer with a 2x2 filter and stride of 2 + # Input Tensor Shape: [batch_size, 14, 14, 64] + # Output Tensor Shape: [batch_size, 7, 7, 64] + pool2 = tf.compat.v1.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) + + # Flatten tensor into a batch of vectors + # Input Tensor Shape: [batch_size, 7, 7, 64] + # Output Tensor Shape: [batch_size, 7 * 7 * 64] + pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) + + # Dense Layer + # Densely connected layer with 1024 neurons + # Input Tensor Shape: [batch_size, 7 * 7 * 64] + # Output Tensor Shape: [batch_size, 1024] + dense = tf.compat.v1.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu) + + # Add dropout operation; 0.6 probability that element will be kept + dropout = tf.compat.v1.layers.dropout( + inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN + ) + + # Logits layer + # Input Tensor Shape: [batch_size, 1024] + # Output Tensor Shape: [batch_size, 10] + logits = tf.compat.v1.layers.dense(inputs=dropout, units=10) + + predictions = { + # Generate predictions (for PREDICT and EVAL mode) + "classes": tf.argmax(input=logits, axis=1), + # Add `softmax_tensor` to the graph. It is used for PREDICT and by the + # `logging_hook`. 
+ "probabilities": tf.nn.softmax(logits, name="softmax_tensor"), + } + if mode == tf.estimator.ModeKeys.PREDICT: + return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) + + # Calculate Loss (for both TRAIN and EVAL modes) + loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) + + # Configure the Training Op (for TRAIN mode) + if mode == tf.estimator.ModeKeys.TRAIN: + optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001) + train_op = optimizer.minimize(loss=loss, global_step=tf.compat.v1.train.get_global_step()) + return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) + + # Add evaluation metrics (for EVAL mode) + eval_metric_ops = { + "accuracy": tf.compat.v1.metrics.accuracy(labels=labels, predictions=predictions["classes"]) + } + return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) + def _load_training_data(base_dir): - x_train = np.load(os.path.join(base_dir, 'train_data.npy')) - y_train = np.load(os.path.join(base_dir, 'train_labels.npy')) + x_train = np.load(os.path.join(base_dir, "train_data.npy")) + y_train = np.load(os.path.join(base_dir, "train_labels.npy")) return x_train, y_train + def _load_testing_data(base_dir): - x_test = np.load(os.path.join(base_dir, 'eval_data.npy')) - y_test = np.load(os.path.join(base_dir, 'eval_labels.npy')) + x_test = np.load(os.path.join(base_dir, "eval_data.npy")) + y_test = np.load(os.path.join(base_dir, "eval_labels.npy")) return x_test, y_test + def _parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING']) - parser.add_argument('--model_dir', type=str) - parser.add_argument('--max-steps', type=int, default=200) - parser.add_argument('--save-checkpoint-steps', type=int, default=200) - parser.add_argument('--throttle-secs', type=int, default=60) - parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS'])) - parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST']) - parser.add_argument('--batch-size', type=int, default=100) - parser.add_argument('--export-model-during-training', type=bool, default=False) + parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAINING"]) + parser.add_argument("--model_dir", type=str) + parser.add_argument("--max-steps", type=int, default=200) + parser.add_argument("--save-checkpoint-steps", type=int, default=200) + parser.add_argument("--throttle-secs", type=int, default=60) + parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) + parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"]) + parser.add_argument("--batch-size", type=int, default=100) + parser.add_argument("--export-model-during-training", type=bool, default=False) return parser.parse_known_args() + def serving_input_fn(): - inputs = {'x': tf.placeholder(tf.float32, [None, 784])} + inputs = {"x": tf.compat.v1.placeholder(tf.float32, [None, 784])} return tf.estimator.export.ServingInputReceiver(inputs, inputs) + if __name__ == "__main__": args, unknown = _parse_args() for arg in vars(args): print(arg, getattr(args, arg)) - tf.logging.set_verbosity(tf.logging.DEBUG) + logger = tf.get_logger() + logger.setLevel(logging.DEBUG) + # tf.logging.set_verbosity(tf.logging.DEBUG) train_data, train_labels = _load_training_data(args.train) eval_data, eval_labels = _load_testing_data(args.train) # Saving a checkpoint after every 
step run_config = tf.estimator.RunConfig(save_checkpoints_steps=args.save_checkpoint_steps) mnist_classifier = tf.estimator.Estimator( - model_fn=cnn_model_fn, model_dir=args.model_dir, config=run_config) + model_fn=cnn_model_fn, model_dir=args.model_dir, config=run_config + ) # Set up logging for predictions # Log the values in the "Softmax" tensor with label "probabilities" tensors_to_log = {"probabilities": "softmax_tensor"} - logging_hook = tf.train.LoggingTensorHook( - tensors=tensors_to_log, every_n_iter=50 - ) + logging_hook = tf.estimator.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=50) # Train the model - train_input_fn = tf.estimator.inputs.numpy_input_fn( + train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn( x={"x": train_data}, y=train_labels, batch_size=args.batch_size, num_epochs=None, - shuffle=True) + shuffle=True, + ) - exporter = tf.estimator.LatestExporter('Servo', serving_input_receiver_fn=serving_input_fn) \ - if args.export_model_during_training else None + exporter = ( + tf.compat.v1.estimator.LatestExporter("Servo", serving_input_receiver_fn=serving_input_fn) + if args.export_model_during_training + else None + ) # Evaluate the model and print results - eval_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"x": eval_data}, - y=eval_labels, - num_epochs=1, - shuffle=False) + eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn( + x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False + ) train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=args.max_steps) - eval_spec = tf.estimator.EvalSpec(eval_input_fn, throttle_secs=args.throttle_secs, exporters=exporter) + eval_spec = tf.estimator.EvalSpec( + eval_input_fn, throttle_secs=args.throttle_secs, exporters=exporter + ) tf.estimator.train_and_evaluate(mnist_classifier, train_spec, eval_spec) if args.current_host == args.hosts[0]: - mnist_classifier.export_savedmodel('/opt/ml/model', serving_input_fn) + mnist_classifier.export_saved_model("/opt/ml/model", serving_input_fn) diff --git a/test/resources/tuning_model_dir/entry.py b/test/resources/tuning_model_dir/entry.py index 0bce7165..09d44abc 100644 --- a/test/resources/tuning_model_dir/entry.py +++ b/test/resources/tuning_model_dir/entry.py @@ -16,11 +16,13 @@ import os parser = argparse.ArgumentParser() -parser.add_argument('--model_dir', type=str) -parser.add_argument('--arbitrary_value', type=int, default=0) +parser.add_argument("--model_dir", type=str) +parser.add_argument("--arbitrary_value", type=int, default=0) args = parser.parse_args() -assert os.environ['TRAINING_JOB_NAME'] in args.model_dir, 'model_dir not unique to training job: %s' % args.model_dir +assert os.environ["TRAINING_JOB_NAME"] in args.model_dir, ( + "model_dir not unique to training job: %s" % args.model_dir +) # For the "hyperparameter tuning" to work -print('accuracy=1') +print("accuracy=1") diff --git a/test/unit/test_deep_learning_container.py b/test/unit/test_deep_learning_container.py deleted file mode 100644 index 7d5d7d86..00000000 --- a/test/unit/test_deep_learning_container.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the 'License'). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the 'license' file accompanying this file. 
This file is -# distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import - -import unittest - -from docker.build_artifacts import deep_learning_container as deep_learning_container_to_test -import pytest -import requests - - -@pytest.fixture(name='fixture_valid_instance_id') -def fixture_valid_instance_id(requests_mock): - return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', - text='i-123t32e11s32t1231') - - -@pytest.fixture(name='fixture_invalid_instance_id') -def fixture_invalid_instance_id(requests_mock): - return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', text='i-123') - - -@pytest.fixture(name='fixture_none_instance_id') -def fixture_none_instance_id(requests_mock): - return requests_mock.get('http://169.254.169.254/latest/meta-data/instance-id', text=None) - - -@pytest.fixture(name='fixture_invalid_region') -def fixture_invalid_region(requests_mock): - return requests_mock.get('http://169.254.169.254/latest/dynamic/instance-identity/document', - json={'region': 'test'}) - - -@pytest.fixture(name='fixture_valid_region') -def fixture_valid_region(requests_mock): - return requests_mock.get('http://169.254.169.254/latest/dynamic/instance-identity/document', - json={'region': 'us-east-1'}) - - -def test_retrieve_instance_id(fixture_valid_instance_id): - result = deep_learning_container_to_test._retrieve_instance_id() - assert 'i-123t32e11s32t1231' == result - - -def test_retrieve_none_instance_id(fixture_none_instance_id): - result = deep_learning_container_to_test._retrieve_instance_id() - assert result is None - - -def test_retrieve_invalid_instance_id(fixture_invalid_instance_id): - result = deep_learning_container_to_test._retrieve_instance_id() - assert result is None - - -def test_retrieve_invalid_region(fixture_invalid_region): - result = deep_learning_container_to_test._retrieve_instance_region() - assert result is None - - -def test_retrieve_valid_region(fixture_valid_region): - result = deep_learning_container_to_test._retrieve_instance_region() - assert 'us-east-1' == result - - -def test_query_bucket(requests_mock, fixture_valid_region, fixture_valid_instance_id): - fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' - fixture_valid_region.return_value = 'us-east-1' - requests_mock.get(('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com' - '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231'), - text='Access Denied') - actual_response = deep_learning_container_to_test.query_bucket() - assert 'Access Denied' == actual_response.text - - -def test_query_bucket_region_none(fixture_invalid_region, fixture_valid_instance_id): - fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' - fixture_invalid_region.return_value = None - actual_response = deep_learning_container_to_test.query_bucket() - assert actual_response is None - - -def test_query_bucket_instance_id_none(requests_mock, fixture_valid_region, fixture_none_instance_id): - fixture_none_instance_id.return_value = None - fixture_valid_region.return_value = 'us-east-1' - actual_response = deep_learning_container_to_test.query_bucket() - assert actual_response is None - - -def test_query_bucket_instance_id_invalid(requests_mock, fixture_valid_region, fixture_invalid_instance_id): - fixture_invalid_instance_id.return_value = None - 
fixture_valid_region.return_value = 'us-east-1' - actual_response = deep_learning_container_to_test.query_bucket() - assert actual_response is None - - -def test_HTTP_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id): - fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' - fixture_valid_region.return_value = 'us-east-1' - query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com' - '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231') - - requests_mock.get( - query_s3_url, - exc=requests.exceptions.HTTPError) - requests_mock.side_effect = requests.exceptions.HTTPError - - with pytest.raises(requests.exceptions.HTTPError): - actual_response = requests.get(query_s3_url) - assert actual_response is None - - -def test_connection_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id): - fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' - fixture_valid_region.return_value = 'us-east-1' - query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com' - '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231') - - requests_mock.get( - query_s3_url, - exc=requests.exceptions.ConnectionError) - - with pytest.raises(requests.exceptions.ConnectionError): - actual_response = requests.get( - query_s3_url) - - assert actual_response is None - - -def test_timeout_error_on_S3(requests_mock, fixture_valid_region, fixture_valid_instance_id): - fixture_valid_instance_id.return_value = 'i-123t32e11s32t1231' - fixture_valid_region.return_value = 'us-east-1' - query_s3_url = ('https://aws-deep-learning-containers-us-east-1.s3.us-east-1.amazonaws.com' - '/dlc-containers.txt?x-instance-id=i-123t32e11s32t1231') - - requests_mock.get( - query_s3_url, - exc=requests.Timeout) - - with pytest.raises(requests.exceptions.Timeout): - actual_response = requests.get( - query_s3_url) - - assert actual_response is None - - -if __name__ == '__main__': - unittest.main() diff --git a/test/unit/test_s3_utils.py b/test/unit/test_s3_utils.py index 03de70a3..2bd63bf8 100644 --- a/test/unit/test_s3_utils.py +++ b/test/unit/test_s3_utils.py @@ -19,30 +19,30 @@ from sagemaker_tensorflow_container import s3_utils -BUCKET_REGION = 'us-west-2' -JOB_REGION = 'us-west-1' -JOB_BUKCET = 'sagemaker-us-west-2-000-00-1' -PREFIX = 'sagemaker/something' -MODEL_DIR = 's3://{}/{}'.format(JOB_BUKCET, PREFIX) +BUCKET_REGION = "us-west-2" +JOB_REGION = "us-west-1" +JOB_BUKCET = "sagemaker-us-west-2-000-00-1" +PREFIX = "sagemaker/something" +MODEL_DIR = "s3://{}/{}".format(JOB_BUKCET, PREFIX) -@patch('boto3.client') +@patch("boto3.client") def test_configure(client): s3 = MagicMock() client.return_value = s3 - loc = {'LocationConstraint': BUCKET_REGION} + loc = {"LocationConstraint": BUCKET_REGION} s3.get_bucket_location.return_value = loc s3_utils.configure(MODEL_DIR, JOB_REGION) - assert os.environ['S3_REGION'] == BUCKET_REGION - assert os.environ['TF_CPP_MIN_LOG_LEVEL'] == '1' - assert os.environ['S3_USE_HTTPS'] == '1' + assert os.environ["S3_REGION"] == BUCKET_REGION + assert os.environ["TF_CPP_MIN_LOG_LEVEL"] == "1" + assert os.environ["S3_USE_HTTPS"] == "1" def test_configure_local_dir(): - s3_utils.configure('/opt/ml/model', JOB_REGION) + s3_utils.configure("/opt/ml/model", JOB_REGION) - assert os.environ['S3_REGION'] == JOB_REGION - assert os.environ['TF_CPP_MIN_LOG_LEVEL'] == '1' - assert os.environ['S3_USE_HTTPS'] == '1' + assert os.environ["S3_REGION"] == JOB_REGION + assert os.environ["TF_CPP_MIN_LOG_LEVEL"] 
== "1" + assert os.environ["S3_USE_HTTPS"] == "1" diff --git a/test/unit/test_training.py b/test/unit/test_training.py index b69beed2..2795af44 100644 --- a/test/unit/test_training.py +++ b/test/unit/test_training.py @@ -17,32 +17,32 @@ from mock import MagicMock, patch import pytest -from sagemaker_containers.beta.framework import runner +from sagemaker_training import runner import tensorflow as tf from sagemaker_tensorflow_container import training -MODULE_DIR = 's3://my/bucket' -MODULE_NAME = 'script_name' -LOG_LEVEL = 'Debug' -HOST1 = 'host1' -HOST2 = 'host2' +MODULE_DIR = "s3://my/bucket" +MODULE_NAME = "script_name" +LOG_LEVEL = "Debug" +HOST1 = "host1" +HOST2 = "host2" HOST_LIST = [HOST1, HOST2] CURRENT_HOST = HOST1 -CMD_ARGS = {'some_key': 'some_value'} +CMD_ARGS = {"some_key": "some_value"} CLUSTER_WITH_PS = { - 'master': ['{}:2222'.format(HOST1)], - 'worker': ['{}:2222'.format(HOST2)], - 'ps': ['{}:2223'.format(HOST1), '{}:2223'.format(HOST2)] + "master": ["{}:2222".format(HOST1)], + "worker": ["{}:2222".format(HOST2)], + "ps": ["{}:2223".format(HOST1), "{}:2223".format(HOST2)], } -MASTER_TASK = {'index': 0, 'type': 'master'} -WORKER_TASK = {'index': 0, 'type': 'worker'} -PS_TASK_1 = {'index': 0, 'type': 'ps'} -PS_TASK_2 = {'index': 1, 'type': 'ps'} -MODEL_DIR = 's3://bucket/prefix' -MODEL_DIR_CMD_LIST = ['--model_dir', MODEL_DIR] -REGION = 'us-west-2' -RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', 'resources') +MASTER_TASK = {"index": 0, "type": "master"} +WORKER_TASK = {"index": 0, "type": "worker"} +PS_TASK_1 = {"index": 0, "type": "ps"} +PS_TASK_2 = {"index": 1, "type": "ps"} +MODEL_DIR = "s3://bucket/prefix" +MODEL_DIR_CMD_LIST = ["--model_dir", MODEL_DIR] +REGION = "us-west-2" +RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "resources") @pytest.fixture @@ -50,9 +50,7 @@ def distributed_training_env(): env = simple_training_env() env.hosts = HOST_LIST - env.additional_framework_parameters = { - training.SAGEMAKER_PARAMETER_SERVER_ENABLED: True - } + env.additional_framework_parameters = {training.SAGEMAKER_PARAMETER_SERVER_ENABLED: True} return env @@ -65,187 +63,238 @@ def simple_training_env(): env = MagicMock() env.module_dir = MODULE_DIR env.user_entry_point = MODULE_NAME - env.hyperparameters = {'model_dir': MODEL_DIR} + env.hyperparameters = {"model_dir": MODEL_DIR} env.log_level = LOG_LEVEL env.additional_framework_parameters = {} env.hosts = CURRENT_HOST env.current_host = CURRENT_HOST env.to_env_vars = lambda: {} - env.job_name = 'test-training-job' + env.job_name = "test-training-job" return env def test_is_host_master(): assert training._is_host_master(HOST_LIST, CURRENT_HOST) is True - assert training._is_host_master(HOST_LIST, 'host2') is False - assert training._is_host_master(HOST_LIST, 'somehost') is False + assert training._is_host_master(HOST_LIST, "host2") is False + assert training._is_host_master(HOST_LIST, "somehost") is False -@patch('sagemaker_containers.beta.framework.entry_point.run') +@patch("sagemaker_training.entry_point.run") def test_single_machine(run_module, single_machine_training_env): training.train(single_machine_training_env, MODEL_DIR_CMD_LIST) - run_module.assert_called_with(MODULE_DIR, MODULE_NAME, MODEL_DIR_CMD_LIST, - single_machine_training_env.to_env_vars(), - runner=runner.ProcessRunnerType) + run_module.assert_called_with( + uri=MODULE_DIR, + user_entry_point=MODULE_NAME, + args=MODEL_DIR_CMD_LIST, + env_vars=single_machine_training_env.to_env_vars(), + capture_error=True, + 
runner_type=runner.ProcessRunnerType, + ) -@patch('sagemaker_containers.beta.framework.entry_point.run') +@patch("sagemaker_training.entry_point.run") def test_train_horovod(run_module, single_machine_training_env): - single_machine_training_env.additional_framework_parameters['sagemaker_mpi_enabled'] = True + single_machine_training_env.additional_framework_parameters["sagemaker_mpi_enabled"] = True training.train(single_machine_training_env, MODEL_DIR_CMD_LIST) - run_module.assert_called_with(MODULE_DIR, MODULE_NAME, MODEL_DIR_CMD_LIST, - single_machine_training_env.to_env_vars(), - runner=runner.MPIRunnerType) - - -@pytest.mark.skipif(sys.version_info.major != 3, - reason="Skip this for python 2 because of dict key order mismatch") -@patch('tensorflow.train.ClusterSpec') -@patch('tensorflow.train.Server') -@patch('sagemaker_containers.beta.framework.entry_point.run') -@patch('multiprocessing.Process', lambda target: target()) -@patch('time.sleep', MagicMock()) + run_module.assert_called_with( + uri=MODULE_DIR, + user_entry_point=MODULE_NAME, + args=MODEL_DIR_CMD_LIST, + env_vars=single_machine_training_env.to_env_vars(), + capture_error=True, + runner_type=runner.MPIRunnerType, + ) + + +@pytest.mark.skip_on_pipeline +@pytest.mark.skipif( + sys.version_info.major != 3, reason="Skip this for python 2 because of dict key order mismatch" +) +@patch("tensorflow.train.ClusterSpec") +@patch("tensorflow.train.Server") +@patch("sagemaker_training.entry_point.run") +@patch("multiprocessing.Process", lambda target: target()) +@patch("time.sleep", MagicMock()) def test_train_distributed_master(run, tf_server, cluster_spec, distributed_training_env): training.train(distributed_training_env, MODEL_DIR_CMD_LIST) - cluster_spec.assert_called_with({'worker': ['host2:2222'], - 'master': ['host1:2222'], - 'ps': ['host1:2223', 'host2:2223']}) + cluster_spec.assert_called_with( + {"worker": ["host2:2222"], "master": ["host1:2222"], "ps": ["host1:2223", "host2:2223"]} + ) tf_server.assert_called_with( - cluster_spec(), job_name='ps', task_index=0, config=tf.ConfigProto(device_count={'GPU': 0}) + cluster_spec(), job_name="ps", task_index=0, config=tf.ConfigProto(device_count={"GPU": 0}) ) tf_server().join.assert_called_with() - tf_config = '{"cluster": {' \ - '"master": ["host1:2222"], ' \ - '"ps": ["host1:2223", "host2:2223"], ' \ - '"worker": ["host2:2222"]}, ' \ - '"environment": "cloud", ' \ - '"task": {"index": 0, "type": "master"}}' + tf_config = ( + '{"cluster": {' + '"master": ["host1:2222"], ' + '"ps": ["host1:2223", "host2:2223"], ' + '"worker": ["host2:2222"]}, ' + '"environment": "cloud", ' + '"task": {"index": 0, "type": "master"}}' + ) - run.assert_called_with('s3://my/bucket', 'script_name', MODEL_DIR_CMD_LIST, - {'TF_CONFIG': tf_config}) + run.assert_called_with( + uri="s3://my/bucket", + user_entry_point="script_name", + args=MODEL_DIR_CMD_LIST, + env_vars={"TF_CONFIG": tf_config}, + capture_error=True, + ) -@pytest.mark.skipif(sys.version_info.major != 3, - reason="Skip this for python 2 because of dict key order mismatch") -@patch('tensorflow.train.ClusterSpec') -@patch('tensorflow.train.Server') -@patch('sagemaker_containers.beta.framework.entry_point.run') -@patch('multiprocessing.Process', lambda target: target()) -@patch('time.sleep', MagicMock()) +@pytest.mark.skip_on_pipeline +@pytest.mark.skipif( + sys.version_info.major != 3, reason="Skip this for python 2 because of dict key order mismatch" +) +@patch("tensorflow.train.ClusterSpec") +@patch("tensorflow.train.Server") 
+@patch("sagemaker_training.entry_point.run") +@patch("multiprocessing.Process", lambda target: target()) +@patch("time.sleep", MagicMock()) def test_train_distributed_worker(run, tf_server, cluster_spec, distributed_training_env): distributed_training_env.current_host = HOST2 training.train(distributed_training_env, MODEL_DIR_CMD_LIST) - cluster_spec.assert_called_with({'worker': ['host2:2222'], - 'master': ['host1:2222'], - 'ps': ['host1:2223', 'host2:2223']}) + cluster_spec.assert_called_with( + {"worker": ["host2:2222"], "master": ["host1:2222"], "ps": ["host1:2223", "host2:2223"]} + ) tf_server.assert_called_with( - cluster_spec(), job_name='ps', task_index=1, config=tf.ConfigProto(device_count={'GPU': 0}) + cluster_spec(), job_name="ps", task_index=1, config=tf.ConfigProto(device_count={"GPU": 0}) ) tf_server().join.assert_called_with() - tf_config = '{"cluster": {' \ - '"master": ["host1:2222"], ' \ - '"ps": ["host1:2223", "host2:2223"], ' \ - '"worker": ["host2:2222"]}, ' \ - '"environment": "cloud", ' \ - '"task": {"index": 0, "type": "worker"}}' + tf_config = ( + '{"cluster": {' + '"master": ["host1:2222"], ' + '"ps": ["host1:2223", "host2:2223"], ' + '"worker": ["host2:2222"]}, ' + '"environment": "cloud", ' + '"task": {"index": 0, "type": "worker"}}' + ) - run.assert_called_with('s3://my/bucket', 'script_name', MODEL_DIR_CMD_LIST, - {'TF_CONFIG': tf_config}) + run.assert_called_with( + uri="s3://my/bucket", + user_entry_point="script_name", + args=MODEL_DIR_CMD_LIST, + env_vars={"TF_CONFIG": tf_config}, + capture_error=True, + ) -@patch('sagemaker_containers.beta.framework.entry_point.run') +@patch("sagemaker_training.entry_point.run") def test_train_distributed_no_ps(run, distributed_training_env): distributed_training_env.additional_framework_parameters[ - training.SAGEMAKER_PARAMETER_SERVER_ENABLED] = False + training.SAGEMAKER_PARAMETER_SERVER_ENABLED + ] = False distributed_training_env.current_host = HOST2 training.train(distributed_training_env, MODEL_DIR_CMD_LIST) - run.assert_called_with(MODULE_DIR, MODULE_NAME, MODEL_DIR_CMD_LIST, - distributed_training_env.to_env_vars(), runner=runner.ProcessRunnerType) + run.assert_called_with( + uri=MODULE_DIR, + user_entry_point=MODULE_NAME, + args=MODEL_DIR_CMD_LIST, + env_vars=distributed_training_env.to_env_vars(), + capture_error=True, + runner_type=runner.ProcessRunnerType, + ) def test_build_tf_config(): assert training._build_tf_config(HOST_LIST, HOST1) == { - 'cluster': CLUSTER_WITH_PS, - 'environment': 'cloud', - 'task': MASTER_TASK + "cluster": CLUSTER_WITH_PS, + "environment": "cloud", + "task": MASTER_TASK, } assert training._build_tf_config(HOST_LIST, HOST1, ps_task=True) == { - 'cluster': CLUSTER_WITH_PS, - 'environment': 'cloud', - 'task': PS_TASK_1 + "cluster": CLUSTER_WITH_PS, + "environment": "cloud", + "task": PS_TASK_1, } assert training._build_tf_config(HOST_LIST, HOST2) == { - 'cluster': CLUSTER_WITH_PS, - 'environment': 'cloud', - 'task': WORKER_TASK + "cluster": CLUSTER_WITH_PS, + "environment": "cloud", + "task": WORKER_TASK, } assert training._build_tf_config(HOST_LIST, HOST2, ps_task=True) == { - 'cluster': CLUSTER_WITH_PS, - 'environment': 'cloud', - 'task': PS_TASK_2} + "cluster": CLUSTER_WITH_PS, + "environment": "cloud", + "task": PS_TASK_2, + } def test_build_tf_config_error(): with pytest.raises(ValueError) as error: training._build_tf_config([HOST1], HOST1, ps_task=True) - assert 'Cannot have a ps task if there are no parameter servers in the cluster' in str(error.value) + assert "Cannot have a ps 
task if there are no parameter servers in the cluster" in str( + error.value + ) -@patch('sagemaker_tensorflow_container.training.logger') +@patch("sagemaker_tensorflow_container.training.logger") def test_log_model_missing_warning_no_model(logger): - path = os.path.join(RESOURCE_PATH, 'test_dir_empty') + path = os.path.join(RESOURCE_PATH, "test_dir_empty") if not os.path.exists(path): os.mkdir(path) training._log_model_missing_warning(path) - logger.warn.assert_called_with('No model artifact is saved under path {}.' - ' Your training job will not save any model files to S3.\n' - 'For details of how to construct your training script see:\n' - 'https://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script' # noqa - .format(path)) + logger.warn.assert_called_with( + "No model artifact is saved under path {}." + " Your training job will not save any model files to S3.\n" + "For details of how to construct your training script see:\n" + "https://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script".format( # noqa + path + ) + ) -@patch('sagemaker_tensorflow_container.training.logger') +@patch("sagemaker_tensorflow_container.training.logger") def test_log_model_missing_warning_wrong_format(logger): - training._log_model_missing_warning(os.path.join(RESOURCE_PATH, 'test_dir_wrong_model')) - logger.warn.assert_called_with('Your model will NOT be servable with SageMaker TensorFlow Serving container. ' - 'The model artifact was not saved in the TensorFlow ' - 'SavedModel directory structure:\n' - 'https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory') + training._log_model_missing_warning(os.path.join(RESOURCE_PATH, "test_dir_wrong_model")) + logger.warn.assert_called_with( + "Your model will NOT be servable with SageMaker TensorFlow Serving container. " + "The model artifact was not saved in the TensorFlow " + "SavedModel directory structure:\n" + "https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory" + ) -@patch('sagemaker_tensorflow_container.training.logger') +@patch("sagemaker_tensorflow_container.training.logger") def test_log_model_missing_warning_wrong_parent_dir(logger): - training._log_model_missing_warning(os.path.join(RESOURCE_PATH, 'test_dir_wrong_parent_dir')) - logger.warn.assert_called_with('Your model will NOT be servable with SageMaker TensorFlow Serving containers. ' - 'The SavedModel bundle is under directory \"{}\", not a numeric name.' - .format('not-digit')) + training._log_model_missing_warning(os.path.join(RESOURCE_PATH, "test_dir_wrong_parent_dir")) + logger.warn.assert_called_with( + "Your model will NOT be servable with SageMaker TensorFlow Serving containers. 
" + 'The SavedModel bundle is under directory "{}", not a numeric name.'.format("not-digit") + ) -@patch('sagemaker_tensorflow_container.training.logger') +@patch("sagemaker_tensorflow_container.training.logger") def test_log_model_missing_warning_correct(logger): - training._log_model_missing_warning(os.path.join(RESOURCE_PATH, 'test_dir_correct_model')) + training._log_model_missing_warning(os.path.join(RESOURCE_PATH, "test_dir_correct_model")) logger.warn.assert_not_called() -@patch('sagemaker_tensorflow_container.training.logger') -@patch('sagemaker_tensorflow_container.training.train') -@patch('logging.Logger.setLevel') -@patch('sagemaker_containers.beta.framework.training_env') -@patch('sagemaker_containers.beta.framework.env.read_hyperparameters', return_value={}) -@patch('sagemaker_tensorflow_container.s3_utils.configure') -def test_main(configure_s3_env, read_hyperparameters, training_env, - set_level, train, logger, single_machine_training_env): +@patch("sagemaker_tensorflow_container.training.logger") +@patch("sagemaker_tensorflow_container.training.train") +@patch("logging.Logger.setLevel") +@patch("sagemaker_training.environment.Environment") +@patch("sagemaker_training.environment.read_hyperparameters", return_value={}) +@patch("sagemaker_tensorflow_container.s3_utils.configure") +def test_main( + configure_s3_env, + read_hyperparameters, + training_env, + set_level, + train, + logger, + single_machine_training_env, +): training_env.return_value = single_machine_training_env - os.environ['SAGEMAKER_REGION'] = REGION + os.environ["SAGEMAKER_REGION"] = REGION training.main() read_hyperparameters.assert_called_once_with() training_env.assert_called_once_with(hyperparameters={}) @@ -253,46 +302,71 @@ def test_main(configure_s3_env, read_hyperparameters, training_env, configure_s3_env.assert_called_once() -@patch('sagemaker_tensorflow_container.training.logger') -@patch('sagemaker_tensorflow_container.training.train') -@patch('logging.Logger.setLevel') -@patch('sagemaker_containers.beta.framework.training_env') -@patch('sagemaker_containers.beta.framework.env.read_hyperparameters', return_value={'model_dir': MODEL_DIR}) -@patch('sagemaker_tensorflow_container.s3_utils.configure') -def test_main_simple_training_model_dir(configure_s3_env, read_hyperparameters, training_env, - set_level, train, logger, single_machine_training_env): +@patch("sagemaker_tensorflow_container.training.logger") +@patch("sagemaker_tensorflow_container.training.train") +@patch("logging.Logger.setLevel") +@patch("sagemaker_training.environment.Environment") +@patch("sagemaker_training.environment.read_hyperparameters", return_value={"model_dir": MODEL_DIR}) +@patch("sagemaker_tensorflow_container.s3_utils.configure") +def test_main_simple_training_model_dir( + configure_s3_env, + read_hyperparameters, + training_env, + set_level, + train, + logger, + single_machine_training_env, +): training_env.return_value = single_machine_training_env - os.environ['SAGEMAKER_REGION'] = REGION + os.environ["SAGEMAKER_REGION"] = REGION training.main() configure_s3_env.assert_called_once_with(MODEL_DIR, REGION) -@patch('sagemaker_tensorflow_container.training.logger') -@patch('sagemaker_tensorflow_container.training.train') -@patch('logging.Logger.setLevel') -@patch('sagemaker_containers.beta.framework.training_env') -@patch('sagemaker_containers.beta.framework.env.read_hyperparameters', return_value={'model_dir': MODEL_DIR, - '_tuning_objective_metric': 'auc'}) -@patch('sagemaker_tensorflow_container.s3_utils.configure') 
-def test_main_tuning_model_dir(configure_s3_env, read_hyperparameters, training_env, - set_level, train, logger, single_machine_training_env): +@patch("sagemaker_tensorflow_container.training.logger") +@patch("sagemaker_tensorflow_container.training.train") +@patch("logging.Logger.setLevel") +@patch("sagemaker_training.environment.Environment") +@patch( + "sagemaker_training.environment.read_hyperparameters", + return_value={"model_dir": MODEL_DIR, "_tuning_objective_metric": "auc"}, +) +@patch("sagemaker_tensorflow_container.s3_utils.configure") +def test_main_tuning_model_dir( + configure_s3_env, + read_hyperparameters, + training_env, + set_level, + train, + logger, + single_machine_training_env, +): training_env.return_value = single_machine_training_env - os.environ['SAGEMAKER_REGION'] = REGION + os.environ["SAGEMAKER_REGION"] = REGION training.main() - expected_model_dir = '{}/{}/model'.format(MODEL_DIR, single_machine_training_env.job_name) + expected_model_dir = "{}/{}/model".format(MODEL_DIR, single_machine_training_env.job_name) configure_s3_env.assert_called_once_with(expected_model_dir, REGION) -@patch('sagemaker_tensorflow_container.training.logger') -@patch('sagemaker_tensorflow_container.training.train') -@patch('logging.Logger.setLevel') -@patch('sagemaker_containers.beta.framework.training_env') -@patch('sagemaker_containers.beta.framework.env.read_hyperparameters', return_value={'model_dir': '/opt/ml/model', - '_tuning_objective_metric': 'auc'}) -@patch('sagemaker_tensorflow_container.s3_utils.configure') -def test_main_tuning_mpi_model_dir(configure_s3_env, read_hyperparameters, training_env, - set_level, train, logger, single_machine_training_env): +@patch("sagemaker_tensorflow_container.training.logger") +@patch("sagemaker_tensorflow_container.training.train") +@patch("logging.Logger.setLevel") +@patch("sagemaker_training.environment.Environment") +@patch( + "sagemaker_training.environment.read_hyperparameters", + return_value={"model_dir": "/opt/ml/model", "_tuning_objective_metric": "auc"}, +) +@patch("sagemaker_tensorflow_container.s3_utils.configure") +def test_main_tuning_mpi_model_dir( + configure_s3_env, + read_hyperparameters, + training_env, + set_level, + train, + logger, + single_machine_training_env, +): training_env.return_value = single_machine_training_env - os.environ['SAGEMAKER_REGION'] = REGION + os.environ["SAGEMAKER_REGION"] = REGION training.main() - configure_s3_env.assert_called_once_with('/opt/ml/model', REGION) + configure_s3_env.assert_called_once_with("/opt/ml/model", REGION) diff --git a/test/resources/test_py_version/entry.py b/test/utils/__init__.py similarity index 67% rename from test/resources/test_py_version/entry.py rename to test/utils/__init__.py index 8f71a01b..79cb9cdf 100644 --- a/test/resources/test_py_version/entry.py +++ b/test/utils/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of @@ -11,12 +11,3 @@ # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. 
from __future__ import absolute_import - -import os -import sys - - -py_version = '%s.%s' % (sys.version_info.major, sys.version_info.minor) - -with open(os.path.join(os.environ['SM_OUTPUT_DIR'], 'py_version'), 'a') as f: - f.write(py_version) diff --git a/test/utils/image_utils.py b/test/utils/image_utils.py new file mode 100644 index 00000000..9fe5b590 --- /dev/null +++ b/test/utils/image_utils.py @@ -0,0 +1,72 @@ +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os +import subprocess +import sys + +CYAN_COLOR = "\033[36m" +END_COLOR = "\033[0m" +DLC_AWS_ID = "763104351884" + + +def build_image(framework_version, dockerfile, image_uri, region, cwd="."): + _check_call("python setup.py sdist") + + if "dlc" in dockerfile: + ecr_login(region, DLC_AWS_ID) + + dockerfile_location = os.path.join("test", "container", framework_version, dockerfile) + + subprocess.check_call( + [ + "docker", + "build", + "-t", + image_uri, + "-f", + dockerfile_location, + "--build-arg", + "region={}".format(region), + cwd, + ], + cwd=cwd, + ) + print("created image {}".format(image_uri)) + return image_uri + + +def push_image(ecr_image, region, aws_id): + ecr_login(region, aws_id) + _check_call("docker push {}".format(ecr_image)) + + +def ecr_login(region, aws_id): + login = _check_call( + "aws ecr get-login --registry-ids {} ".format(aws_id) + + "--no-include-email --region {}".format(region) + ) + _check_call(login.decode("utf-8").rstrip("\n")) + + +def _check_call(cmd, *popenargs, **kwargs): + if isinstance(cmd, str): + cmd = cmd.split(" ") + _print_cmd(cmd) + return subprocess.check_output(cmd, *popenargs, **kwargs) + + +def _print_cmd(cmd): + print("executing docker command: {}{}{}".format(CYAN_COLOR, " ".join(cmd), END_COLOR)) + sys.stdout.flush() diff --git a/tox.ini b/tox.ini index b4f6fbb0..17ed3095 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py27,py36,flake8 +envlist = py27,py36,py37,flake8 skip_missing_interpreters = False [travis] @@ -27,6 +27,8 @@ exclude = benchmarks/ max-complexity = 10 ignore = + C901, + E203, # whitespace before ':': Black disagrees with and explicitly violates this. FI10, FI12, FI13, @@ -43,7 +45,7 @@ ignore = FI55, FI56, FI57, - E722 + W503 require-code = True @@ -61,7 +63,6 @@ passenv = commands = coverage run --rcfile .coveragerc_{envname} --source sagemaker_tensorflow_container -m py.test {posargs} {env:IGNORE_COVERAGE:} coverage report --include *sagemaker_tensorflow_container* --show-missing -deps = sagemaker-containers extras = test [testenv:flake8]