fix: fix canary test failure issues (#1016)

caxiaohu · web-flow · commit cff8b69b1fb3 · 2019-09-03T14:47:18.000-07:00
diff --git a/tests/integ/file_system_input_utils.py b/tests/integ/file_system_input_utils.py
@@ -285,31 +285,35 @@ def _instance_profile_exists(sagemaker_session):
     return True
 
 
-def tear_down(sagemaker_session, fs_resources):
-    if "file_system_fsx_id" in fs_resources:
-        fsx_client = sagemaker_session.boto_session.client("fsx")
-        fsx_client.delete_file_system(FileSystemId=fs_resources["file_system_fsx_id"])
-
-    efs_client = sagemaker_session.boto_session.client("efs")
-    if "mount_efs_target_id" in fs_resources:
-        efs_client.delete_mount_target(MountTargetId=fs_resources["mount_efs_target_id"])
-
-    if "file_system_efs_id" in fs_resources:
-        for _ in retries(30, "Checking mount target deleting status"):
-            desc = efs_client.describe_mount_targets(
-                FileSystemId=fs_resources["file_system_efs_id"]
-            )
-            if len(desc["MountTargets"]) > 0:
-                status = desc["MountTargets"][0]["LifeCycleState"]
-                if status == "deleted":
+def tear_down(sagemaker_session, fs_resources={}):
+    try:
+        if "file_system_fsx_id" in fs_resources:
+            fsx_client = sagemaker_session.boto_session.client("fsx")
+            fsx_client.delete_file_system(FileSystemId=fs_resources["file_system_fsx_id"])
+
+        efs_client = sagemaker_session.boto_session.client("efs")
+        if "mount_efs_target_id" in fs_resources:
+            efs_client.delete_mount_target(MountTargetId=fs_resources["mount_efs_target_id"])
+
+        if "file_system_efs_id" in fs_resources:
+            for _ in retries(30, "Checking mount target deleting status"):
+                desc = efs_client.describe_mount_targets(
+                    FileSystemId=fs_resources["file_system_efs_id"]
+                )
+                if len(desc["MountTargets"]) > 0:
+                    status = desc["MountTargets"][0]["LifeCycleState"]
+                    if status == "deleted":
+                        break
+                else:
                     break
-            else:
-                break
 
-        efs_client.delete_file_system(FileSystemId=fs_resources["file_system_efs_id"])
+            efs_client.delete_file_system(FileSystemId=fs_resources["file_system_efs_id"])
 
-    if "ec2_instance_id" in fs_resources:
-        ec2_resource = sagemaker_session.boto_session.resource("ec2")
-        _terminate_instance(ec2_resource, [fs_resources["ec2_instance_id"]])
+        if "ec2_instance_id" in fs_resources:
+            ec2_resource = sagemaker_session.boto_session.resource("ec2")
+            _terminate_instance(ec2_resource, [fs_resources["ec2_instance_id"]])
 
-    _delete_key_pair(sagemaker_session)
+        _delete_key_pair(sagemaker_session)
+
+    except Exception:
+        pass
diff --git a/tests/integ/test_kmeans_efs_fsx.py b/tests/integ/test_kmeans_efs_fsx.py
@@ -14,7 +14,6 @@
 
 import pytest
 
-import tests.integ
 from sagemaker import KMeans
 from sagemaker.amazon.amazon_estimator import FileSystemRecordSet
 from sagemaker.parameter import IntegerParameter, CategoricalParameter
@@ -47,10 +46,6 @@ def efs_fsx_setup(sagemaker_session, ec2_instance_type):
             tear_down(sagemaker_session, fs_resources)
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
 def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         role = efs_fsx_setup["role_name"]
@@ -82,10 +77,6 @@ def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
         assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
 def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         role = efs_fsx_setup["role_name"]
@@ -116,10 +107,6 @@ def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
         assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
 def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     role = efs_fsx_setup["role_name"]
     subnets = [efs_fsx_setup["subnet_id"]]
@@ -176,10 +163,6 @@ def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
         assert best_training_job
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
 def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     role = efs_fsx_setup["role_name"]
     subnets = [efs_fsx_setup["subnet_id"]]
diff --git a/tests/integ/test_tf_efs_fsx.py b/tests/integ/test_tf_efs_fsx.py
@@ -17,7 +17,6 @@
 
 import pytest
 
-import tests.integ
 from sagemaker.inputs import FileSystemInput
 from sagemaker.parameter import IntegerParameter
 from sagemaker.tensorflow import TensorFlow
@@ -50,10 +49,6 @@ def efs_fsx_setup(sagemaker_session, ec2_instance_type):
             tear_down(sagemaker_session, fs_resources)
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
 def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     role = efs_fsx_setup["role_name"]
     subnets = [efs_fsx_setup["subnet_id"]]
@@ -86,10 +81,6 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     )
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
 def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     role = efs_fsx_setup["role_name"]
     subnets = [efs_fsx_setup["subnet_id"]]
@@ -122,10 +113,6 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     )
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
 def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     role = efs_fsx_setup["role_name"]
     subnets = [efs_fsx_setup["subnet_id"]]
@@ -170,10 +157,6 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instanc
     assert best_training_job
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
 def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     role = efs_fsx_setup["role_name"]
     subnets = [efs_fsx_setup["subnet_id"]]
diff --git a/tests/integ/test_transformer.py b/tests/integ/test_transformer.py
@@ -148,9 +148,7 @@ def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version, cpu_instance
     script_path = os.path.join(data_path, "mnist.py")
 
     ec2_client = sagemaker_session.boto_session.client("ec2")
-    subnet_ids, security_group_id = get_or_create_vpc_resources(
-        ec2_client, sagemaker_session.boto_session.region_name
-    )
+    subnet_ids, security_group_id = get_or_create_vpc_resources(ec2_client)
 
     mx = MXNet(
         entry_point=script_path,
diff --git a/tests/integ/test_tuner.py b/tests/integ/test_tuner.py
@@ -695,9 +695,7 @@ def test_tuning_tf_vpc_multi(sagemaker_session, cpu_instance_type):
     script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")
 
     ec2_client = sagemaker_session.boto_session.client("ec2")
-    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
-        ec2_client, sagemaker_session.boto_region_name
-    )
+    subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(ec2_client)
     vpc_test_utils.setup_security_group_for_encryption(ec2_client, security_group_id)
 
     estimator = TensorFlow(
diff --git a/tests/integ/vpc_test_utils.py b/tests/integ/vpc_test_utils.py
@@ -19,6 +19,7 @@
 
 VPC_NAME = "sagemaker-python-sdk-test-vpc"
 LOCK_PATH = os.path.join(tempfile.gettempdir(), "sagemaker_test_vpc_lock")
+LOCK_PATH_EFS = os.path.join(tempfile.gettempdir(), "sagemaker_efs_fsx_vpc_lock")
 
 
 def _get_subnet_ids_by_name(ec2_client, name):
@@ -64,7 +65,7 @@ def _route_table_id(ec2_client, vpc_id):
 
 def check_or_create_vpc_resources_efs_fsx(sagemaker_session, name=VPC_NAME):
     # use lock to prevent race condition when tests are running concurrently
-    with lock.lock(LOCK_PATH):
+    with lock.lock(LOCK_PATH_EFS):
         ec2_client = sagemaker_session.boto_session.client("ec2")
 
         if _vpc_exists(ec2_client, name):
@@ -121,7 +122,7 @@ def _create_vpc_with_name_efs_fsx(ec2_client, name):
 
 def _create_vpc_resources(ec2_client, name):
     vpc_id = ec2_client.create_vpc(CidrBlock="10.0.0.0/16")["Vpc"]["VpcId"]
-    print("created vpc: {}".format(vpc_id))
+    ec2_client.create_tags(Resources=[vpc_id], Tags=[{"Key": "Name", "Value": name}])
 
     availability_zone_name = ec2_client.describe_availability_zones()["AvailabilityZones"][0][
         "ZoneName"
@@ -163,30 +164,29 @@ def _create_vpc_resources(ec2_client, name):
     )
 
     ec2_client.create_tags(
-        Resources=[vpc_id, subnet_id_a, subnet_id_b, security_group_id],
+        Resources=[subnet_id_a, subnet_id_b, security_group_id],
         Tags=[{"Key": "Name", "Value": name}],
     )
-
     return vpc_id, [subnet_id_a, subnet_id_b], security_group_id
 
 
-def _create_vpc_with_name(ec2_client, region, name):
+def _create_vpc_with_name(ec2_client, name):
     vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources(ec2_client, name)
     return [subnet_id_a, subnet_id_b], security_group_id
 
 
-def get_or_create_vpc_resources(ec2_client, region, name=VPC_NAME):
+def get_or_create_vpc_resources(ec2_client):
     # use lock to prevent race condition when tests are running concurrently
     with lock.lock(LOCK_PATH):
-        if _vpc_exists(ec2_client, name):
-            print("using existing vpc: {}".format(name))
+        if _vpc_exists(ec2_client, VPC_NAME):
+            print("using existing vpc: {}".format(VPC_NAME))
             return (
-                _get_subnet_ids_by_name(ec2_client, name),
-                _get_security_id_by_name(ec2_client, name),
+                _get_subnet_ids_by_name(ec2_client, VPC_NAME),
+                _get_security_id_by_name(ec2_client, VPC_NAME),
             )
         else:
-            print("creating new vpc: {}".format(name))
-            return _create_vpc_with_name(ec2_client, region, name)
+            print("creating new vpc: {}".format(VPC_NAME))
+            return _create_vpc_with_name(ec2_client, VPC_NAME)
 
 
 def setup_security_group_for_encryption(ec2_client, security_group_id):