Skip to content

Commit cff8b69

Browse files
authored
fix: fix canary test failure issues (#1016)
1 parent ee3d912 commit cff8b69

File tree

6 files changed

+42
-76
lines changed

6 files changed

+42
-76
lines changed

tests/integ/file_system_input_utils.py

Lines changed: 28 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -285,31 +285,35 @@ def _instance_profile_exists(sagemaker_session):
285285
return True
286286

287287

288-
def tear_down(sagemaker_session, fs_resources):
289-
if "file_system_fsx_id" in fs_resources:
290-
fsx_client = sagemaker_session.boto_session.client("fsx")
291-
fsx_client.delete_file_system(FileSystemId=fs_resources["file_system_fsx_id"])
292-
293-
efs_client = sagemaker_session.boto_session.client("efs")
294-
if "mount_efs_target_id" in fs_resources:
295-
efs_client.delete_mount_target(MountTargetId=fs_resources["mount_efs_target_id"])
296-
297-
if "file_system_efs_id" in fs_resources:
298-
for _ in retries(30, "Checking mount target deleting status"):
299-
desc = efs_client.describe_mount_targets(
300-
FileSystemId=fs_resources["file_system_efs_id"]
301-
)
302-
if len(desc["MountTargets"]) > 0:
303-
status = desc["MountTargets"][0]["LifeCycleState"]
304-
if status == "deleted":
288+
def tear_down(sagemaker_session, fs_resources={}):
289+
try:
290+
if "file_system_fsx_id" in fs_resources:
291+
fsx_client = sagemaker_session.boto_session.client("fsx")
292+
fsx_client.delete_file_system(FileSystemId=fs_resources["file_system_fsx_id"])
293+
294+
efs_client = sagemaker_session.boto_session.client("efs")
295+
if "mount_efs_target_id" in fs_resources:
296+
efs_client.delete_mount_target(MountTargetId=fs_resources["mount_efs_target_id"])
297+
298+
if "file_system_efs_id" in fs_resources:
299+
for _ in retries(30, "Checking mount target deleting status"):
300+
desc = efs_client.describe_mount_targets(
301+
FileSystemId=fs_resources["file_system_efs_id"]
302+
)
303+
if len(desc["MountTargets"]) > 0:
304+
status = desc["MountTargets"][0]["LifeCycleState"]
305+
if status == "deleted":
306+
break
307+
else:
305308
break
306-
else:
307-
break
308309

309-
efs_client.delete_file_system(FileSystemId=fs_resources["file_system_efs_id"])
310+
efs_client.delete_file_system(FileSystemId=fs_resources["file_system_efs_id"])
310311

311-
if "ec2_instance_id" in fs_resources:
312-
ec2_resource = sagemaker_session.boto_session.resource("ec2")
313-
_terminate_instance(ec2_resource, [fs_resources["ec2_instance_id"]])
312+
if "ec2_instance_id" in fs_resources:
313+
ec2_resource = sagemaker_session.boto_session.resource("ec2")
314+
_terminate_instance(ec2_resource, [fs_resources["ec2_instance_id"]])
314315

315-
_delete_key_pair(sagemaker_session)
316+
_delete_key_pair(sagemaker_session)
317+
318+
except Exception:
319+
pass

tests/integ/test_kmeans_efs_fsx.py

Lines changed: 0 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,6 @@
1414

1515
import pytest
1616

17-
import tests.integ
1817
from sagemaker import KMeans
1918
from sagemaker.amazon.amazon_estimator import FileSystemRecordSet
2019
from sagemaker.parameter import IntegerParameter, CategoricalParameter
@@ -47,10 +46,6 @@ def efs_fsx_setup(sagemaker_session, ec2_instance_type):
4746
tear_down(sagemaker_session, fs_resources)
4847

4948

50-
@pytest.mark.skipif(
51-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
52-
reason="EFS integration tests need to be fixed before running in all regions.",
53-
)
5449
def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
5550
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
5651
role = efs_fsx_setup["role_name"]
@@ -82,10 +77,6 @@ def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
8277
assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])
8378

8479

85-
@pytest.mark.skipif(
86-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
87-
reason="EFS integration tests need to be fixed before running in all regions.",
88-
)
8980
def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
9081
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
9182
role = efs_fsx_setup["role_name"]
@@ -116,10 +107,6 @@ def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
116107
assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])
117108

118109

119-
@pytest.mark.skipif(
120-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
121-
reason="EFS integration tests need to be fixed before running in all regions.",
122-
)
123110
def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
124111
role = efs_fsx_setup["role_name"]
125112
subnets = [efs_fsx_setup["subnet_id"]]
@@ -176,10 +163,6 @@ def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
176163
assert best_training_job
177164

178165

179-
@pytest.mark.skipif(
180-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
181-
reason="EFS integration tests need to be fixed before running in all regions.",
182-
)
183166
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
184167
role = efs_fsx_setup["role_name"]
185168
subnets = [efs_fsx_setup["subnet_id"]]

tests/integ/test_tf_efs_fsx.py

Lines changed: 0 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,6 @@
1717

1818
import pytest
1919

20-
import tests.integ
2120
from sagemaker.inputs import FileSystemInput
2221
from sagemaker.parameter import IntegerParameter
2322
from sagemaker.tensorflow import TensorFlow
@@ -50,10 +49,6 @@ def efs_fsx_setup(sagemaker_session, ec2_instance_type):
5049
tear_down(sagemaker_session, fs_resources)
5150

5251

53-
@pytest.mark.skipif(
54-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
55-
reason="EFS integration tests need to be fixed before running in all regions.",
56-
)
5752
def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
5853
role = efs_fsx_setup["role_name"]
5954
subnets = [efs_fsx_setup["subnet_id"]]
@@ -86,10 +81,6 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
8681
)
8782

8883

89-
@pytest.mark.skipif(
90-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
91-
reason="EFS integration tests need to be fixed before running in all regions.",
92-
)
9384
def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
9485
role = efs_fsx_setup["role_name"]
9586
subnets = [efs_fsx_setup["subnet_id"]]
@@ -122,10 +113,6 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
122113
)
123114

124115

125-
@pytest.mark.skipif(
126-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
127-
reason="EFS integration tests need to be fixed before running in all regions.",
128-
)
129116
def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
130117
role = efs_fsx_setup["role_name"]
131118
subnets = [efs_fsx_setup["subnet_id"]]
@@ -170,10 +157,6 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instanc
170157
assert best_training_job
171158

172159

173-
@pytest.mark.skipif(
174-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
175-
reason="EFS integration tests need to be fixed before running in all regions.",
176-
)
177160
def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
178161
role = efs_fsx_setup["role_name"]
179162
subnets = [efs_fsx_setup["subnet_id"]]

tests/integ/test_transformer.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -148,9 +148,7 @@ def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version, cpu_instance
148148
script_path = os.path.join(data_path, "mnist.py")
149149

150150
ec2_client = sagemaker_session.boto_session.client("ec2")
151-
subnet_ids, security_group_id = get_or_create_vpc_resources(
152-
ec2_client, sagemaker_session.boto_session.region_name
153-
)
151+
subnet_ids, security_group_id = get_or_create_vpc_resources(ec2_client)
154152

155153
mx = MXNet(
156154
entry_point=script_path,

tests/integ/test_tuner.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -695,9 +695,7 @@ def test_tuning_tf_vpc_multi(sagemaker_session, cpu_instance_type):
695695
script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")
696696

697697
ec2_client = sagemaker_session.boto_session.client("ec2")
698-
subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(
699-
ec2_client, sagemaker_session.boto_region_name
700-
)
698+
subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(ec2_client)
701699
vpc_test_utils.setup_security_group_for_encryption(ec2_client, security_group_id)
702700

703701
estimator = TensorFlow(

tests/integ/vpc_test_utils.py

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919

2020
VPC_NAME = "sagemaker-python-sdk-test-vpc"
2121
LOCK_PATH = os.path.join(tempfile.gettempdir(), "sagemaker_test_vpc_lock")
22+
LOCK_PATH_EFS = os.path.join(tempfile.gettempdir(), "sagemaker_efs_fsx_vpc_lock")
2223

2324

2425
def _get_subnet_ids_by_name(ec2_client, name):
@@ -64,7 +65,7 @@ def _route_table_id(ec2_client, vpc_id):
6465

6566
def check_or_create_vpc_resources_efs_fsx(sagemaker_session, name=VPC_NAME):
6667
# use lock to prevent race condition when tests are running concurrently
67-
with lock.lock(LOCK_PATH):
68+
with lock.lock(LOCK_PATH_EFS):
6869
ec2_client = sagemaker_session.boto_session.client("ec2")
6970

7071
if _vpc_exists(ec2_client, name):
@@ -121,7 +122,7 @@ def _create_vpc_with_name_efs_fsx(ec2_client, name):
121122

122123
def _create_vpc_resources(ec2_client, name):
123124
vpc_id = ec2_client.create_vpc(CidrBlock="10.0.0.0/16")["Vpc"]["VpcId"]
124-
print("created vpc: {}".format(vpc_id))
125+
ec2_client.create_tags(Resources=[vpc_id], Tags=[{"Key": "Name", "Value": name}])
125126

126127
availability_zone_name = ec2_client.describe_availability_zones()["AvailabilityZones"][0][
127128
"ZoneName"
@@ -163,30 +164,29 @@ def _create_vpc_resources(ec2_client, name):
163164
)
164165

165166
ec2_client.create_tags(
166-
Resources=[vpc_id, subnet_id_a, subnet_id_b, security_group_id],
167+
Resources=[subnet_id_a, subnet_id_b, security_group_id],
167168
Tags=[{"Key": "Name", "Value": name}],
168169
)
169-
170170
return vpc_id, [subnet_id_a, subnet_id_b], security_group_id
171171

172172

173-
def _create_vpc_with_name(ec2_client, region, name):
173+
def _create_vpc_with_name(ec2_client, name):
174174
vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources(ec2_client, name)
175175
return [subnet_id_a, subnet_id_b], security_group_id
176176

177177

178-
def get_or_create_vpc_resources(ec2_client, region, name=VPC_NAME):
178+
def get_or_create_vpc_resources(ec2_client):
179179
# use lock to prevent race condition when tests are running concurrently
180180
with lock.lock(LOCK_PATH):
181-
if _vpc_exists(ec2_client, name):
182-
print("using existing vpc: {}".format(name))
181+
if _vpc_exists(ec2_client, VPC_NAME):
182+
print("using existing vpc: {}".format(VPC_NAME))
183183
return (
184-
_get_subnet_ids_by_name(ec2_client, name),
185-
_get_security_id_by_name(ec2_client, name),
184+
_get_subnet_ids_by_name(ec2_client, VPC_NAME),
185+
_get_security_id_by_name(ec2_client, VPC_NAME),
186186
)
187187
else:
188-
print("creating new vpc: {}".format(name))
189-
return _create_vpc_with_name(ec2_client, region, name)
188+
print("creating new vpc: {}".format(VPC_NAME))
189+
return _create_vpc_with_name(ec2_client, VPC_NAME)
190190

191191

192192
def setup_security_group_for_encryption(ec2_client, security_group_id):

0 commit comments

Comments
 (0)