diff --git a/tests/conftest.py b/tests/conftest.py
index 5fa68c7ec3..8e2aef4fbe 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -19,6 +19,7 @@
 import pytest
 import tests.integ
 from botocore.config import Config
+from packaging.version import Version
 
 from sagemaker import Session, utils
 from sagemaker.local import LocalSession
@@ -57,7 +58,6 @@ def pytest_addoption(parser):
         "--rl-ray-full-version", action="store", default=RLEstimator.RAY_LATEST_VERSION
     )
     parser.addoption("--sklearn-full-version", action="store", default="0.20.0")
-    parser.addoption("--tf-full-version", action="store", default="2.2.0")
     parser.addoption("--ei-tf-full-version", action="store")
     parser.addoption("--xgboost-full-version", action="store", default="1.0-1")
@@ -304,35 +304,45 @@ def sklearn_full_version(request):
 
 
 @pytest.fixture(scope="module")
-def tf_full_version(request):
-    return request.config.getoption("--tf-full-version")
+def tf_training_latest_version():
+    return "2.2.0"
+
+
+@pytest.fixture(scope="module")
+def tf_training_latest_py_version():
+    return "py37"
+
+
+@pytest.fixture(scope="module")
+def tf_serving_latest_version():
+    return "2.1.0"
+
+
+@pytest.fixture(scope="module")
+def tf_full_version(tf_training_latest_version, tf_serving_latest_version):
+    """Fixture for TF tests that test both training and inference.
+
+    Fixture exists as such, since TF training and TFS have different latest versions.
+    Otherwise, this would simply be a single latest version.
+    """
+    return str(min(Version(tf_training_latest_version), Version(tf_serving_latest_version)))
 
 
 @pytest.fixture(scope="module")
 def tf_full_py_version(tf_full_version):
-    """fixture to match tf_full_version
+    """Fixture to match tf_full_version
 
-    Fixture exists as such, since tf_full_version may be overridden --tf-full-version.
+    Fixture exists as such, since TF training and TFS have different latest versions.
     Otherwise, this would simply be py37 to match the latest version support.
-
-    TODO: Evaluate use of --tf-full-version with possible eye to remove and simplify code.
""" - version = [int(val) for val in tf_full_version.split(".")] - if version < [1, 11]: + version = Version(tf_full_version) + if version < Version("1.11"): return "py2" - if version < [2, 2]: + if version < Version("2.2"): return "py3" return "py37" -@pytest.fixture(scope="module") -def tf_serving_version(tf_full_version): - full_version = [int(val) for val in tf_full_version.split(".")] - if full_version < [2, 2]: - return tf_full_version - return "2.1.0" - - @pytest.fixture(scope="module", params=["1.15.0", "2.0.0"]) def ei_tf_full_version(request): tf_ei_version = request.config.getoption("--ei-tf-full-version") diff --git a/tests/integ/test_airflow_config.py b/tests/integ/test_airflow_config.py index 4b43004cb5..0dc1ab4add 100644 --- a/tests/integ/test_airflow_config.py +++ b/tests/integ/test_airflow_config.py @@ -512,7 +512,7 @@ def test_sklearn_airflow_config_uploads_data_source_to_s3( @pytest.mark.canary_quick def test_tf_airflow_config_uploads_data_source_to_s3( - sagemaker_session, cpu_instance_type, tf_full_version, tf_full_py_version + sagemaker_session, cpu_instance_type, tf_training_latest_version, tf_training_latest_py_version ): with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS): tf = TensorFlow( @@ -524,8 +524,8 @@ def test_tf_airflow_config_uploads_data_source_to_s3( train_instance_count=SINGLE_INSTANCE_COUNT, train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, - framework_version=tf_full_version, - py_version=tf_full_py_version, + framework_version=tf_training_latest_version, + py_version=tf_training_latest_py_version, metric_definitions=[ {"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"} ], diff --git a/tests/integ/test_data_capture_config.py b/tests/integ/test_data_capture_config.py index b4ad979fcf..f230055418 100644 --- a/tests/integ/test_data_capture_config.py +++ b/tests/integ/test_data_capture_config.py @@ -41,7 +41,7 @@ def test_enabling_data_capture_on_endpoint_shows_correct_data_capture_status( - sagemaker_session, tf_serving_version + sagemaker_session, tf_serving_latest_version ): endpoint_name = unique_name_from_base("sagemaker-tensorflow-serving") model_data = sagemaker_session.upload_data( @@ -52,7 +52,7 @@ def test_enabling_data_capture_on_endpoint_shows_correct_data_capture_status( model = TensorFlowModel( model_data=model_data, role=ROLE, - framework_version=tf_serving_version, + framework_version=tf_serving_latest_version, sagemaker_session=sagemaker_session, ) predictor = model.deploy( @@ -98,7 +98,7 @@ def test_enabling_data_capture_on_endpoint_shows_correct_data_capture_status( def test_disabling_data_capture_on_endpoint_shows_correct_data_capture_status( - sagemaker_session, tf_serving_version + sagemaker_session, tf_serving_latest_version ): endpoint_name = unique_name_from_base("sagemaker-tensorflow-serving") model_data = sagemaker_session.upload_data( @@ -109,7 +109,7 @@ def test_disabling_data_capture_on_endpoint_shows_correct_data_capture_status( model = TensorFlowModel( model_data=model_data, role=ROLE, - framework_version=tf_serving_version, + framework_version=tf_serving_latest_version, sagemaker_session=sagemaker_session, ) destination_s3_uri = os.path.join( @@ -184,7 +184,7 @@ def test_disabling_data_capture_on_endpoint_shows_correct_data_capture_status( def test_updating_data_capture_on_endpoint_shows_correct_data_capture_status( - sagemaker_session, tf_serving_version + sagemaker_session, tf_serving_latest_version ): endpoint_name = 
sagemaker.utils.unique_name_from_base("sagemaker-tensorflow-serving") model_data = sagemaker_session.upload_data( @@ -195,7 +195,7 @@ def test_updating_data_capture_on_endpoint_shows_correct_data_capture_status( model = TensorFlowModel( model_data=model_data, role=ROLE, - framework_version=tf_serving_version, + framework_version=tf_serving_latest_version, sagemaker_session=sagemaker_session, ) destination_s3_uri = os.path.join( diff --git a/tests/integ/test_horovod.py b/tests/integ/test_horovod.py index cc37532ef3..cd76abfd12 100644 --- a/tests/integ/test_horovod.py +++ b/tests/integ/test_horovod.py @@ -15,10 +15,10 @@ import json import os import tarfile -from six.moves.urllib.parse import urlparse import boto3 import pytest +from six.moves.urllib.parse import urlparse import sagemaker.utils import tests.integ as integ @@ -28,27 +28,49 @@ horovod_dir = os.path.join(os.path.dirname(__file__), "..", "data", "horovod") -@pytest.fixture(scope="module") -def gpu_instance_type(request): - return "ml.p2.xlarge" - - @pytest.mark.canary_quick -def test_hvd_cpu(sagemaker_session, cpu_instance_type, tmpdir): - _create_and_fit_estimator(sagemaker_session, cpu_instance_type, tmpdir) +def test_hvd_cpu( + sagemaker_session, + tf_training_latest_version, + tf_training_latest_py_version, + cpu_instance_type, + tmpdir, +): + _create_and_fit_estimator( + sagemaker_session, + tf_training_latest_version, + tf_training_latest_py_version, + cpu_instance_type, + tmpdir, + ) @pytest.mark.canary_quick @pytest.mark.skipif( integ.test_region() in integ.TRAINING_NO_P2_REGIONS, reason="no ml.p2 instances in this region" ) -def test_hvd_gpu(sagemaker_session, gpu_instance_type, tmpdir): - _create_and_fit_estimator(sagemaker_session, gpu_instance_type, tmpdir) +def test_hvd_gpu( + sagemaker_session, tf_training_latest_version, tf_training_latest_py_version, tmpdir +): + _create_and_fit_estimator( + sagemaker_session, + tf_training_latest_version, + tf_training_latest_py_version, + "ml.p2.xlarge", + tmpdir, + ) @pytest.mark.local_mode @pytest.mark.parametrize("instances, processes", [[1, 2], (2, 1), (2, 2)]) -def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdir): +def test_horovod_local_mode( + sagemaker_local_session, + tf_training_latest_version, + tf_training_latest_py_version, + instances, + processes, + tmpdir, +): output_path = "file://%s" % tmpdir job_name = sagemaker.utils.unique_name_from_base("tf-horovod") estimator = TensorFlow( @@ -57,9 +79,9 @@ def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdi train_instance_count=2, train_instance_type="local", sagemaker_session=sagemaker_local_session, - py_version=integ.PYTHON_VERSION, output_path=output_path, - framework_version="1.12", + framework_version=tf_training_latest_version, + py_version=tf_training_latest_py_version, distributions={"mpi": {"enabled": True, "processes_per_host": processes}}, ) @@ -96,7 +118,7 @@ def extract_files_from_s3(s3_url, tmpdir, sagemaker_session): tar_file.extractall(tmpdir) -def _create_and_fit_estimator(sagemaker_session, instance_type, tmpdir): +def _create_and_fit_estimator(sagemaker_session, tf_version, py_version, instance_type, tmpdir): job_name = sagemaker.utils.unique_name_from_base("tf-horovod") estimator = TensorFlow( entry_point=os.path.join(horovod_dir, "hvd_basic.py"), @@ -104,8 +126,8 @@ def _create_and_fit_estimator(sagemaker_session, instance_type, tmpdir): train_instance_count=2, train_instance_type=instance_type, sagemaker_session=sagemaker_session, - 
py_version=integ.PYTHON_VERSION, - framework_version="1.12", + py_version=py_version, + framework_version=tf_version, distributions={"mpi": {"enabled": True}}, ) diff --git a/tests/integ/test_model_monitor.py b/tests/integ/test_model_monitor.py index c876b696b9..ce8391bd5f 100644 --- a/tests/integ/test_model_monitor.py +++ b/tests/integ/test_model_monitor.py @@ -88,7 +88,7 @@ @pytest.fixture(scope="module") -def predictor(sagemaker_session, tf_serving_version): +def predictor(sagemaker_session, tf_serving_latest_version): endpoint_name = unique_name_from_base("sagemaker-tensorflow-serving") model_data = sagemaker_session.upload_data( path=os.path.join(tests.integ.DATA_DIR, "tensorflow-serving-test-model.tar.gz"), @@ -100,7 +100,7 @@ def predictor(sagemaker_session, tf_serving_version): model = TensorFlowModel( model_data=model_data, role=ROLE, - framework_version=tf_serving_version, + framework_version=tf_serving_latest_version, sagemaker_session=sagemaker_session, ) predictor = model.deploy( diff --git a/tests/integ/test_tf.py b/tests/integ/test_tf.py index 428d2c59c4..ba960ab3ad 100644 --- a/tests/integ/test_tf.py +++ b/tests/integ/test_tf.py @@ -22,7 +22,7 @@ from sagemaker.utils import unique_name_from_base, sagemaker_timestamp import tests.integ -from tests.integ import kms_utils, timeout, PYTHON_VERSION +from tests.integ import kms_utils, timeout from tests.integ.retry import retries from tests.integ.s3_utils import assert_s3_files_exist @@ -39,7 +39,7 @@ def test_mnist_with_checkpoint_config( - sagemaker_session, instance_type, tf_full_version, tf_full_py_version + sagemaker_session, instance_type, tf_training_latest_version, tf_training_latest_py_version ): checkpoint_s3_uri = "s3://{}/checkpoints/tf-{}".format( sagemaker_session.default_bucket(), sagemaker_timestamp() @@ -51,8 +51,8 @@ def test_mnist_with_checkpoint_config( train_instance_count=1, train_instance_type=instance_type, sagemaker_session=sagemaker_session, - framework_version=tf_full_version, - py_version=tf_full_py_version, + framework_version=tf_training_latest_version, + py_version=tf_training_latest_py_version, metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}], checkpoint_s3_uri=checkpoint_s3_uri, checkpoint_local_path=checkpoint_local_path, @@ -82,7 +82,7 @@ def test_mnist_with_checkpoint_config( assert actual_training_checkpoint_config == expected_training_checkpoint_config -def test_server_side_encryption(sagemaker_session, tf_serving_version): +def test_server_side_encryption(sagemaker_session, tf_full_version, tf_full_py_version): with kms_utils.bucket_with_encryption(sagemaker_session, ROLE) as (bucket_with_kms, kms_key): output_path = os.path.join( bucket_with_kms, "test-server-side-encryption", time.strftime("%y%m%d-%H%M") @@ -95,8 +95,8 @@ def test_server_side_encryption(sagemaker_session, tf_serving_version): train_instance_count=1, train_instance_type="ml.c5.xlarge", sagemaker_session=sagemaker_session, - framework_version=tf_serving_version, - py_version=PYTHON_VERSION, + framework_version=tf_full_version, + py_version=tf_full_py_version, code_location=output_path, output_path=output_path, model_dir="/opt/ml/model", @@ -123,15 +123,17 @@ def test_server_side_encryption(sagemaker_session, tf_serving_version): @pytest.mark.canary_quick -def test_mnist_distributed(sagemaker_session, instance_type, tf_full_version, tf_full_py_version): +def test_mnist_distributed( + sagemaker_session, instance_type, tf_training_latest_version, tf_training_latest_py_version +): 
     estimator = TensorFlow(
         entry_point=SCRIPT,
         role=ROLE,
         train_instance_count=2,
         train_instance_type=instance_type,
         sagemaker_session=sagemaker_session,
-        framework_version=tf_full_version,
-        py_version=tf_full_py_version,
+        framework_version=tf_training_latest_version,
+        py_version=tf_training_latest_py_version,
         distributions=PARAMETER_SERVER_DISTRIBUTION,
     )
     inputs = estimator.sagemaker_session.upload_data(
@@ -147,16 +149,15 @@
     )
 
 
-def test_mnist_async(sagemaker_session, cpu_instance_type, tf_serving_version):
+def test_mnist_async(sagemaker_session, cpu_instance_type, tf_full_version, tf_full_py_version):
     estimator = TensorFlow(
         entry_point=SCRIPT,
         role=ROLE,
         train_instance_count=1,
         train_instance_type="ml.c5.4xlarge",
-        py_version=PYTHON_VERSION,
         sagemaker_session=sagemaker_session,
-        # testing py-sdk functionality, no need to run against all TF versions
-        framework_version=tf_serving_version,
+        framework_version=tf_full_version,
+        py_version=tf_full_py_version,
         tags=TAGS,
     )
     inputs = estimator.sagemaker_session.upload_data(
@@ -188,15 +189,17 @@
     _assert_model_name_match(sagemaker_session.sagemaker_client, endpoint_name, model_name)
 
 
-def test_deploy_with_input_handlers(sagemaker_session, instance_type, tf_serving_version):
+def test_deploy_with_input_handlers(
+    sagemaker_session, instance_type, tf_full_version, tf_full_py_version
+):
     estimator = TensorFlow(
         entry_point="training.py",
         source_dir=TFS_RESOURCE_PATH,
         role=ROLE,
         train_instance_count=1,
         train_instance_type=instance_type,
-        framework_version=tf_serving_version,
-        py_version=PYTHON_VERSION,
+        framework_version=tf_full_version,
+        py_version=tf_full_py_version,
         sagemaker_session=sagemaker_session,
         tags=TAGS,
     )
diff --git a/tests/integ/test_tf_efs_fsx.py b/tests/integ/test_tf_efs_fsx.py
index 295482d57f..f1f422fd71 100644
--- a/tests/integ/test_tf_efs_fsx.py
+++ b/tests/integ/test_tf_efs_fsx.py
@@ -55,7 +55,11 @@ def efs_fsx_setup(sagemaker_session, ec2_instance_type):
     reason="EFS integration tests need to be fixed before running in all regions.",
 )
 def test_mnist_efs(
-    efs_fsx_setup, sagemaker_session, cpu_instance_type, tf_full_version, tf_full_py_version
+    efs_fsx_setup,
+    sagemaker_session,
+    cpu_instance_type,
+    tf_training_latest_version,
+    tf_training_latest_py_version,
 ):
     role = efs_fsx_setup["role_name"]
     subnets = [efs_fsx_setup["subnet_id"]]
@@ -67,8 +71,8 @@ def test_mnist_efs(
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
-        framework_version=tf_full_version,
-        py_version=tf_full_py_version,
+        framework_version=tf_training_latest_version,
+        py_version=tf_training_latest_py_version,
         subnets=subnets,
         security_group_ids=security_group_ids,
     )
@@ -96,7 +100,11 @@
     reason="EFS integration tests need to be fixed before running in all regions.",
 )
 def test_mnist_lustre(
-    efs_fsx_setup, sagemaker_session, cpu_instance_type, tf_full_version, tf_full_py_version
+    efs_fsx_setup,
+    sagemaker_session,
+    cpu_instance_type,
+    tf_training_latest_version,
+    tf_training_latest_py_version,
 ):
     role = efs_fsx_setup["role_name"]
     subnets = [efs_fsx_setup["subnet_id"]]
@@ -108,8 +116,8 @@
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
-        framework_version=tf_full_version,
-        py_version=tf_full_py_version,
+        framework_version=tf_training_latest_version,
+        py_version=tf_training_latest_py_version,
         subnets=subnets,
         security_group_ids=security_group_ids,
     )
@@ -133,7 +141,11 @@
     reason="EFS integration tests need to be fixed before running in all regions.",
 )
 def test_tuning_tf_efs(
-    efs_fsx_setup, sagemaker_session, cpu_instance_type, tf_full_version, tf_full_py_version
+    efs_fsx_setup,
+    sagemaker_session,
+    cpu_instance_type,
+    tf_training_latest_version,
+    tf_training_latest_py_version,
 ):
     role = efs_fsx_setup["role_name"]
     subnets = [efs_fsx_setup["subnet_id"]]
@@ -145,8 +157,8 @@
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
-        framework_version=tf_full_version,
-        py_version=tf_full_py_version,
+        framework_version=tf_training_latest_version,
+        py_version=tf_training_latest_py_version,
         subnets=subnets,
         security_group_ids=security_group_ids,
     )
@@ -182,7 +194,11 @@
     reason="EFS integration tests need to be fixed before running in all regions.",
 )
 def test_tuning_tf_lustre(
-    efs_fsx_setup, sagemaker_session, cpu_instance_type, tf_full_version, tf_full_py_version
+    efs_fsx_setup,
+    sagemaker_session,
+    cpu_instance_type,
+    tf_training_latest_version,
+    tf_training_latest_py_version,
 ):
     role = efs_fsx_setup["role_name"]
     subnets = [efs_fsx_setup["subnet_id"]]
@@ -194,8 +210,8 @@
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
-        framework_version=tf_full_version,
-        py_version=tf_full_py_version,
+        framework_version=tf_training_latest_version,
+        py_version=tf_training_latest_py_version,
         subnets=subnets,
         security_group_ids=security_group_ids,
     )
diff --git a/tests/integ/test_tfs.py b/tests/integ/test_tfs.py
index 8999090ca6..acbf98401d 100644
--- a/tests/integ/test_tfs.py
+++ b/tests/integ/test_tfs.py
@@ -27,7 +27,7 @@
 
 
 @pytest.fixture(scope="module")
-def tfs_predictor(sagemaker_session, tf_serving_version):
+def tfs_predictor(sagemaker_session, tf_serving_latest_version):
     endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-tensorflow-serving")
     model_data = sagemaker_session.upload_data(
         path=os.path.join(tests.integ.DATA_DIR, "tensorflow-serving-test-model.tar.gz"),
@@ -37,7 +37,7 @@ def tfs_predictor(sagemaker_session, tf_serving_version):
     model = TensorFlowModel(
         model_data=model_data,
         role="SageMakerRole",
-        framework_version=tf_serving_version,
+        framework_version=tf_serving_latest_version,
         sagemaker_session=sagemaker_session,
     )
     predictor = model.deploy(1, "ml.c5.xlarge", endpoint_name=endpoint_name)
@@ -54,7 +54,7 @@ def tar_dir(directory, tmpdir):
 
 @pytest.fixture
 def tfs_predictor_with_model_and_entry_point_same_tar(
-    sagemaker_local_session, tf_serving_version, tmpdir
+    sagemaker_local_session, tf_serving_latest_version, tmpdir
 ):
     endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-tensorflow-serving")
 
@@ -65,7 +65,7 @@ def tfs_predictor_with_model_and_entry_point_same_tar(
     model = TensorFlowModel(
         model_data="file://" + model_tar,
         role="SageMakerRole",
-        framework_version=tf_serving_version,
+        framework_version=tf_serving_latest_version,
         sagemaker_session=sagemaker_local_session,
     )
     predictor = model.deploy(1, "local", endpoint_name=endpoint_name)
@@ -78,7 +78,7 @@ def tfs_predictor_with_model_and_entry_point_same_tar(
 
 @pytest.fixture(scope="module")
 def tfs_predictor_with_model_and_entry_point_and_dependencies(
-    sagemaker_local_session, tf_serving_version
+    sagemaker_local_session, tf_serving_latest_version
 ):
     endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-tensorflow-serving")
 
@@ -98,7 +98,7 @@ def tfs_predictor_with_model_and_entry_point_and_dependencies(
         model_data=model_data,
         role="SageMakerRole",
         dependencies=dependencies,
-        framework_version=tf_serving_version,
+        framework_version=tf_serving_latest_version,
         sagemaker_session=sagemaker_local_session,
     )
diff --git a/tests/integ/test_transformer.py b/tests/integ/test_transformer.py
index 77b75537a9..9f3a95097d 100644
--- a/tests/integ/test_transformer.py
+++ b/tests/integ/test_transformer.py
@@ -28,7 +28,6 @@
 from tests.integ import (
     datasets,
     DATA_DIR,
-    PYTHON_VERSION,
     TRAINING_DEFAULT_TIMEOUT_MINUTES,
     TRANSFORM_DEFAULT_TIMEOUT_MINUTES,
 )
@@ -334,7 +333,7 @@ def test_transform_mxnet_logs(
 
 
 def test_transform_tf_kms_network_isolation(
-    sagemaker_session, cpu_instance_type, tmpdir, tf_serving_version
+    sagemaker_session, cpu_instance_type, tmpdir, tf_full_version, tf_full_py_version
 ):
     data_path = os.path.join(DATA_DIR, "tensorflow_mnist")
 
@@ -343,8 +342,8 @@ def test_transform_tf_kms_network_isolation(
         role="SageMakerRole",
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
-        framework_version=tf_serving_version,
-        py_version=PYTHON_VERSION,
+        framework_version=tf_full_version,
+        py_version=tf_full_py_version,
         sagemaker_session=sagemaker_session,
     )
diff --git a/tests/integ/test_tuner.py b/tests/integ/test_tuner.py
index a1a14ee49f..d721d15e01 100644
--- a/tests/integ/test_tuner.py
+++ b/tests/integ/test_tuner.py
@@ -51,8 +51,6 @@
 from tests.integ.record_set import prepare_record_set_from_local_files
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
-DATA_PATH = os.path.join(DATA_DIR, "iris", "data")
-
 
 @pytest.fixture(scope="module")
 def kmeans_train_set(sagemaker_session):
@@ -590,8 +588,8 @@
 
 
 @pytest.mark.canary_quick
-def test_tuning_tf_script_mode(
-    sagemaker_session, cpu_instance_type, tf_full_version, tf_full_py_version
+def test_tuning_tf(
+    sagemaker_session, cpu_instance_type, tf_training_latest_version, tf_training_latest_py_version
 ):
     resource_path = os.path.join(DATA_DIR, "tensorflow_mnist")
     script_path = os.path.join(resource_path, "mnist.py")
@@ -602,8 +600,8 @@
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
-        framework_version=tf_full_version,
-        py_version=tf_full_py_version,
+        framework_version=tf_training_latest_version,
+        py_version=tf_training_latest_py_version,
     )
 
     hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)}
@@ -624,7 +622,7 @@
             path=os.path.join(resource_path, "data"), key_prefix="scriptmode/mnist"
         )
 
-        tuning_job_name = unique_name_from_base("tune-tf-script-mode", max_length=32)
+        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
         tuner.fit(inputs, job_name=tuning_job_name)
 
     print("Started hyperparameter tuning job with name: " + tuning_job_name)
@@ -633,13 +631,15 @@
     time.sleep(15)
     tuner.wait()
 
 
-@pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.")
-def test_tuning_tf_vpc_multi(sagemaker_session, cpu_instance_type):
+def test_tuning_tf_vpc_multi(
+    sagemaker_session, cpu_instance_type, tf_training_latest_version, tf_training_latest_py_version
+):
     """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
     instance_type = cpu_instance_type
     instance_count = 2
-    script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")
+    resource_path = os.path.join(DATA_DIR, "tensorflow_mnist")
+    script_path = os.path.join(resource_path, "mnist.py")
 
     ec2_client = sagemaker_session.boto_session.client("ec2")
     subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(ec2_client)
@@ -648,9 +648,8 @@
     estimator = TensorFlow(
         entry_point=script_path,
         role="SageMakerRole",
-        training_steps=1,
-        evaluation_steps=1,
-        hyperparameters={"input_tensor_name": "inputs"},
+        framework_version=tf_training_latest_version,
+        py_version=tf_training_latest_py_version,
         train_instance_count=instance_count,
         train_instance_type=instance_type,
         sagemaker_session=sagemaker_session,
@@ -658,31 +657,30 @@
         subnets=subnet_ids,
         security_group_ids=[security_group_id],
         encrypt_inter_container_traffic=True,
-        framework_version="1.11",
-        py_version=PYTHON_VERSION,
     )
 
-    inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix="integ-test-data/tf_iris")
-    hyperparameter_ranges = {"learning_rate": ContinuousParameter(0.05, 0.2)}
-
-    objective_metric_name = "loss"
-    metric_definitions = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]
+    hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)}
+    objective_metric_name = "accuracy"
+    metric_definitions = [{"Name": objective_metric_name, "Regex": "accuracy = ([0-9\\.]+)"}]
 
     tuner = HyperparameterTuner(
         estimator,
         objective_metric_name,
         hyperparameter_ranges,
         metric_definitions,
-        objective_type="Minimize",
         max_jobs=2,
        max_parallel_jobs=2,
     )
 
-    tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
     with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
+        inputs = estimator.sagemaker_session.upload_data(
+            path=os.path.join(resource_path, "data"), key_prefix="scriptmode/mnist"
+        )
+
+        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
         tuner.fit(inputs, job_name=tuning_job_name)
 
-    print("Started hyperparameter tuning job with name:" + tuning_job_name)
+    print(f"Started hyperparameter tuning job with name: {tuning_job_name}")
 
     time.sleep(15)
     tuner.wait()
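
For reference, the version-resolution logic that the reworked conftest.py fixtures introduce can be exercised on its own. The sketch below is illustrative only and is not part of the patch; it assumes nothing beyond the packaging dependency the diff already imports, and the hard-coded "2.2.0"/"2.1.0" values simply mirror the tf_training_latest_version and tf_serving_latest_version fixtures above.

from packaging.version import Version

tf_training_latest_version = "2.2.0"  # mirrors the tf_training_latest_version fixture
tf_serving_latest_version = "2.1.0"   # mirrors the tf_serving_latest_version fixture

# tf_full_version: the older of the two "latest" versions, so a single test can
# both train and deploy with one framework_version value.
tf_full_version = str(min(Version(tf_training_latest_version), Version(tf_serving_latest_version)))
assert tf_full_version == "2.1.0"

# tf_full_py_version: pick the Python tag that matches that TF version.
version = Version(tf_full_version)
if version < Version("1.11"):
    tf_full_py_version = "py2"
elif version < Version("2.2"):
    tf_full_py_version = "py3"
else:
    tf_full_py_version = "py37"
assert tf_full_py_version == "py3"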