From 57f57efb229b1ea174e23d0805cf5ab4c6829bfb Mon Sep 17 00:00:00 2001 From: Samrudhi Sharma Date: Thu, 15 Feb 2024 16:17:38 -0800 Subject: [PATCH] Enhance model builder selection logic to include model size --- .../extras/huggingface_requirements.txt | 7 ++ requirements/extras/test_requirements.txt | 7 ++ setup.py | 1 + src/sagemaker/serve/builder/model_builder.py | 66 ++++++++++++++++++- ...gpu.py => test_serve_model_builder_gpu.py} | 7 +- .../serve/test_serve_transformers.py | 2 +- 6 files changed, 84 insertions(+), 6 deletions(-) create mode 100644 requirements/extras/huggingface_requirements.txt rename tests/integ/sagemaker/serve/{test_model_builder_gpu.py => test_serve_model_builder_gpu.py} (95%) diff --git a/requirements/extras/huggingface_requirements.txt b/requirements/extras/huggingface_requirements.txt new file mode 100644 index 0000000000..d9e1487dd4 --- /dev/null +++ b/requirements/extras/huggingface_requirements.txt @@ -0,0 +1,7 @@ +accelerate +numpy>=1.17 +packaging>=20.0 +psutil +pyyaml +torch>=1.10.0 +huggingface_hub diff --git a/requirements/extras/test_requirements.txt b/requirements/extras/test_requirements.txt index ba7d8c3849..6db6b484af 100644 --- a/requirements/extras/test_requirements.txt +++ b/requirements/extras/test_requirements.txt @@ -39,3 +39,10 @@ tritonclient[http]<2.37.0 onnx==1.14.1 # tf2onnx==1.15.1 nbformat>=5.9,<6 +accelerate +numpy>=1.17 +packaging>=20.0 +psutil +pyyaml +torch>=1.10.0 +huggingface_hub diff --git a/setup.py b/setup.py index b1070319d3..5b8845efed 100644 --- a/setup.py +++ b/setup.py @@ -79,6 +79,7 @@ def read_requirements(filename): "feature-processor": read_requirements( "requirements/extras/feature-processor_requirements.txt" ), + "huggingface": read_requirements("requirements/extras/huggingface_requirements.txt"), } # Meta dependency groups extras["all"] = [item for group in extras.values() for item in group] diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py index 0ade8096f6..6977a13f1a 100644 --- a/src/sagemaker/serve/builder/model_builder.py +++ b/src/sagemaker/serve/builder/model_builder.py @@ -20,7 +20,9 @@ from pathlib import Path +from accelerate.commands.estimate import estimate_command_parser, gather_data from sagemaker import Session +from sagemaker.djl_inference import defaults from sagemaker.model import Model from sagemaker.base_predictor import PredictorBase from sagemaker.serializers import NumpySerializer, TorchTensorSerializer @@ -39,6 +41,7 @@ from sagemaker.serve.save_retrive.version_1_0_0.metadata.metadata import Metadata from sagemaker.serve.spec.inference_spec import InferenceSpec from sagemaker.serve.utils.predictors import _get_local_mode_predictor +from sagemaker.serve.utils.hardware_detector import _get_gpu_info, _get_gpu_info_fallback from sagemaker.serve.detector.image_detector import ( auto_detect_container, _detect_framework_and_version, @@ -65,6 +68,8 @@ ModelServer.DJL_SERVING, } +MIB_CONVERSION_FACTOR = 0.00000095367431640625 +MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer # pylint: disable=attribute-defined-outside-init @dataclass @@ -567,7 +572,7 @@ def wrapper(*args, **kwargs): # It supports two modes of deployment # 1/ SageMaker Endpoint # 2/ Local launch with container - def build( + def build( # pylint: disable=R0911 self, mode: Type[Mode] = None, role_arn: str = None, @@ -616,6 +621,10 @@ def build( ) if hf_model_md.get("pipeline_tag") == "text-generation": # pylint: disable=R1705 return self._build_for_tgi() + elif 
self.can_fit_on_single_gpu():
+                return self._build_for_transformers()
+            elif self.model in defaults.FASTER_TRANSFORMER_SUPPORTED_ARCHITECTURES:
+                return self._build_for_djl()
             else:
                 return self._build_for_transformers()
 
@@ -672,3 +681,58 @@ def validate(self, model_dir: str) -> Type[bool]:
         """
         return get_metadata(model_dir)
+
+    def total_inference_model_size_mib(self):
+        """Calculates the model size in MiB via HF accelerate
+
+        This function gets the raw model size from accelerate, applies a 20%
+        memory buffer (inference can consume up to an additional 20% of the
+        model size, as found by EleutherAI), and converts the result to MiB.
+        """
+        dtypes = self.env_vars.get("dtypes", "float32")
+
+        try:
+            parser = estimate_command_parser()
+            args = parser.parse_args([self.model, "--dtypes", dtypes])
+        except ValueError:
+            logger.error("Args specified incorrectly for model %s", self.model)
+            raise
+
+        output = gather_data(
+            args
+        )  # "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam"
+
+        total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
+        logger.info("Total memory size MiB: %s", total_memory_size_mib)
+        return total_memory_size_mib
+
+    def can_fit_on_single_gpu(self):
+        """Check whether the model can fit on a single GPU
+
+        Gets the GPU info (or a static fallback) for the instance and returns
+        True if the estimated model size fits in a single GPU's memory.
+        """
+        try:
+            gpu_info = _get_gpu_info(self.instance_type, self.sagemaker_session)
+            logger.info("GPU info %s for instance %s", gpu_info, self.instance_type)
+            single_gpu_size_mib = gpu_info[1] / gpu_info[0]
+        except ValueError:
+            gpu_fallback = _get_gpu_info_fallback(
+                self.instance_type, self.sagemaker_session.boto_region_name
+            )
+            logger.info("GPU fallback picked up %s", gpu_fallback)
+            single_gpu_size_mib = gpu_fallback[1] / gpu_fallback[0]
+
+        if single_gpu_size_mib is None:
+            logger.info("Unable to determine single GPU size for instance %s", self.instance_type)
+            return False
+
+        total_size_mib = self.total_inference_model_size_mib()
+        if total_size_mib <= single_gpu_size_mib:
+            logger.info(
+                "Total inference model size MiB %s, single GPU size for instance MiB %s",
+                total_size_mib,
+                single_gpu_size_mib,
+            )
+            return True
+        return False
diff --git a/tests/integ/sagemaker/serve/test_model_builder_gpu.py b/tests/integ/sagemaker/serve/test_serve_model_builder_gpu.py
similarity index 95%
rename from tests/integ/sagemaker/serve/test_model_builder_gpu.py
rename to tests/integ/sagemaker/serve/test_serve_model_builder_gpu.py
index c539b9da27..b55c88a38a 100644
--- a/tests/integ/sagemaker/serve/test_model_builder_gpu.py
+++ b/tests/integ/sagemaker/serve/test_serve_model_builder_gpu.py
@@ -13,9 +13,8 @@
 from __future__ import absolute_import
 
 import pytest
-from sagemaker.serve import Mode
-from sagemaker.serve.builder.model_builder import ModelBuilder
 from sagemaker.serve.builder.schema_builder import SchemaBuilder
+from sagemaker.serve.builder.model_builder import ModelBuilder, Mode
 from tests.integ.sagemaker.serve.constants import (
     HF_DIR,
     PYTHON_VERSION_IS_NOT_310,
@@ -90,10 +89,10 @@ def model_builder(request):
 def test_non_text_generation_model_single_GPU(sagemaker_session, model_builder, model_input):
     iam_client = sagemaker_session.boto_session.client("iam")
     role_arn = iam_client.get_role(RoleName="SageMakerRole")["Role"]["Arn"]
+    model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
     caught_ex = None
     with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
         try:
-            model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
             logger.info("Running in SAGEMAKER_ENDPOINT mode")
             predictor = model.deploy(
                 mode=Mode.SAGEMAKER_ENDPOINT,
@@ -137,9 +136,9 @@ def test_non_text_generation_model_multi_GPU(sagemaker_session, model_builder, model_input):
     iam_client = sagemaker_session.boto_session.client("iam")
     role_arn = iam_client.get_role(RoleName="SageMakerRole")["Role"]["Arn"]
     caught_ex = None
+    model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
     with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
         try:
-            model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
             logger.info("Running in SAGEMAKER_ENDPOINT mode")
             predictor = model.deploy(
                 mode=Mode.SAGEMAKER_ENDPOINT,
diff --git a/tests/integ/sagemaker/serve/test_serve_transformers.py b/tests/integ/sagemaker/serve/test_serve_transformers.py
index 735f60d0f2..fc817e7cdd 100644
--- a/tests/integ/sagemaker/serve/test_serve_transformers.py
+++ b/tests/integ/sagemaker/serve/test_serve_transformers.py
@@ -106,7 +106,7 @@ def test_pytorch_transformers_sagemaker_endpoint(
     with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
         try:
             logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...")
-            predictor = model.deploy(instance_type=gpu_instance_type, initial_instance_count=1)
+            predictor = model.deploy(instance_type="ml.g4dn.xlarge", initial_instance_count=1)
             logger.info("Endpoint successfully deployed.")
             predictor.predict(input)
         except Exception as e:
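
Example usage (an illustrative sketch of the new selection path, not part of
the patch; the model id, instance type, and role ARN below are placeholders):
with this change, build() estimates a non-text-generation Hugging Face model's
memory footprint via accelerate and, when the model fits on a single GPU of
the target instance, routes it to the transformers serving path before falling
back to DJL.

    from sagemaker.serve.builder.schema_builder import SchemaBuilder
    from sagemaker.serve.builder.model_builder import ModelBuilder, Mode

    # Sample input/output pairs let SchemaBuilder derive the endpoint schema.
    schema_builder = SchemaBuilder(
        sample_input={"inputs": "The capital of France is [MASK]."},
        sample_output=[{"sequence": "the capital of france is paris."}],
    )

    model_builder = ModelBuilder(
        model="bert-base-uncased",  # placeholder: a non-text-generation HF model id
        schema_builder=schema_builder,
        mode=Mode.SAGEMAKER_ENDPOINT,
        instance_type="ml.g4dn.xlarge",  # single-GPU instance; feeds can_fit_on_single_gpu()
    )

    # build() now consults total_inference_model_size_mib() and
    # can_fit_on_single_gpu() before choosing a model server.
    model = model_builder.build(role_arn="arn:aws:iam::<account>:role/SageMakerRole")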