Enhance model builder selection logic to include model size
samruds committed Feb 16, 2024
1 parent 0900405 commit 57f57ef
Showing 6 changed files with 84 additions and 6 deletions.
7 changes: 7 additions & 0 deletions requirements/extras/huggingface_requirements.txt
@@ -0,0 +1,7 @@
accelerate
numpy>=1.17
packaging>=20.0
psutil
pyyaml
torch>=1.10.0
huggingface_hub
7 changes: 7 additions & 0 deletions requirements/extras/test_requirements.txt
@@ -39,3 +39,10 @@ tritonclient[http]<2.37.0
onnx==1.14.1
# tf2onnx==1.15.1
nbformat>=5.9,<6
accelerate
numpy>=1.17
packaging>=20.0
psutil
pyyaml
torch>=1.10.0
huggingface_hub
1 change: 1 addition & 0 deletions setup.py
@@ -79,6 +79,7 @@ def read_requirements(filename):
"feature-processor": read_requirements(
"requirements/extras/feature-processor_requirements.txt"
),
"huggingface": read_requirements("requirements/extras/huggingface_requirements.txt"),
}
# Meta dependency groups
extras["all"] = [item for group in extras.values() for item in group]
66 changes: 65 additions & 1 deletion src/sagemaker/serve/builder/model_builder.py
@@ -20,7 +20,9 @@

from pathlib import Path

from accelerate.commands.estimate import estimate_command_parser, gather_data
from sagemaker import Session
from sagemaker.djl_inference import defaults
from sagemaker.model import Model
from sagemaker.base_predictor import PredictorBase
from sagemaker.serializers import NumpySerializer, TorchTensorSerializer
@@ -39,6 +41,7 @@
from sagemaker.serve.save_retrive.version_1_0_0.metadata.metadata import Metadata
from sagemaker.serve.spec.inference_spec import InferenceSpec
from sagemaker.serve.utils.predictors import _get_local_mode_predictor
from sagemaker.serve.utils.hardware_detector import _get_gpu_info, _get_gpu_info_fallback
from sagemaker.serve.detector.image_detector import (
auto_detect_container,
_detect_framework_and_version,
@@ -65,6 +68,8 @@
ModelServer.DJL_SERVING,
}

MIB_CONVERSION_FACTOR = 0.00000095367431640625
MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer

# pylint: disable=attribute-defined-outside-init
@dataclass
@@ -567,7 +572,7 @@ def wrapper(*args, **kwargs):
# It supports two modes of deployment
# 1/ SageMaker Endpoint
# 2/ Local launch with container
def build(
def build( # pylint: disable=R0911
self,
mode: Type[Mode] = None,
role_arn: str = None,
@@ -616,6 +621,10 @@ def build(
)
if hf_model_md.get("pipeline_tag") == "text-generation": # pylint: disable=R1705
return self._build_for_tgi()
elif self.can_fit_on_single_gpu():
return self._build_for_transformers()
elif self.model in defaults.FASTER_TRANSFORMER_SUPPORTED_ARCHITECTURES:
return self._build_for_djl()
else:
return self._build_for_transformers()
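
Read as a whole, the new branch order in build() is: text-generation models go to TGI, anything that fits on a single GPU goes to the transformers backend, known FasterTransformer architectures go to DJL Serving, and everything else falls back to transformers. A minimal standalone sketch of that decision order (hypothetical helper form, not part of the commit):

def select_backend(pipeline_tag, fits_on_single_gpu, model_id, ft_supported):
    # Mirrors the if/elif chain added to build() above.
    if pipeline_tag == "text-generation":
        return "tgi"
    if fits_on_single_gpu:
        return "transformers"
    if model_id in ft_supported:
        return "djl"
    return "transformers"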

@@ -672,3 +681,58 @@ def validate(self, model_dir: str) -> Type[bool]:
"""

return get_metadata(model_dir)

    def total_inference_model_size_mib(self):
        """Calculates the model size from HF accelerate

        This function gets the model size from accelerate, adds a padding
        buffer, and converts the size to MiB. When performing inference,
        expect to add up to an additional 20% to the given model size, as
        found by EleutherAI.
        """
dtypes = "float32"
try:
if self.env_vars.get("dtypes"):
dtypes = self.env_vars.get("dtypes")

parser = estimate_command_parser()
args = parser.parse_args([self.model, "--dtypes", dtypes])
        except ValueError:
            logger.error("Args specified incorrectly for model %s", self.model)
            raise  # args would be unbound below, so surface the parse failure

output = gather_data(
args
) # "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam"

total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
logger.info("Total memory size MIB: %s", total_memory_size_mib)
return total_memory_size_mib
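
As a rough worked example of this conversion (the model size is an assumed, illustrative number, not a value from the commit), a 7-billion-parameter float32 model occupies about 7e9 * 4 bytes, and the buffered estimate works out as:

total_size_bytes = 7_000_000_000 * 4  # 4 bytes per float32 parameter
buffered_mib = MEMORY_BUFFER_MULTIPLIER * total_size_bytes * MIB_CONVERSION_FACTOR
# 1.2 * 2.8e10 / 1048576 ~= 32043 MiB, i.e. roughly 31.3 GiB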

    def can_fit_on_single_gpu(self):
        """Check if the model can fit on a single GPU

        This function gets the GPU info, or falls back to a default, to
        determine the memory size of a single GPU. It returns True if the
        model size is less than or equal to the single-GPU memory size.
        """
try:
gpu_info = _get_gpu_info(self.instance_type, self.sagemaker_session)
logger.info("GPU info %s for instance %s", gpu_info, self.instance_type)
single_gpu_size_mib = gpu_info[1] / gpu_info[0]
except ValueError:
gpu_fallback = _get_gpu_info_fallback(
self.instance_type, self.sagemaker_session.boto_region_name
)
logger.info("GPU fallback picked up %s", gpu_fallback)
single_gpu_size_mib = gpu_fallback[1] / gpu_fallback[0]

if single_gpu_size_mib is None:
logger.info("Unable to determine single GPU size for instance %s", self.instance_type)
return False

        total_size_mib = self.total_inference_model_size_mib()
        if total_size_mib <= single_gpu_size_mib:
            logger.info(
                "Total inference model size MiB %s, single GPU size for instance MiB %s",
                total_size_mib,
                single_gpu_size_mib,
            )
            return True
        return False
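
Putting the two methods together, the per-GPU budget is the instance's total GPU memory divided by its GPU count, and the buffered model size is compared against that budget. A small sketch with assumed hardware numbers (the real values come from _get_gpu_info or its fallback; the (count, total MiB) tuple shape matches how gpu_info is indexed above):

gpu_info = (4, 98304)  # assumed: 4 GPUs, 96 GiB of total GPU memory in MiB
single_gpu_size_mib = gpu_info[1] / gpu_info[0]  # 24576 MiB per GPU

model_size_mib = 32043  # buffered 7B float32 estimate from the example above
fits = model_size_mib <= single_gpu_size_mib  # False -> cannot fit on one GPU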
@@ -13,9 +13,8 @@
from __future__ import absolute_import

import pytest
from sagemaker.serve import Mode
from sagemaker.serve.builder.model_builder import ModelBuilder
from sagemaker.serve.builder.schema_builder import SchemaBuilder
from sagemaker.serve.builder.model_builder import ModelBuilder, Mode
from tests.integ.sagemaker.serve.constants import (
HF_DIR,
PYTHON_VERSION_IS_NOT_310,
@@ -90,10 +89,10 @@ def model_builder(request):
def test_non_text_generation_model_single_GPU(sagemaker_session, model_builder, model_input):
iam_client = sagemaker_session.boto_session.client("iam")
role_arn = iam_client.get_role(RoleName="SageMakerRole")["Role"]["Arn"]
model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
caught_ex = None
with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
try:
model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
logger.info("Running in SAGEMAKER_ENDPOINT mode")
predictor = model.deploy(
mode=Mode.SAGEMAKER_ENDPOINT,
@@ -137,9 +136,9 @@ def test_non_text_generation_model_multi_GPU(sagemaker_session, model_builder, m
iam_client = sagemaker_session.boto_session.client("iam")
role_arn = iam_client.get_role(RoleName="SageMakerRole")["Role"]["Arn"]
caught_ex = None
model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
try:
model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
logger.info("Running in SAGEMAKER_ENDPOINT mode")
predictor = model.deploy(
mode=Mode.SAGEMAKER_ENDPOINT,
2 changes: 1 addition & 1 deletion tests/integ/sagemaker/serve/test_serve_transformers.py
@@ -106,7 +106,7 @@ def test_pytorch_transformers_sagemaker_endpoint(
with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
try:
logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...")
predictor = model.deploy(instance_type=gpu_instance_type, initial_instance_count=1)
predictor = model.deploy(instance_type="ml.g4dn.xlarge", initial_instance_count=1)
logger.info("Endpoint successfully deployed.")
predictor.predict(input)
except Exception as e:
