Skip to content

Commit

Permalink
change: Enhance model builder selection logic to include model size (a…
Browse files Browse the repository at this point in the history
…ws#4429)

* change: Enhance model builder selection logic to include model size

* Fix conflicts

* Address PR comments

* fix formatting

* fix formatting of test

* Fix token in tasks.json

* Increase coverage for tests

* fix formatting

* Fix requirements

* Import code instead of importing accelerate

* Fix formatting

* Setup dependencies
  • Loading branch information
samruds authored and root committed Mar 6, 2024
1 parent 3b538fb commit 8d3e60c
Show file tree
Hide file tree
Showing 10 changed files with 796 additions and 10 deletions.
1 change: 1 addition & 0 deletions doc/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ docutils==0.15.2
packaging==20.9
jinja2==3.1.3
schema==0.7.5
accelerate>=0.24.1,<=0.27.0
1 change: 1 addition & 0 deletions requirements/extras/huggingface_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
accelerate>=0.24.1,<=0.27.0
1 change: 1 addition & 0 deletions requirements/extras/test_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,4 @@ tritonclient[http]<2.37.0
onnx==1.14.1
# tf2onnx==1.15.1
nbformat>=5.9,<6
accelerate>=0.24.1,<=0.27.0
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def read_requirements(filename):
"feature-processor": read_requirements(
"requirements/extras/feature-processor_requirements.txt"
),
"huggingface": read_requirements("requirements/extras/huggingface_requirements.txt"),
}
# Meta dependency groups
extras["all"] = [item for group in extras.values() for item in group]
Expand Down
78 changes: 77 additions & 1 deletion src/sagemaker/serve/builder/model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@

from pathlib import Path

from accelerate.commands.estimate import estimate_command_parser, gather_data
from sagemaker import Session
from sagemaker.model import Model
from sagemaker.base_predictor import PredictorBase
from sagemaker.djl_inference import defaults
from sagemaker.serializers import NumpySerializer, TorchTensorSerializer
from sagemaker.deserializers import JSONDeserializer, TorchTensorDeserializer
from sagemaker.serve.builder.schema_builder import SchemaBuilder
Expand All @@ -41,6 +43,7 @@
from sagemaker.serve.utils import task
from sagemaker.serve.utils.exceptions import TaskNotFoundException
from sagemaker.serve.utils.predictors import _get_local_mode_predictor
from sagemaker.serve.utils.hardware_detector import _get_gpu_info, _get_gpu_info_fallback
from sagemaker.serve.detector.image_detector import (
auto_detect_container,
_detect_framework_and_version,
Expand All @@ -67,6 +70,9 @@
ModelServer.DJL_SERVING,
}

# Multiply a byte count by this factor to obtain MiB (1 MiB = 1024 * 1024 bytes).
MIB_CONVERSION_FACTOR = 1 / (1024 * 1024)
# Headroom applied on top of the raw model size: 1.2 == model size plus a 20% buffer.
MEMORY_BUFFER_MULTIPLIER = 1.2


# pylint: disable=attribute-defined-outside-init
@dataclass
Expand Down Expand Up @@ -569,7 +575,7 @@ def wrapper(*args, **kwargs):
# It supports two modes of deployment
# 1/ SageMaker Endpoint
# 2/ Local launch with container
def build(
def build( # pylint: disable=R0911
self,
mode: Type[Mode] = None,
role_arn: str = None,
Expand Down Expand Up @@ -625,6 +631,13 @@ def build(

if model_task == "text-generation": # pylint: disable=R1705
return self._build_for_tgi()
elif self._can_fit_on_single_gpu():
return self._build_for_transformers()
elif (
self.model in defaults.DEEPSPEED_RECOMMENDED_ARCHITECTURES
or self.model in defaults.FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES
):
return self._build_for_djl()
else:
return self._build_for_transformers()

Expand Down Expand Up @@ -696,3 +709,66 @@ def _schema_builder_init(self, model_task: str):
self.schema_builder = SchemaBuilder(sample_inputs, sample_outputs)
except ValueError:
raise TaskNotFoundException(f"Schema builder for {model_task} could not be found.")

def _total_inference_model_size_mib(self):
    """Estimate the memory needed to run inference on ``self.model``, in MiB.

    The raw model size is obtained from Hugging Face ``accelerate``'s estimate
    command for the dtype found under ``"dtypes"`` in ``self.env_vars``
    (default ``"float32"``). The size is then padded by
    ``MEMORY_BUFFER_MULTIPLIER`` because, as found by EleutherAI, inference can
    need up to an additional 20% on top of the model size.

    Returns:
        The padded model size converted to MiB.

    Raises:
        ValueError: If accelerate returns no size estimate for the model.
    """
    dtypes = self.env_vars.get("dtypes", "float32")
    parser = estimate_command_parser()
    args = parser.parse_args([self.model, "--dtypes", dtypes])

    # Each row: "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam"
    output = gather_data(args)

    # Treat an empty result like a missing one: indexing it would raise an
    # IndexError, which callers that handle ValueError would not catch.
    if not output:
        raise ValueError(f"Could not get Model size for {self.model}")

    total_size_bytes = output[0][2]
    total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * total_size_bytes * MIB_CONVERSION_FACTOR
    logger.info("Total memory size MIB: %s", total_memory_size_mib)
    return total_memory_size_mib

def _can_fit_on_single_gpu(self) -> bool:
    """Check whether the model can fit on a single GPU.

    Returns:
        True if the padded inference model size is less than or equal to the
        memory of a single GPU on ``self.instance_type``. False if the model
        is larger, or if either size lookup raises ``ValueError``.
    """
    try:
        single_gpu_size_mib = self._try_fetch_gpu_info()
        # Estimate once and reuse the value for both the comparison and the
        # log line, instead of running the size estimation twice.
        model_size_mib = self._total_inference_model_size_mib()
    except ValueError:
        logger.info("Unable to determine single GPU size for instance %s", self.instance_type)
        return False

    if model_size_mib <= single_gpu_size_mib:
        logger.info(
            "Total inference model size MIB %s, single GPU size for instance MIB %s",
            model_size_mib,
            single_gpu_size_mib,
        )
        return True
    return False

def _try_fetch_gpu_info(self):
    """Return the memory size of a single GPU for ``self.instance_type``.

    ``_get_gpu_info`` is tried first. If it raises ``ValueError``, the lookup
    falls back to ``_get_gpu_info_fallback`` using the session's region.

    Returns:
        Element 1 of the GPU info divided by element 0.
        NOTE(review): presumably (GPU count, total GPU memory in MiB), giving
        per-GPU memory in MiB; confirm against the hardware detector helpers.

    Raises:
        ValueError: If neither lookup can provide GPU info for the instance.
    """
    try:
        gpu_info = _get_gpu_info(self.instance_type, self.sagemaker_session)
        logger.info("GPU info %s for instance %s", gpu_info, self.instance_type)
        return gpu_info[1] / gpu_info[0]
    except ValueError as primary_error:
        # Not fatal: record why the primary lookup failed, then try the fallback.
        logger.debug(
            "Primary GPU info lookup failed for instance %s: %s",
            self.instance_type,
            primary_error,
        )

    try:
        gpu_fallback = _get_gpu_info_fallback(
            self.instance_type, self.sagemaker_session.boto_region_name
        )
        logger.info("GPU fallback picked up %s", gpu_fallback)
        return gpu_fallback[1] / gpu_fallback[0]
    except ValueError as fallback_error:
        # Chain the cause so the underlying lookup failure stays in the traceback.
        raise ValueError(
            f"Unable to determine single GPU size for instance: [{self.instance_type}]"
        ) from fallback_error
)
2 changes: 1 addition & 1 deletion src/sagemaker/serve/schema/task.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"fill-mask": {
"sample_inputs": {
"properties": {
"inputs": "Paris is the <mask> of France.",
"inputs": "Paris is the [MASK] of France.",
"parameters": {}
}
},
Expand Down
184 changes: 184 additions & 0 deletions tests/integ/sagemaker/serve/test_serve_model_builder_gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import pytest
from sagemaker.serve.builder.schema_builder import SchemaBuilder
from sagemaker.serve.builder.model_builder import ModelBuilder, Mode
import tests.integ
from tests.integ.sagemaker.serve.constants import (
HF_DIR,
PYTHON_VERSION_IS_NOT_310,
SERVE_SAGEMAKER_ENDPOINT_TIMEOUT,
)
from tests.integ.timeout import timeout
from tests.integ.utils import cleanup_model_resources, gpu_list, retry_with_instance_list
import logging

logger = logging.getLogger(__name__)

# Hugging Face model id passed to ModelBuilder by the fixture in this module.
model_id = "bert-base-uncased"

# Example fill-mask request; paired with sample_output to build the SchemaBuilder.
sample_input = {"inputs": "Hello I'm a [MASK] model."}

# Example response for sample_input: five candidate completions of the masked
# token, each with a score, token id, token string and the completed sequence.
sample_output = [
{
"score": 0.10731109976768494,
"token": 4827,
"token_str": "fashion",
"sequence": "hello i'm a fashion model.",
},
{
"score": 0.08774465322494507,
"token": 2535,
"token_str": "role",
"sequence": "hello i'm a role model.",
},
{
"score": 0.05338414013385773,
"token": 2047,
"token_str": "new",
"sequence": "hello i'm a new model.",
},
{
"score": 0.04667224362492561,
"token": 3565,
"token_str": "super",
"sequence": "hello i'm a super model.",
},
{
"score": 0.027096163481473923,
"token": 2986,
"token_str": "fine",
"sequence": "hello i'm a fine model.",
},
]


@pytest.fixture
def model_input():
    """Provide the fill-mask payload that the tests send to ``predictor.predict``."""
    payload = {"inputs": "The man worked as a [MASK]."}
    return payload


@pytest.fixture
def model_builder_model_schema_builder():
    """Create a ModelBuilder for ``model_id`` with a schema built from the module samples."""
    schema = SchemaBuilder(sample_input, sample_output)
    return ModelBuilder(model_path=HF_DIR, model=model_id, schema_builder=schema)


@pytest.fixture
def model_builder(request):
    """Resolve the fixture whose name is supplied through indirect parametrization."""
    fixture_name = request.param
    return request.getfixturevalue(fixture_name)


@pytest.mark.skipif(
    PYTHON_VERSION_IS_NOT_310,
    tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
    and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
    reason="no ml.p2 or ml.p3 instances in this region",
)
@retry_with_instance_list(gpu_list(tests.integ.test_region()))
@pytest.mark.parametrize("model_builder", ["model_builder_model_schema_builder"], indirect=True)
def test_non_text_generation_model_single_GPU(
    sagemaker_session, model_builder, model_input, **kwargs
):
    """Build, deploy and invoke a non text-generation model on a GPU instance.

    The test deploys to a SageMaker endpoint on ``kwargs["instance_type"]``,
    checks that a prediction is returned and that the endpoint config uses the
    requested instance type. Model and endpoint resources are always cleaned
    up, and any error raised along the way fails the test afterwards.
    """
    iam_client = sagemaker_session.boto_session.client("iam")
    role_arn = iam_client.get_role(RoleName="SageMakerRole")["Role"]["Arn"]
    model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
    caught_ex = None
    with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
        try:
            logger.info("Running in SAGEMAKER_ENDPOINT mode")
            predictor = model.deploy(
                mode=Mode.SAGEMAKER_ENDPOINT,
                instance_type=kwargs["instance_type"],
                initial_instance_count=1,
            )
            logger.info("Endpoint successfully deployed.")
            prediction = predictor.predict(model_input)
            assert prediction is not None

            endpoint_name = predictor.endpoint_name
            sagemaker_client = sagemaker_session.boto_session.client("sagemaker")
            endpoint_config_name = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)[
                "EndpointConfigName"
            ]
            actual_instance_type = sagemaker_client.describe_endpoint_config(
                EndpointConfigName=endpoint_config_name
            )["ProductionVariants"][0]["InstanceType"]
            assert kwargs["instance_type"] == actual_instance_type
        except Exception as e:
            # Defer the failure so the cleanup below always runs first.
            caught_ex = e
        finally:
            cleanup_model_resources(
                sagemaker_session=model_builder.sagemaker_session,
                model_name=model.name,
                endpoint_name=model.endpoint_name,
            )
        if caught_ex:
            # No exception is active at this point, so logger.exception() would
            # log an empty traceback; pass the saved exception explicitly.
            logger.error("Model builder single GPU test failed", exc_info=caught_ex)
            raise AssertionError(
                f"Exception {caught_ex} was thrown when running model builder single GPU test"
            ) from caught_ex


@pytest.mark.skipif(
    PYTHON_VERSION_IS_NOT_310,
    tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
    and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
    reason="no ml.p2 or ml.p3 instances in this region",
)
@retry_with_instance_list(gpu_list(tests.integ.test_region()))
@pytest.mark.parametrize("model_builder", ["model_builder_model_schema_builder"], indirect=True)
def test_non_text_generation_model_multi_GPU(
    sagemaker_session, model_builder, model_input, **kwargs
):
    """Build, deploy and invoke a non text-generation model on a GPU instance.

    The test deploys to a SageMaker endpoint on ``kwargs["instance_type"]``,
    checks that a prediction is returned and that the endpoint config uses the
    requested instance type. Model and endpoint resources are always cleaned
    up, and any error raised along the way fails the test afterwards.

    NOTE(review): this uses the same fixture and model as the single GPU test
    in this module; confirm whether a larger, multi-GPU model was intended.
    """
    iam_client = sagemaker_session.boto_session.client("iam")
    role_arn = iam_client.get_role(RoleName="SageMakerRole")["Role"]["Arn"]
    caught_ex = None
    model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
    with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
        try:
            logger.info("Running in SAGEMAKER_ENDPOINT mode")
            predictor = model.deploy(
                mode=Mode.SAGEMAKER_ENDPOINT,
                instance_type=kwargs["instance_type"],
                initial_instance_count=1,
            )
            logger.info("Endpoint successfully deployed.")
            prediction = predictor.predict(model_input)
            assert prediction is not None

            endpoint_name = predictor.endpoint_name
            sagemaker_client = sagemaker_session.boto_session.client("sagemaker")
            endpoint_config_name = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)[
                "EndpointConfigName"
            ]
            actual_instance_type = sagemaker_client.describe_endpoint_config(
                EndpointConfigName=endpoint_config_name
            )["ProductionVariants"][0]["InstanceType"]
            assert kwargs["instance_type"] == actual_instance_type
        except Exception as e:
            # Defer the failure so the cleanup below always runs first.
            caught_ex = e
        finally:
            cleanup_model_resources(
                sagemaker_session=model_builder.sagemaker_session,
                model_name=model.name,
                endpoint_name=model.endpoint_name,
            )
        if caught_ex:
            # No exception is active at this point, so logger.exception() would
            # log an empty traceback; pass the saved exception explicitly.
            logger.error("Model builder multi GPU test failed", exc_info=caught_ex)
            raise AssertionError(
                f"Exception {caught_ex} was thrown when running model builder multi GPU test"
            ) from caught_ex
20 changes: 13 additions & 7 deletions tests/integ/sagemaker/serve/test_serve_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@
import pytest
from sagemaker.serve.builder.schema_builder import SchemaBuilder
from sagemaker.serve.builder.model_builder import ModelBuilder, Mode

import tests.integ
from tests.integ.sagemaker.serve.constants import (
HF_DIR,
PYTHON_VERSION_IS_NOT_310,
SERVE_SAGEMAKER_ENDPOINT_TIMEOUT,
)

from tests.integ.timeout import timeout
from tests.integ.utils import cleanup_model_resources
from tests.integ.utils import cleanup_model_resources, gpu_list, retry_with_instance_list
import logging

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -67,7 +67,7 @@


@pytest.fixture
def input():
def model_input():
return {"inputs": "The man worked as a [MASK]."}


Expand All @@ -87,11 +87,14 @@ def model_builder(request):

@pytest.mark.skipif(
PYTHON_VERSION_IS_NOT_310,
reason="Testing feature",
tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
reason="no ml.p2 or ml.p3 instances in this region",
)
@retry_with_instance_list(gpu_list(tests.integ.test_region()))
@pytest.mark.parametrize("model_builder", ["model_builder_model_schema_builder"], indirect=True)
def test_pytorch_transformers_sagemaker_endpoint(
sagemaker_session, model_builder, gpu_instance_type, input
sagemaker_session, model_builder, model_input, **kwargs
):
logger.info("Running in SAGEMAKER_ENDPOINT mode...")
caught_ex = None
Expand All @@ -106,9 +109,12 @@ def test_pytorch_transformers_sagemaker_endpoint(
with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
try:
logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...")
predictor = model.deploy(instance_type=gpu_instance_type, initial_instance_count=1)
predictor = model.deploy(
instance_type=kwargs["instance_type"], initial_instance_count=2
)
logger.info("Endpoint successfully deployed.")
predictor.predict(input)
predictor.predict(model_input)
assert predictor is not None
except Exception as e:
caught_ex = e
finally:
Expand Down
Loading

0 comments on commit 8d3e60c

Please sign in to comment.