diff --git a/src/sagemaker/huggingface/llm_utils.py b/src/sagemaker/huggingface/llm_utils.py index 65befe41b0..1a2abfb2e4 100644 --- a/src/sagemaker/huggingface/llm_utils.py +++ b/src/sagemaker/huggingface/llm_utils.py @@ -57,6 +57,14 @@ def get_huggingface_llm_image_uri( version=version, image_scope="inference", ) + if backend == "huggingface-neuronx": + return image_uris.retrieve( + "huggingface-llm-neuronx", + region=region, + version=version, + image_scope="inference", + inference_tool="neuronx", + ) if backend == "lmi": version = version or "0.24.0" return image_uris.retrieve(framework="djl-deepspeed", region=region, version=version) diff --git a/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json b/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json new file mode 100644 index 0000000000..a13336fb79 --- /dev/null +++ b/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json @@ -0,0 +1,41 @@ +{ + "inference": { + "processors": [ + "inf2" + ], + "version_aliases": { + "0.0": "0.0.16" + }, + "versions": { + "0.0.16": { + "py_versions": [ + "py310" + ], + "registries": { + "ap-northeast-1": "763104351884", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-4": "457447274322", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-south-2": "503227376785", + "eu-west-1": "763104351884", + "eu-west-3": "763104351884", + "il-central-1": "780543022126", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "tag_prefix": "1.13.1-optimum0.0.16", + "repository": "huggingface-pytorch-tgi-inference", + "container_version": { + "inf2": "ubuntu22.04" + } + } + } + } +} diff --git a/src/sagemaker/image_uris.py b/src/sagemaker/image_uris.py index 56e4bf346f..efcdc68b22 100644 --- a/src/sagemaker/image_uris.py +++ b/src/sagemaker/image_uris.py @@ -37,6 +37,7 @@ ECR_URI_TEMPLATE = "{registry}.dkr.{hostname}/{repository}" HUGGING_FACE_FRAMEWORK = "huggingface" HUGGING_FACE_LLM_FRAMEWORK = "huggingface-llm" +HUGGING_FACE_LLM_NEURONX_FRAMEWORK = "huggingface-llm-neuronx" XGBOOST_FRAMEWORK = "xgboost" SKLEARN_FRAMEWORK = "sklearn" TRAINIUM_ALLOWED_FRAMEWORKS = "pytorch" @@ -470,6 +471,7 @@ def _validate_version_and_set_if_needed(version, config, framework): if version is None and framework in [ DATA_WRANGLER_FRAMEWORK, HUGGING_FACE_LLM_FRAMEWORK, + HUGGING_FACE_LLM_NEURONX_FRAMEWORK, STABILITYAI_FRAMEWORK, ]: version = _get_latest_versions(available_versions) diff --git a/tests/unit/sagemaker/image_uris/test_huggingface_llm.py b/tests/unit/sagemaker/image_uris/test_huggingface_llm.py index e4d7ab9947..b02fe36e99 100644 --- a/tests/unit/sagemaker/image_uris/test_huggingface_llm.py +++ b/tests/unit/sagemaker/image_uris/test_huggingface_llm.py @@ -19,29 +19,38 @@ LMI_VERSIONS = ["0.24.0"] HF_VERSIONS_MAPPING = { - "0.6.0": "2.0.0-tgi0.6.0-gpu-py39-cu118-ubuntu20.04", - "0.8.2": "2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04", - "0.9.3": "2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04", - "1.0.3": "2.0.1-tgi1.0.3-gpu-py39-cu118-ubuntu20.04", - "1.1.0": "2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04", - "1.2.0": "2.1.1-tgi1.2.0-gpu-py310-cu121-ubuntu20.04", - "1.3.1": "2.1.1-tgi1.3.1-gpu-py310-cu121-ubuntu20.04", - "1.3.3": "2.1.1-tgi1.3.3-gpu-py310-cu121-ubuntu20.04", + "gpu": { + "0.6.0": "2.0.0-tgi0.6.0-gpu-py39-cu118-ubuntu20.04", + "0.8.2": "2.0.0-tgi0.8.2-gpu-py39-cu118-ubuntu20.04", + "0.9.3": "2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04", + "1.0.3": "2.0.1-tgi1.0.3-gpu-py39-cu118-ubuntu20.04", + "1.1.0": "2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04", + "1.2.0": "2.1.1-tgi1.2.0-gpu-py310-cu121-ubuntu20.04", + "1.3.1": "2.1.1-tgi1.3.1-gpu-py310-cu121-ubuntu20.04", + "1.3.3": "2.1.1-tgi1.3.3-gpu-py310-cu121-ubuntu20.04", + }, + "inf2": { + "0.0.16": "1.13.1-optimum0.0.16-neuronx-py310-ubuntu22.04", + }, } -@pytest.mark.parametrize("load_config", ["huggingface-llm.json"], indirect=True) +@pytest.mark.parametrize( + "load_config", ["huggingface-llm.json", "huggingface-llm-neuronx.json"], indirect=True +) def test_huggingface_uris(load_config): VERSIONS = load_config["inference"]["versions"] + device = load_config["inference"]["processors"][0] + backend = "huggingface-neuronx" if device == "inf2" else "huggingface" for version in VERSIONS: ACCOUNTS = load_config["inference"]["versions"][version]["registries"] for region in ACCOUNTS.keys(): - uri = get_huggingface_llm_image_uri("huggingface", region=region, version=version) + uri = get_huggingface_llm_image_uri(backend, region=region, version=version) expected = expected_uris.huggingface_llm_framework_uri( "huggingface-pytorch-tgi-inference", ACCOUNTS[region], version, - HF_VERSIONS_MAPPING[version], + HF_VERSIONS_MAPPING[device][version], region=region, ) assert expected == uri