From 877221587bf589e2e42bc6798a83e6239775986e Mon Sep 17 00:00:00 2001
From: Samrudhi Sharma
Date: Mon, 4 Mar 2024 16:36:18 -0800
Subject: [PATCH] Setup dependencies

---
 doc/requirements.txt                          |   1 +
 .../extras/huggingface_requirements.txt       |   1 +
 requirements/extras/test_requirements.txt     |   1 +
 setup.py                                      |   1 +
 src/sagemaker/serve/builder/model_builder.py  |   2 +-
 src/sagemaker/serve/utils/estimate_parser.py  | 613 ------------------
 6 files changed, 5 insertions(+), 614 deletions(-)
 create mode 100644 requirements/extras/huggingface_requirements.txt
 delete mode 100644 src/sagemaker/serve/utils/estimate_parser.py

diff --git a/doc/requirements.txt b/doc/requirements.txt
index 3d5618ce32..a65e0e4050 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -4,3 +4,4 @@ docutils==0.15.2
 packaging==20.9
 jinja2==3.1.3
 schema==0.7.5
+accelerate>=0.24.1,<=0.27.0
diff --git a/requirements/extras/huggingface_requirements.txt b/requirements/extras/huggingface_requirements.txt
new file mode 100644
index 0000000000..31c6e65899
--- /dev/null
+++ b/requirements/extras/huggingface_requirements.txt
@@ -0,0 +1 @@
+accelerate>=0.24.1,<=0.27.0
diff --git a/requirements/extras/test_requirements.txt b/requirements/extras/test_requirements.txt
index ba7d8c3849..0ccce9cb7a 100644
--- a/requirements/extras/test_requirements.txt
+++ b/requirements/extras/test_requirements.txt
@@ -39,3 +39,4 @@ tritonclient[http]<2.37.0
 onnx==1.14.1
 # tf2onnx==1.15.1
 nbformat>=5.9,<6
+accelerate>=0.24.1,<=0.27.0
diff --git a/setup.py b/setup.py
index b1070319d3..5b8845efed 100644
--- a/setup.py
+++ b/setup.py
@@ -79,6 +79,7 @@ def read_requirements(filename):
     "feature-processor": read_requirements(
         "requirements/extras/feature-processor_requirements.txt"
     ),
+    "huggingface": read_requirements("requirements/extras/huggingface_requirements.txt"),
 }
 # Meta dependency groups
 extras["all"] = [item for group in extras.values() for item in group]
diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py
index a07ae0eedf..c66057397f 100644
--- a/src/sagemaker/serve/builder/model_builder.py
+++ b/src/sagemaker/serve/builder/model_builder.py
@@ -20,6 +20,7 @@
 
 from pathlib import Path
 
+from accelerate.commands.estimate import estimate_command_parser, gather_data
 from sagemaker import Session
 from sagemaker.model import Model
 from sagemaker.base_predictor import PredictorBase
@@ -40,7 +41,6 @@
 from sagemaker.serve.save_retrive.version_1_0_0.metadata.metadata import Metadata
 from sagemaker.serve.spec.inference_spec import InferenceSpec
 from sagemaker.serve.utils import task
-from sagemaker.serve.utils.estimate_parser import estimate_command_parser, gather_data
 from sagemaker.serve.utils.exceptions import TaskNotFoundException
 from sagemaker.serve.utils.predictors import _get_local_mode_predictor
 from sagemaker.serve.utils.hardware_detector import _get_gpu_info, _get_gpu_info_fallback
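After this change, `ModelBuilder` pulls its memory-estimation helpers from `accelerate` (pinned above and exposed through the new `huggingface` extra, e.g. `pip install "sagemaker[huggingface]"`) instead of the vendored module deleted below. A minimal sketch of how the two imported helpers fit together, assuming `accelerate>=0.24.1` is installed; the model ID is illustrative only and the call needs network access to the Hugging Face Hub:

```py
from accelerate.commands.estimate import estimate_command_parser, gather_data

# Build the same argparse parser that backs `accelerate estimate-memory`,
# then size an arbitrary Hub model at a couple of dtypes.
parser = estimate_command_parser()
args = parser.parse_args(["bert-base-cased", "--dtypes", "float32", "float16"])

# Each row is [dtype, largest_layer_bytes, total_size_bytes, training_size_bytes].
for dtype, largest_layer, total_size, training_size in gather_data(args):
    print(dtype, largest_layer, total_size, training_size)
```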
diff --git a/src/sagemaker/serve/utils/estimate_parser.py b/src/sagemaker/serve/utils/estimate_parser.py
deleted file mode 100644
index 7cd4cd0d4d..0000000000
--- a/src/sagemaker/serve/utils/estimate_parser.py
+++ /dev/null
@@ -1,613 +0,0 @@
-# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"). You
-# may not use this file except in compliance with the License. A copy of
-# the License is located at
-#
-#     http://aws.amazon.com/apache2.0/
-#
-# or in the "license" file accompanying this file. This file is
-# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific
-# language governing permissions and limitations under the License.
-"""Utilities for estimating model size"""
-from __future__ import absolute_import
-
-import os
-import argparse
-import enum
-import re
-import logging
-from collections import defaultdict
-from contextlib import contextmanager
-from typing import Dict, List, Optional, Tuple, Union
-
-logger = logging.getLogger(__name__)
-
-
-class CustomDtype(enum.Enum):
-    """An enum that contains multiple custom dtypes that can be used for `infer_auto_device_map`."""
-
-    FP8 = "fp8"
-    INT4 = "int4"
-    INT2 = "int2"
-
-
-def import_torch_nn():
-    """Import torch.nn"""
-    try:
-        import torch.nn
-        return torch.nn
-    except ImportError:
-        raise Exception("Unable to import torch.nn, install dependency")
-
-
-def import_torch():
-    """Import torch"""
-    try:
-        import torch
-        return torch
-    except ImportError:
-        raise Exception("Unable to import torch, install dependency")
-
-
-def import_Auto_Config():
-    """Import transformers"""
-    try:
-        from transformers import AutoConfig
-        return AutoConfig
-    except ImportError:
-        raise Exception("Unable to import transformers.AutoConfig, install Transformers dependency")
-
-
-def import_Auto_Model():
-    """Import transformers"""
-    try:
-        from transformers import AutoModel
-        return AutoModel
-    except ImportError:
-        raise Exception("Unable to import transformers.AutoModel, install Transformers dependency")
-
-
-def import_model_info():
-    """Import model info from huggingface_hub"""
-    try:
-        from huggingface_hub import model_info
-
-        return model_info
-    except ImportError:
-        raise Exception("Unable to import model_info, check if huggingface_hub is installed")
-
-
-def get_max_layer_size(
-    modules: List[Tuple[str, import_torch_nn().Module]],
-    module_sizes: Dict[str, int],
-    no_split_module_classes: List[str],
-):
-    """Utility function
-
-    Function that will scan a list of named modules and return the maximum size used by
-    one full layer.
-
-    The definition of a layer being:
-    - a module with no direct children (just parameters and buffers)
-    - a module whose class name is in the list `no_split_module_classes`
-
-    Args:
-        modules (`List[Tuple[str, torch.nn.Module]]`):
-            The list of named modules where we want to determine the maximum layer size.
-        module_sizes (`Dict[str, int]`):
-            A dictionary mapping each layer name to its size
-            (as generated by `compute_module_sizes`).
-        no_split_module_classes (`List[str]`):
-            A list of class names for layers we don't want to be split.
-
-    Returns:
-        `Tuple[int, List[str]]`: The maximum size of a layer with the list of layer names
-        realizing that maximum size.
-    """
-    max_size = 0
-    layer_names = []
-    modules_to_treat = modules.copy()
-    while len(modules_to_treat) > 0:
-        module_name, module = modules_to_treat.pop(0)
-        modules_children = (
-            list(module.named_children()) if isinstance(module, import_torch_nn().Module) else []
-        )
-        if len(modules_children) == 0 or module.__class__.__name__ in no_split_module_classes:
-            # No splitting this one so we compare to the max_size
-            size = module_sizes[module_name]
-            if size > max_size:
-                max_size = size
-                layer_names = [module_name]
-            elif size == max_size:
-                layer_names.append(module_name)
-        else:
-            modules_to_treat = [
-                (f"{module_name}.{n}", v) for n, v in modules_children
-            ] + modules_to_treat
-    return max_size, layer_names
-
-
-def _get_proper_dtype(dtype: Union[str, import_torch().device]) -> import_torch().dtype:
-    """Just does torch.dtype(dtype) if necessary."""
-    if isinstance(dtype, str):
-        # We accept "torch.float16" or just "float16"
-        dtype = dtype.replace("torch.", "")
-        dtype = getattr(import_torch(), dtype)
-    return dtype
-
-
-def dtype_byte_size(dtype: import_torch().dtype):
-    """Returns the size (in bytes) occupied by one parameter of type `dtype`.
-
-    Example:
-    ```py
-    >>> dtype_byte_size(torch.float32)
-    4
-    ```
-    """
-    if dtype == import_torch().bool:  # pylint: disable=R1705
-        return 1 / 8
-    elif dtype == CustomDtype.INT2:
-        return 1 / 4
-    elif dtype == CustomDtype.INT4:
-        return 1 / 2
-    elif dtype == CustomDtype.FP8:
-        return 1
-    bit_search = re.search(r"[^\d](\d+)$", str(dtype))
-    if bit_search is None:
-        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
-    bit_size = int(bit_search.groups()[0])
-    return bit_size // 8
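For reference, the per-parameter byte counts returned above are what every size estimate in this module is built from; a quick worked check (plain arithmetic, no SageMaker-specific assumptions, model size chosen for illustration):

```py
# Bytes per parameter as dtype_byte_size reports them:
#   float32 -> 4, float16 -> 2, int8/fp8 -> 1, int4 -> 0.5, bool -> 0.125
params = 7_000_000_000  # an assumed 7B-parameter model
print(params * 4 / 1024**3)    # ~26.08 GB at float32
print(params * 2 / 1024**3)    # ~13.04 GB at float16
print(params * 0.5 / 1024**3)  # ~3.26 GB at int4
```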
-
-
-def named_module_tensors(
-    module: import_torch_nn().Module,
-    include_buffers: bool = True,
-    recurse: bool = False,
-    remove_non_persistent: bool = False,
-):
-    """A helper function that gathers all the tensors (parameters + buffers) of a given module.
-
-    If `include_buffers=True`
-    it's the same as doing `module.named_parameters(recurse=recurse)
-    + module.named_buffers(recurse=recurse)`.
-
-    Args:
-        module (`torch.nn.Module`):
-            The module we want the tensors on.
-        include_buffer (`bool`, *optional*, defaults to `True`):
-            Whether or not to include the buffers in the result.
-        recurse (`bool`, *optional`, defaults to `False`):
-            Whether or not to go look in every submodule or
-            just return the direct parameters and buffers.
-        remove_non_persistent (`bool`, *optional*, defaults to `False`):
-            Whether or not to remove the non persistent buffer from the buffers.
-            Useful only when include_buffers = True
-    """
-    yield from module.named_parameters(recurse=recurse)
-
-    if include_buffers:
-        non_persistent_buffers = set()
-        if remove_non_persistent:
-            non_persistent_buffers = get_non_persistent_buffers(module, recurse=recurse)
-        for named_buffer in module.named_buffers(recurse=recurse):
-            name, _ = named_buffer
-            if name not in non_persistent_buffers:
-                yield named_buffer
-
-
-def get_non_persistent_buffers(module: import_torch_nn().Module, recurse: bool = False):
-    """Gather all non persistent buffers of a given modules into a set
-
-    Args:
-        module (`nn.Module`):
-            The module we want the non persistent buffers on.
-        recurse (`bool`, *optional*, defaults to `False`):
-            Whether or not to go look in every submodule or
-            just return the direct non persistent buffers.
-    """
-
-    non_persistent_buffers_set = module._non_persistent_buffers_set
-    if recurse:
-        for _, m in module.named_modules():
-            non_persistent_buffers_set |= m._non_persistent_buffers_set
-
-    return non_persistent_buffers_set
-
-
-def compute_module_sizes(
-    model: import_torch_nn().Module,
-    dtype: Optional[Union[str, import_torch().device]] = None,
-    special_dtypes: Optional[Dict[str, Union[str, import_torch().device]]] = None,
-):
-    """Compute the size of each submodule of a given model."""
-    if dtype is not None:
-        dtype = _get_proper_dtype(dtype)
-        dtype_size = dtype_byte_size(dtype)
-    if special_dtypes is not None:
-        special_dtypes = {key: _get_proper_dtype(dtyp) for key, dtyp in special_dtypes.items()}
-        special_dtypes_size = {key: dtype_byte_size(dtyp) for key, dtyp in special_dtypes.items()}
-    module_sizes = defaultdict(int)
-    for name, tensor in named_module_tensors(model, recurse=True):
-        if special_dtypes is not None and name in special_dtypes:
-            size = tensor.numel() * special_dtypes_size[name]
-        elif dtype is None:
-            size = tensor.numel() * dtype_byte_size(tensor.dtype)
-        elif str(tensor.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
-            # According to the code in set_module_tensor_to_device, these types won't be converted
-            # so use their original size here
-            size = tensor.numel() * dtype_byte_size(tensor.dtype)
-        else:
-            size = tensor.numel() * min(dtype_size, dtype_byte_size(tensor.dtype))
-        name_parts = name.split(".")
-        for idx in range(len(name_parts) + 1):
-            module_sizes[".".join(name_parts[:idx])] += size
-
-    return module_sizes
-
-
-def calculate_maximum_sizes(model: import_torch_nn().Module):
-    """Computes the total size of the model and its largest layer"""
-    sizes = compute_module_sizes(model)
-    # `transformers` models store this information for us
-    no_split_modules = getattr(model, "_no_split_modules", None)
-    if no_split_modules is None:
-        no_split_modules = []
-
-    modules_to_treat = (
-        list(model.named_parameters(recurse=False))
-        + list(model.named_children())
-        + list(model.named_buffers(recurse=False))
-    )
-    largest_layer = get_max_layer_size(modules_to_treat, sizes, no_split_modules)
-    total_size = sizes[""]
-    return total_size, largest_layer
-
-
-def convert_bytes(size):
-    """Converts `size` from bytes to the largest possible unit"""
-    for x in ["bytes", "KB", "MB", "GB", "TB"]:
-        if size < 1024.0:
-            return f"{round(size, 2)} {x}"
-        size /= 1024.0
-
-    return f"{round(size, 2)} PB"
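The bookkeeping in `compute_module_sizes`/`calculate_maximum_sizes` boils down to summing `numel() * bytes-per-element` over every parameter and buffer; a torch-only sketch on a hypothetical toy module (not part of this patch) that reproduces the kind of total `convert_bytes` then pretty-prints:

```py
import torch.nn as nn

# Toy model: two Linear layers; sum parameter and buffer sizes the same way
# compute_module_sizes does (numel * element_size).
model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024))
total = sum(t.numel() * t.element_size() for t in model.parameters())
total += sum(b.numel() * b.element_size() for b in model.buffers())
print(total)            # 33,574,912 bytes (8,393,728 float32 parameters x 4)
print(total / 1024**2)  # ~32.02, i.e. what convert_bytes would report as "32.02 MB"
```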
-
-
-def verify_on_hub(repo: str, token: str = None):
-    """Verifies that the model is on the hub and returns the model info."""
-    model_info = import_model_info()
-    try:
-        return model_info(repo, token=token)
-    except ValueError:
-        return "gated/repo"
-
-
-@contextmanager
-def create_empty_model(
-    model_name: str,
-    library_name: str,
-    trust_remote_code: bool = False,
-    access_token: str = None,
-):
-    """Creates an empty model from its parent library on the `Hub` to calculate memory consumption.
-
-    Args:
-        model_name (`str`):
-            The model name on the Hub
-        library_name (`str`):
-            The library the model has an integration with, such as `transformers`.
-            Will be used if `model_name` has no metadata on the Hub to determine the library.
-        trust_remote_code (`bool`, `optional`, defaults to `False`):
-            Whether or not to allow for custom models defined on the Hub
-            in their own modeling files. This option should only be set to `True` for
-            repositories you trust and in which you have read the code, as it will
-            execute code present on the Hub on your local machine.
-        access_token (`str`, `optional`, defaults to `None`):
-            The access token to use to access private or gated models on the Hub.
-
-    Returns:
-        `torch.nn.Module`: The torch model that has been initialized on the `meta` device.
-
-    """
-    model_info = verify_on_hub(model_name, access_token)  # pylint: disable=W0621
-    # Simplified errors
-    if model_info == "gated":  # pylint: disable=R1720
-        raise ValueError(
-            f"Repo for model `{model_name}` is gated. "
-            f"You must be authenticated to access it. Please run `huggingface-cli login`."
-        )
-    elif model_info == "repo":
-        raise ValueError(
-            f"Repo for model `{model_name}` does not exist on the Hub."
-            f" If you are trying to access a private repo,"
-            "make sure you are authenticated via `huggingface-cli login` and have access."
-        )
-    if library_name is None:
-        library_name = getattr(model_info, "library_name", False)
-        if not library_name:
-            raise ValueError(
-                f"Model `{model_name}` does not have any library metadata on the Hub, "
-                f"please manually pass"
-                f" in a `--library_name` to use (such as `transformers`)"
-            )
-    if library_name == "transformers":
-        print(f"Loading pretrained config for `{model_name}` from `transformers`...")
-        if model_info.config is None:
-            raise RuntimeError(
-                f"Tried to load `{model_name}` with `transformers` but "
-                f"it does not have any metadata."
-            )
-
-        auto_map = model_info.config.get("auto_map", False)
-        config = import_Auto_Config().from_pretrained(
-            model_name, trust_remote_code=trust_remote_code, token=access_token
-        )
-
-        with init_empty_weights():
-            # remote code could specify a specific `AutoModel` class in the `auto_map`
-            constructor = import_Auto_Model()
-            if isinstance(auto_map, dict):
-                value = None
-                for key in auto_map.keys():
-                    if key.startswith("AutoModelFor"):
-                        value = key
-                        break
-                if value is not None:
-                    try:
-                        import transformers
-                        constructor = getattr(transformers, value)
-                    except ImportError:
-                        raise Exception("Unable to import transformers, install dependency")
-            model = constructor.from_config(config, trust_remote_code=trust_remote_code)
-    else:
-        raise ValueError(
-            f"Library `{library_name}` is not supported yet, "
-            f"please open an issue on GitHub for us to add support."
-        )
-    return model
-
-
-@contextmanager
-def init_empty_weights(include_buffers: bool = None):
-    """Context manager
-
-    A context manager under which models are initialized with all parameters on the meta device,
-    therefore creating an
-    empty model. Useful when just initializing the model would blow the available RAM.
-    """
-    if include_buffers is None:
-        include_buffers = parse_flag_from_env("ACCELERATE_INIT_INCLUDE_BUFFERS", False)
-    with init_on_device(  # pylint: disable=E1129
-        import_torch().device("meta"), include_buffers=include_buffers
-    ) as f:
-        yield f
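`init_empty_weights` here mirrors the context manager that `accelerate` exposes publicly; a small sketch of the effect using the upstream version (layer size is arbitrary), which is how a multi-GB model can be sized without allocating real memory:

```py
import torch
from accelerate import init_empty_weights

# Parameters created inside the context land on the "meta" device, so this
# ~4 GiB (at float32) layer allocates no real memory on the host.
with init_empty_weights():
    big = torch.nn.Linear(32_768, 32_768)

print(big.weight.device)                                         # meta
print(big.weight.numel() * big.weight.element_size() / 1024**3)  # 4.0
```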
-
-
-def parse_flag_from_env(key, default=False):
-    """Returns truthy value for `key` from the env if available else the default."""
-    value = os.environ.get(key, str(default))
-    return str_to_bool(value) == 1  # As its name indicates `str_to_bool` actually returns an int...
-
-
-def str_to_bool(value) -> int:
-    """Converts a string representation of truth to `True` (1) or `False` (0).
-
-    True values are `y`, `yes`, `t`, `true`, `on`, and `1`;
-    False value are `n`, `no`, `f`, `false`, `off`, and `0`;
-    """
-    value = value.lower()
-    if value in ("y", "yes", "t", "true", "on", "1"):  # pylint: disable=R1705
-        return 1
-    elif value in ("n", "no", "f", "false", "off", "0"):
-        return 0
-    else:
-        raise ValueError(f"invalid truth value {value}")
-
-
-@contextmanager
-def init_on_device(device: import_torch().device, include_buffers: bool = None):
-    """A context manager under which models are initialized"""
-    if include_buffers is None:
-        include_buffers = parse_flag_from_env("ACCELERATE_INIT_INCLUDE_BUFFERS", False)
-
-    old_register_parameter = import_torch_nn().Module.register_parameter
-    if include_buffers:
-        old_register_buffer = import_torch_nn().Module.register_buffer
-
-    def register_empty_parameter(module, name, param):
-        """Doctype: register_empty_parameter"""
-        old_register_parameter(module, name, param)
-        if param is not None:
-            param_cls = type(module._parameters[name])
-            kwargs = module._parameters[name].__dict__
-            kwargs["requires_grad"] = param.requires_grad
-            module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
-
-    def register_empty_buffer(module, name, buffer, persistent=True):
-        """Doctype: register_empty_buffer"""
-        old_register_buffer(module, name, buffer, persistent=persistent)
-        if buffer is not None:
-            module._buffers[name] = module._buffers[name].to(device)
-
-    # Patch tensor creation
-    if include_buffers:
-        tensor_constructors_to_patch = {
-            torch_function_name: getattr(import_torch(), torch_function_name)
-            for torch_function_name in ["empty", "zeros", "ones", "full"]
-        }
-    else:
-        tensor_constructors_to_patch = {}
-
-    def patch_tensor_constructor(fn):
-        """Doctype: patch_tensor_constructor"""
-
-        def wrapper(*args, **kwargs):
-            kwargs["device"] = device
-            return fn(*args, **kwargs)
-
-        return wrapper
-
-    try:
-        import_torch_nn().Module.register_parameter = register_empty_parameter
-        if include_buffers:
-            import_torch_nn().Module.register_buffer = register_empty_buffer
-        for torch_function_name in tensor_constructors_to_patch.keys():
-            setattr(
-                import_torch(),
-                torch_function_name,
-                patch_tensor_constructor(getattr(import_torch(), torch_function_name)),
-            )
-        yield
-    finally:
-        import_torch_nn().Module.register_parameter = old_register_parameter
-        if include_buffers:
-            import_torch_nn().Module.register_buffer = old_register_buffer
-        for (
-            torch_function_name,
-            old_torch_function,
-        ) in tensor_constructors_to_patch.items():
-            setattr(import_torch(), torch_function_name, old_torch_function)
-
-
-def create_ascii_table(headers: list, rows: list, title: str):
-    """Creates a pretty table from a list of rows, minimal version of `tabulate`."""
-    sep_char, in_between = "│", "─"
-    column_widths = []
-    for i in range(len(headers)):  # pylint: disable=C0200
-        column_values = [row[i] for row in rows] + [headers[i]]
-        max_column_width = max(len(value) for value in column_values)  # pylint: disable=C0200
-        column_widths.append(max_column_width)
-
-    formats = [f"%{column_widths[i]}s" for i in range(len(rows[0]))]
-
-    pattern = f"{sep_char}{sep_char.join(formats)}{sep_char}"
-    diff = 0
-
-    def make_row(left_char, middle_char, right_char):
-        return (
-            f"{left_char}{middle_char.join([in_between * n for n in column_widths])}"
-            f"{in_between * diff}{right_char}"
-        )
-
-    separator = make_row("├", "┼", "┤")
-    if len(title) > sum(column_widths):
-        diff = abs(len(title) - len(separator))
-        column_widths[-1] += diff
-
-    # Update with diff
-    separator = make_row("├", "┼", "┤")
-    initial_rows = [
-        make_row("┌", in_between, "┐"),
-        f"{sep_char}{title.center(len(separator) - 2)}{sep_char}",
-        make_row("├", "┬", "┤"),
-    ]
-    table = "\n".join(initial_rows) + "\n"
-    column_widths[-1] += diff
-    centered_line = [text.center(column_widths[i]) for i, text in enumerate(headers)]
-    table += f"{pattern % tuple(centered_line)}\n{separator}\n"
-    for i, line in enumerate(rows):
-        centered_line = [t.center(column_widths[i]) for i, t in enumerate(line)]
-        table += f"{pattern % tuple(centered_line)}\n"
-    table += f'└{"┴".join([in_between * n for n in column_widths])}┘'
-
-    return table
-
-
-def estimate_command_parser(subparsers=None):
-    """Doctype: estimate_command_parser"""
-    if subparsers is not None:
-        parser = subparsers.add_parser("estimate-memory")
-    else:
-        parser = argparse.ArgumentParser(
-            description="Model size estimator for fitting a model onto CUDA memory."
-        )
-
-    parser.add_argument("model_name", type=str, help="The model name on the Hugging Face Hub.")
-    parser.add_argument(
-        "--library_name",
-        type=str,
-        help="The library the model has an integration with, such as `transformers`, "
-        "needed only if this information is not stored on the Hub.",
-        choices=["timm", "transformers"],
-    )
-    parser.add_argument(
-        "--dtypes",
-        type=str,
-        nargs="+",
-        default=["float32", "float16", "int8", "int4"],
-        help="The dtypes to use for the model, must be one (or many) of "
-        "`float32`, `float16`, `int8`, and `int4`",
-        choices=["float32", "float16", "int8", "int4"],
-    )
-    parser.add_argument(
-        "--trust_remote_code",
-        action="store_true",
-        help="""Whether or not to allow for custom models defined on the Hub in their own modeling
-        files. This flag should only be used for repositories you trust and in which you have
-        read the code, as it will execute code present on the Hub on your local machine.""",
-    )
-
-    if subparsers is not None:
-        parser.set_defaults(func=estimate_command)
-    return parser
-
-
-def gather_data(args):
-    """Creates an empty model and gathers the data for the sizes"""
-    try:
-        model = create_empty_model(
-            args.model_name,
-            library_name=args.library_name,
-            trust_remote_code=args.trust_remote_code,
-        )
-    except (RuntimeError, OSError) as e:
-        raise e
-
-    total_size, largest_layer = calculate_maximum_sizes(model)
-
-    data = []
-
-    for dtype in args.dtypes:
-        dtype_total_size = total_size
-        dtype_largest_layer = largest_layer[0]
-        if dtype == "float16":
-            dtype_total_size /= 2
-            dtype_largest_layer /= 2
-        elif dtype == "int8":
-            dtype_total_size /= 4
-            dtype_largest_layer /= 4
-        elif dtype == "int4":
-            dtype_total_size /= 8
-            dtype_largest_layer /= 8
-        dtype_training_size = dtype_total_size * 4
-        data.append([dtype, dtype_largest_layer, dtype_total_size, dtype_training_size])
-    return data
-
-
-def estimate_command(args):
-    """Doctype: estimate_command"""
-    data = gather_data(args)
-    for row in data:
-        for i, item in enumerate(row):
-            if isinstance(item, (int, float)):
-                row[i] = convert_bytes(item)
-
-    headers = ["dtype", "Largest Layer", "Total Size", "Training using Adam"]
-
-    title = f"Memory Usage for loading `{args.model_name}`"
-    table = create_ascii_table(headers, data, title)
-    print(table)
-
-
-def main():
-    """Doctype: main"""
-    parser = estimate_command_parser()
-    args = parser.parse_args()
-    estimate_command(args)
-
-
-if __name__ == "__main__":
-    main()
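For context on the numbers `gather_data` produces (the upstream `accelerate estimate-memory` CLI reports the same figures): each dtype row is derived from the float32 total, and the "Training using Adam" column is simply four times the inference footprint. A rough worked example for an assumed 7B-parameter model:

```py
total_fp32 = 7_000_000_000 * 4   # 28e9 bytes, ~26.08 GB at float32

print(total_fp32 / 2)   # float16: half of the float32 total
print(total_fp32 / 4)   # int8: a quarter
print(total_fp32 / 8)   # int4: an eighth
print(total_fp32 * 4)   # "Training using Adam": ~4x the inference size
```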