diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index b27fe1077..15bfaa697 100644
--- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -2236,13 +2236,15 @@ def _infer_hardware(
     llm_artifact_gateway: LLMArtifactGateway,
     model_name: str,
     checkpoint_path: str,
+    is_batch_job: bool = False,
 ) -> CreateDockerImageBatchJobResourceRequests:
     config = llm_artifact_gateway.get_model_config(checkpoint_path)
 
     dtype_size = 2
+    kv_multiplier = 20 if is_batch_job else 2
 
     min_kv_cache_size = (
-        2
+        kv_multiplier
         * dtype_size
         * config["num_hidden_layers"]
         * config["hidden_size"]
@@ -2267,7 +2269,7 @@ def _infer_hardware(
     min_memory_gb = math.ceil((min_kv_cache_size + model_weights_size) / 1_000_000_000 / 0.9)
 
     logger.info(
-        f"Memory calculation result: {min_memory_gb=} for {model_name}, min_kv_cache_size: {min_kv_cache_size}, model_weights_size: {model_weights_size}"
+        f"Memory calculation result: {min_memory_gb=} for {model_name}, min_kv_cache_size: {min_kv_cache_size}, model_weights_size: {model_weights_size}, is_batch_job: {is_batch_job}"
     )
 
     if min_memory_gb <= 24:
@@ -2408,6 +2410,7 @@ async def execute(
             self.llm_artifact_gateway,
             request.model_config.model,
             request.model_config.checkpoint_path,
+            is_batch_job=True,
         )
         # Reconcile gpus count with num_shards from request
         assert hardware.gpus is not None
diff --git a/model-engine/tests/unit/domain/test_llm_use_cases.py b/model-engine/tests/unit/domain/test_llm_use_cases.py
index 770f3bda1..4aa9e982b 100644
--- a/model-engine/tests/unit/domain/test_llm_use_cases.py
+++ b/model-engine/tests/unit/domain/test_llm_use_cases.py
@@ -1861,6 +1861,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "160Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "mixtral-8x7b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "160Gi"
+    assert hardware.storage == "160Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["MixtralForCausalLM"],
         "attention_dropout": 0.0,
@@ -1892,6 +1899,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "460Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "mixtral-8x22b", "", is_batch_job=True)
+    assert hardware.cpus == "80"
+    assert hardware.gpus == 8
+    assert hardware.memory == "800Gi"
+    assert hardware.storage == "460Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     fake_llm_artifact_gateway.model_config = {
         "_name_or_path": "meta-llama/Llama-2-7b-hf",
         "architectures": ["LlamaForCausalLM"],
@@ -1919,6 +1933,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "80Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-2-7b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "48Gi"
+    assert hardware.storage == "80Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
+
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["LlamaForCausalLM"],
         "attention_dropout": 0.0,
@@ -1947,6 +1968,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "80Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-3-8b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "48Gi"
+    assert hardware.storage == "80Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
+
     fake_llm_artifact_gateway.model_config = {
         "_name_or_path": "meta-llama/Llama-2-13b-hf",
         "architectures": ["LlamaForCausalLM"],
@@ -1974,6 +2002,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "80Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-2-13b", "", is_batch_job=True)
+    assert hardware.cpus == "40"
+    assert hardware.gpus == 4
+    assert hardware.memory == "96Gi"
+    assert hardware.storage == "96Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
+
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["LlamaForCausalLM"],
         "bos_token_id": 1,
@@ -2001,6 +2036,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "96Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "codellama-34b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "160Gi"
+    assert hardware.storage == "160Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     fake_llm_artifact_gateway.model_config = {
         "_name_or_path": "meta-llama/Llama-2-70b-hf",
         "architectures": ["LlamaForCausalLM"],
@@ -2028,6 +2070,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "160Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-2-70b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "160Gi"
+    assert hardware.storage == "160Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["LlamaForCausalLM"],
         "attention_dropout": 0.0,
@@ -2056,6 +2105,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "160Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-3-70b", "", is_batch_job=True)
+    assert hardware.cpus == "40"
+    assert hardware.gpus == 4
+    assert hardware.memory == "320Gi"
+    assert hardware.storage == "320Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     # (TODO) figure out how to calculate memory for llama-3-8b-instruct-262k
     # fake_llm_artifact_gateway.model_config = {
     #     "_name_or_path": "gradientai/llama3-8b-stage65k-chat",
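A minimal sketch of the sizing heuristic this change adjusts, assuming only what is visible in the hunks: the KV-cache term is scaled by 20 instead of 2 when `is_batch_job=True`, and the total is rounded up in GB with a 0.9 headroom factor. `estimate_min_memory_gb` and `remaining_kv_factors` are hypothetical stand-ins; the diff context cuts the real KV-cache expression off after `hidden_size`, so this is not the full formula from `_infer_hardware`.

```python
import math


def estimate_min_memory_gb(config: dict, model_weights_size: int, is_batch_job: bool = False) -> int:
    """Illustrative sketch of the memory heuristic touched by this diff."""
    dtype_size = 2  # bytes per element (fp16/bf16)
    kv_multiplier = 20 if is_batch_job else 2  # batch jobs reserve a 10x larger KV-cache budget

    # Hypothetical placeholder for the factors the diff context does not show.
    remaining_kv_factors = 1
    min_kv_cache_size = (
        kv_multiplier
        * dtype_size
        * config["num_hidden_layers"]
        * config["hidden_size"]
        * remaining_kv_factors
    )

    # Same rounding as the diff: bytes -> GB, divided by 0.9 for headroom.
    return math.ceil((min_kv_cache_size + model_weights_size) / 1_000_000_000 / 0.9)
```

The larger multiplier is why the batch-job cases in the updated test pin noticeably bigger allocations, e.g. llama-2-7b lands on 2 GPUs with 48Gi of memory when `is_batch_job=True`.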