Bump kv cache min memory for batch jobs (#536)
* bump kv cache min for batch jobs

* Add test for batch job

* Bump multiplier to 18 to get batch job to use 4 GPU
dmchoiboi authored Jun 10, 2024
1 parent f84adbb commit 6447c5f
Showing 2 changed files with 61 additions and 2 deletions.
@@ -2236,13 +2236,15 @@ def _infer_hardware(
     llm_artifact_gateway: LLMArtifactGateway,
     model_name: str,
     checkpoint_path: str,
+    is_batch_job: bool = False,
 ) -> CreateDockerImageBatchJobResourceRequests:
     config = llm_artifact_gateway.get_model_config(checkpoint_path)
 
     dtype_size = 2
+    kv_multiplier = 20 if is_batch_job else 2
 
     min_kv_cache_size = (
-        2
+        kv_multiplier
         * dtype_size
         * config["num_hidden_layers"]
         * config["hidden_size"]
@@ -2267,7 +2269,7 @@ def _infer_hardware(
     min_memory_gb = math.ceil((min_kv_cache_size + model_weights_size) / 1_000_000_000 / 0.9)
 
     logger.info(
-        f"Memory calculation result: {min_memory_gb=} for {model_name}, min_kv_cache_size: {min_kv_cache_size}, model_weights_size: {model_weights_size}"
+        f"Memory calculation result: {min_memory_gb=} for {model_name}, min_kv_cache_size: {min_kv_cache_size}, model_weights_size: {model_weights_size}, is_batch_job: {is_batch_job}"
     )
 
     if min_memory_gb <= 24:
@@ -2408,6 +2410,7 @@ async def execute(
             self.llm_artifact_gateway,
             request.model_config.model,
             request.model_config.checkpoint_path,
+            is_batch_job=True,
         )
         # Reconcile gpus count with num_shards from request
         assert hardware.gpus is not None
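For context on why the larger multiplier pushes batch jobs onto more hardware: the hunks above scale the minimum KV-cache reservation by kv_multiplier before the total memory requirement is converted to GB, padded with 10% headroom, and rounded up. Below is a minimal arithmetic sketch of that effect; it is not part of this diff, the factors after config["hidden_size"] are truncated in the hunk, and the max_position_embeddings factor plus the example config values are assumptions made purely for illustration.

import math

def estimate_min_memory_gb(config, model_weights_bytes, is_batch_job=False, dtype_size=2):
    # Sketch mirroring the shape of the formula in the hunk above. The factors
    # after config["hidden_size"] are truncated in this diff; a max sequence
    # length factor is assumed here only to make the arithmetic concrete.
    kv_multiplier = 20 if is_batch_job else 2
    min_kv_cache_size = (
        kv_multiplier
        * dtype_size
        * config["num_hidden_layers"]
        * config["hidden_size"]
        * config["max_position_embeddings"]  # assumed truncated factor
    )
    # Same rounding step as in the diff: bytes -> GB with 10% headroom, rounded up.
    return math.ceil((min_kv_cache_size + model_weights_bytes) / 1_000_000_000 / 0.9)

# Hypothetical config and weight size, chosen only to make the numbers concrete;
# they do not correspond to any fixture in the test file below.
config = {"num_hidden_layers": 80, "hidden_size": 8192, "max_position_embeddings": 4096}
weights_bytes = 140_000_000_000  # ~70B parameters at 2 bytes each

print(estimate_min_memory_gb(config, weights_bytes))                     # ~168 GB under these assumptions
print(estimate_min_memory_gb(config, weights_bytes, is_batch_job=True))  # ~275 GB under these assumptions

Under these assumptions the batch-job estimate lands well above the online-serving estimate, crossing into the memory tiers that map to larger GPU counts, which is consistent with the intent stated in the commit message.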
model-engine/tests/unit/domain/test_llm_use_cases.py (56 additions, 0 deletions)
@@ -1861,6 +1861,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "160Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "mixtral-8x7b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "160Gi"
+    assert hardware.storage == "160Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["MixtralForCausalLM"],
         "attention_dropout": 0.0,
@@ -1892,6 +1899,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "460Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "mixtral-8x22b", "", is_batch_job=True)
+    assert hardware.cpus == "80"
+    assert hardware.gpus == 8
+    assert hardware.memory == "800Gi"
+    assert hardware.storage == "460Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     fake_llm_artifact_gateway.model_config = {
         "_name_or_path": "meta-llama/Llama-2-7b-hf",
         "architectures": ["LlamaForCausalLM"],
@@ -1919,6 +1933,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "80Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-2-7b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "48Gi"
+    assert hardware.storage == "80Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
+
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["LlamaForCausalLM"],
         "attention_dropout": 0.0,
@@ -1947,6 +1968,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "80Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-3-8b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "48Gi"
+    assert hardware.storage == "80Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
+
     fake_llm_artifact_gateway.model_config = {
         "_name_or_path": "meta-llama/Llama-2-13b-hf",
         "architectures": ["LlamaForCausalLM"],
@@ -1974,6 +2002,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "80Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-2-13b", "", is_batch_job=True)
+    assert hardware.cpus == "40"
+    assert hardware.gpus == 4
+    assert hardware.memory == "96Gi"
+    assert hardware.storage == "96Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
+
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["LlamaForCausalLM"],
         "bos_token_id": 1,
@@ -2001,6 +2036,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "96Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "codellama-34b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "160Gi"
+    assert hardware.storage == "160Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     fake_llm_artifact_gateway.model_config = {
         "_name_or_path": "meta-llama/Llama-2-70b-hf",
         "architectures": ["LlamaForCausalLM"],
@@ -2028,6 +2070,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "160Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-2-70b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "160Gi"
+    assert hardware.storage == "160Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["LlamaForCausalLM"],
         "attention_dropout": 0.0,
@@ -2056,6 +2105,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "160Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-3-70b", "", is_batch_job=True)
+    assert hardware.cpus == "40"
+    assert hardware.gpus == 4
+    assert hardware.memory == "320Gi"
+    assert hardware.storage == "320Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     # (TODO) figure out how to calculate memory for llama-3-8b-instruct-262k
     # fake_llm_artifact_gateway.model_config = {
     #     "_name_or_path": "gradientai/llama3-8b-stage65k-chat",
