Bump kv cache min memory for batch jobs (#536)
* bump kv cache min for batch jobs

* Add test for batch job

* Bump multiplier to 18 to get batch job to use 4 GPU
dmchoiboi authored Jun 10, 2024
1 parent f84adbb commit 6447c5f
Showing 2 changed files with 61 additions and 2 deletions.
@@ -2236,13 +2236,15 @@ def _infer_hardware(
     llm_artifact_gateway: LLMArtifactGateway,
     model_name: str,
     checkpoint_path: str,
+    is_batch_job: bool = False,
 ) -> CreateDockerImageBatchJobResourceRequests:
     config = llm_artifact_gateway.get_model_config(checkpoint_path)
 
     dtype_size = 2
+    kv_multiplier = 20 if is_batch_job else 2
 
     min_kv_cache_size = (
-        2
+        kv_multiplier
         * dtype_size
         * config["num_hidden_layers"]
         * config["hidden_size"]
@@ -2267,7 +2269,7 @@ def _infer_hardware(
     min_memory_gb = math.ceil((min_kv_cache_size + model_weights_size) / 1_000_000_000 / 0.9)
 
     logger.info(
-        f"Memory calculation result: {min_memory_gb=} for {model_name}, min_kv_cache_size: {min_kv_cache_size}, model_weights_size: {model_weights_size}"
+        f"Memory calculation result: {min_memory_gb=} for {model_name}, min_kv_cache_size: {min_kv_cache_size}, model_weights_size: {model_weights_size}, is_batch_job: {is_batch_job}"
     )
 
     if min_memory_gb <= 24:
@@ -2408,6 +2410,7 @@ async def execute(
             self.llm_artifact_gateway,
             request.model_config.model,
             request.model_config.checkpoint_path,
+            is_batch_job=True,
         )
         # Reconcile gpus count with num_shards from request
         assert hardware.gpus is not None
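For context on why the larger multiplier pushes batch jobs onto more hardware: the hunks above scale the minimum KV-cache reservation by kv_multiplier before the total memory requirement is converted to GB, padded with 10% headroom, and rounded up. Below is a minimal arithmetic sketch of that effect; it is not part of this diff, the factors after config["hidden_size"] are truncated in the hunk, and the max_position_embeddings factor plus the example config values are assumptions made purely for illustration.

import math

def estimate_min_memory_gb(config, model_weights_bytes, is_batch_job=False, dtype_size=2):
    # Sketch mirroring the shape of the formula in the hunk above. The factors
    # after config["hidden_size"] are truncated in this diff; a max sequence
    # length factor is assumed here only to make the arithmetic concrete.
    kv_multiplier = 20 if is_batch_job else 2
    min_kv_cache_size = (
        kv_multiplier
        * dtype_size
        * config["num_hidden_layers"]
        * config["hidden_size"]
        * config["max_position_embeddings"]  # assumed truncated factor
    )
    # Same rounding step as in the diff: bytes -> GB with 10% headroom, rounded up.
    return math.ceil((min_kv_cache_size + model_weights_bytes) / 1_000_000_000 / 0.9)

# Hypothetical config and weight size, chosen only to make the numbers concrete;
# they do not correspond to any fixture in the test file below.
config = {"num_hidden_layers": 80, "hidden_size": 8192, "max_position_embeddings": 4096}
weights_bytes = 140_000_000_000  # ~70B parameters at 2 bytes each

print(estimate_min_memory_gb(config, weights_bytes))                     # ~168 GB under these assumptions
print(estimate_min_memory_gb(config, weights_bytes, is_batch_job=True))  # ~275 GB under these assumptions

Under these assumptions the batch-job estimate lands well above the online-serving estimate, crossing into the memory tiers that map to larger GPU counts, which is consistent with the intent stated in the commit message.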
model-engine/tests/unit/domain/test_llm_use_cases.py (56 additions, 0 deletions)
@@ -1861,6 +1861,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "160Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "mixtral-8x7b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "160Gi"
+    assert hardware.storage == "160Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["MixtralForCausalLM"],
         "attention_dropout": 0.0,
@@ -1892,6 +1899,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "460Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "mixtral-8x22b", "", is_batch_job=True)
+    assert hardware.cpus == "80"
+    assert hardware.gpus == 8
+    assert hardware.memory == "800Gi"
+    assert hardware.storage == "460Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     fake_llm_artifact_gateway.model_config = {
         "_name_or_path": "meta-llama/Llama-2-7b-hf",
         "architectures": ["LlamaForCausalLM"],
@@ -1919,6 +1933,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "80Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-2-7b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "48Gi"
+    assert hardware.storage == "80Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
+
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["LlamaForCausalLM"],
         "attention_dropout": 0.0,
@@ -1947,6 +1968,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "80Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-3-8b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "48Gi"
+    assert hardware.storage == "80Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
+
     fake_llm_artifact_gateway.model_config = {
         "_name_or_path": "meta-llama/Llama-2-13b-hf",
         "architectures": ["LlamaForCausalLM"],
@@ -1974,6 +2002,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "80Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-2-13b", "", is_batch_job=True)
+    assert hardware.cpus == "40"
+    assert hardware.gpus == 4
+    assert hardware.memory == "96Gi"
+    assert hardware.storage == "96Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
+
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["LlamaForCausalLM"],
         "bos_token_id": 1,
@@ -2001,6 +2036,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "96Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_AMPERE_A10
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "codellama-34b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "160Gi"
+    assert hardware.storage == "160Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     fake_llm_artifact_gateway.model_config = {
         "_name_or_path": "meta-llama/Llama-2-70b-hf",
         "architectures": ["LlamaForCausalLM"],
@@ -2028,6 +2070,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "160Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-2-70b", "", is_batch_job=True)
+    assert hardware.cpus == "20"
+    assert hardware.gpus == 2
+    assert hardware.memory == "160Gi"
+    assert hardware.storage == "160Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     fake_llm_artifact_gateway.model_config = {
         "architectures": ["LlamaForCausalLM"],
         "attention_dropout": 0.0,
@@ -2056,6 +2105,13 @@ def test_infer_hardware(fake_llm_artifact_gateway):
     assert hardware.storage == "160Gi"
     assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
 
+    hardware = _infer_hardware(fake_llm_artifact_gateway, "llama-3-70b", "", is_batch_job=True)
+    assert hardware.cpus == "40"
+    assert hardware.gpus == 4
+    assert hardware.memory == "320Gi"
+    assert hardware.storage == "320Gi"
+    assert hardware.gpu_type == GpuType.NVIDIA_HOPPER_H100
+
     # (TODO) figure out how to calculate memory for llama-3-8b-instruct-262k
     # fake_llm_artifact_gateway.model_config = {
     #     "_name_or_path": "gradientai/llama3-8b-stage65k-chat",
