From 07dbd3455c41a1257cb1222d99fe4130e8f7e232 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Dec 2024 13:01:45 +0200 Subject: [PATCH] fix graceful shutdown Signed-off-by: Konrad Zawora --- vllm/executor/multiproc_hpu_executor.py | 3 +++ vllm/executor/multiproc_worker_utils.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/vllm/executor/multiproc_hpu_executor.py b/vllm/executor/multiproc_hpu_executor.py index a1ad4e906c6f8..118a18c02b072 100644 --- a/vllm/executor/multiproc_hpu_executor.py +++ b/vllm/executor/multiproc_hpu_executor.py @@ -42,6 +42,9 @@ def _check_executor_parameters(self): f"please ensure that world_size ({world_size}) " f"is less than than max local hpu count ({hpu_device_count})") + def __del__(self): + self.shutdown() + class MultiprocessingHPUExecutorAsync(MultiprocessingHPUExecutor, MultiprocessingGPUExecutorAsync): diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index fe475db6d3f57..d187643392d55 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -15,6 +15,7 @@ import vllm.envs as envs from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.triton_utils.importing import HAS_TRITON from vllm.utils import cuda_is_initialized @@ -291,6 +292,22 @@ def set_multiprocessing_worker_envs(parallel_config): "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.") os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + if (current_platform.is_hpu() + and parallel_config.distributed_executor_backend == 'mp' + and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'): + if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) is not None: + logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork might " + "cause application hangs on exit. Using " + "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, " + "as it was explicitly requested.") + else: + logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork might " + "cause application hangs on exit. Setting " + "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " + "To override that behavior, please set " + "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + # Configure thread parallelism if OMP_NUM_THREADS isn't set # # Helps to avoid CPU contention. The default of spawning a thread per