From d94b842b62bfe479f8dcf210167d1258be9d90bd Mon Sep 17 00:00:00 2001
From: Radostin Stoyanov
Date: Sat, 14 Dec 2024 21:14:58 +0000
Subject: [PATCH 1/2] cuda: prevent task lockup on timeout error

When creating a checkpoint of large models, the `checkpoint` action of
`cuda-checkpoint` can exceed the CRIU timeout. This causes CRIU to fail
with the following error, leaving the CUDA task in a locked state:

    cuda_plugin: Checkpointing CUDA devices on pid 84145 restore_tid 84202
    Error (criu/cr-dump.c:1791): Timeout reached. Try to interrupt: 0
    Error (cuda_plugin.c:139): cuda_plugin: Unable to read output of cuda-checkpoint: Interrupted system call
    Error (cuda_plugin.c:396): cuda_plugin: CHECKPOINT_DEVICES failed with
    net: Unlock network
    cuda_plugin: finished cuda_plugin stage 0 err -1
    cuda_plugin: resuming devices on pid 84145
    cuda_plugin: Restore thread pid 84202 found for real pid 84145
    Unfreezing tasks into 1
    Unseizing 84145 into 1
    Error (criu/cr-dump.c:2111): Dumping FAILED.

To fix this, we set `task_info->checkpointed` before invoking the
`checkpoint` action to ensure that the CUDA task is resumed even if
CRIU times out.

Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index e78828b189..976ce824ca 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -391,14 +391,14 @@ int cuda_plugin_checkpoint_devices(int pid) if (resume_restore_thread(restore_tid, &save_sigset)) { return -1; } + + task_info->checkpointed = 1; status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); if (status) { pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); goto interrupt; } - task_info->checkpointed = 1; - interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); From 66cb6deba907ea49e13b42f9ac0590f6c908e3dc Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 21 Dec 2024 14:17:35 +0000 Subject: [PATCH 2/2] seize: use separate checkpoint_devices function Move `run_plugins(CHECKPOINT_DEVICES)` out of `collect_pstree()` to ensure that the function's sole responsibility is to use the cgroup freezer for the process tree. This allows us to avoid a time-out error when checkpointing applications with large GPU state. 
Suggested-by: Andrei Vagin Suggested-by: Jesus Ramos Signed-off-by: Radostin Stoyanov --- criu/cr-dump.c | 6 ++++++ criu/include/seize.h | 1 + criu/seize.c | 23 ++++++++++++++++------- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 1bc5d934f5..e73cad5bc4 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1963,6 +1963,9 @@ int cr_pre_dump_tasks(pid_t pid) if (collect_pstree()) goto err; + if (checkpoint_devices()) + goto err; + if (collect_pstree_ids_predump()) goto err; @@ -2192,6 +2195,9 @@ int cr_dump_tasks(pid_t pid) if (collect_pstree()) goto err; + if (checkpoint_devices()) + goto err; + if (collect_pstree_ids()) goto err; diff --git a/criu/include/seize.h b/criu/include/seize.h index 64e8d2d12f..fc7facad37 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -2,6 +2,7 @@ #define __CR_SEIZE_H__ extern int collect_pstree(void); +extern int checkpoint_devices(void); struct pstree_item; extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); diff --git a/criu/seize.c b/criu/seize.c index 9bd1832d9b..f2c67862bb 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1017,7 +1017,6 @@ int collect_pstree(void) pid_t pid = root_item->pid->real; int ret, exit_code = -1; struct proc_status_creds creds; - struct pstree_item *iter; timing_start(TIME_FREEZING); @@ -1078,6 +1077,21 @@ int collect_pstree(void) goto err; } + exit_code = 0; + timing_stop(TIME_FREEZING); + timing_start(TIME_FROZEN); + +err: + /* Freezing stage finished in time - disable timer. */ + alarm(0); + return exit_code; +} + +int checkpoint_devices(void) +{ + struct pstree_item *iter; + int ret, exit_code = -1; + for_each_pstree_item(iter) { if (!task_alive(iter)) continue; @@ -1087,11 +1101,6 @@ int collect_pstree(void) } exit_code = 0; - timing_stop(TIME_FREEZING); - timing_start(TIME_FROZEN); - err: - /* Freezing stage finished in time - disable timer. 
*/ - alarm(0); return exit_code; -} +} \ No newline at end of file