From 29d40e2da18d7ea43d3d8308d78cf364f9783b15 Mon Sep 17 00:00:00 2001 From: gongel Date: Sat, 11 May 2024 14:30:44 +0800 Subject: [PATCH] last ckpt should be model done when afs (#8402) --- paddlenlp/trainer/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 590c884d9c0f..76034060c721 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -2141,10 +2141,10 @@ def save_model(self, output_dir: Optional[str] = None, merge_tensor_parallel: Op if not self.is_in_train: self.args.unified_checkpoint_config = unified_checkpoint_config_backup if strtobool(os.getenv("FLAG_LLM_PDC", "False")): - # save checkpoint_done file to ensure checkpoint is complete + # save model_done file to ensure model is complete if self.args.should_save_model_state and self.args.should_save: # For ckpt integrity - paddle.save(self.state.global_step, os.path.join(output_dir, ".checkpoint_done")) + paddle.save(self.state.global_step, os.path.join(output_dir, ".model_done")) def _save_checkpoint(self, model, metrics=None): # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model"