Skip to content

Commit

Permalink
last ckpt should be model done when afs (#8402)
Browse files Browse the repository at this point in the history
  • Loading branch information
gongel authored May 11, 2024
1 parent 99fbc41 commit 29d40e2
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions paddlenlp/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2141,10 +2141,10 @@ def save_model(self, output_dir: Optional[str] = None, merge_tensor_parallel: Op
if not self.is_in_train:
self.args.unified_checkpoint_config = unified_checkpoint_config_backup
if strtobool(os.getenv("FLAG_LLM_PDC", "False")):
- # save checkpoint_done file to ensure checkpoint is complete
+ # save model_done file to ensure model is complete
if self.args.should_save_model_state and self.args.should_save:
# For ckpt integrity
- paddle.save(self.state.global_step, os.path.join(output_dir, ".checkpoint_done"))
+ paddle.save(self.state.global_step, os.path.join(output_dir, ".model_done"))

def _save_checkpoint(self, model, metrics=None):
# assert unwrap_model(model) is self.model, "internal model should be a reference to self.model"
Expand Down

0 comments on commit 29d40e2

Please sign in to comment.