[Unified Checkpoint] Checkpoint compression #9183

Merged: 44 commits, Nov 25, 2024
Changes shown are from 16 of the 44 commits.

Commits (all by wtmlon):
cd4e5e0  checkpoint compression init (Sep 23, 2024)
7684576  add ckpt quant argument (Sep 24, 2024)
afcecad  add ckpt quant ci (Oct 11, 2024)
d8f3351  fix ci (Oct 11, 2024)
434bd4c  fix lint (Oct 11, 2024)
a98fb8b  remove stage O2, change O3 --> O2 (Oct 11, 2024)
2e5c73b  support async save (Oct 11, 2024)
6b1f3bf  file adjustment (Oct 14, 2024)
c4a80e7  magic string remove (Oct 14, 2024)
ae305a9  ci fix (Oct 14, 2024)
fd6ad57  ci fix, code refinement (Oct 14, 2024)
f766d15  function extraction (Oct 15, 2024)
e74b68b  fix ci (Oct 15, 2024)
a7b053d  code refinement (Oct 15, 2024)
10b1064  fix ci (Oct 15, 2024)
ad1dc75  fix ci (Oct 15, 2024)
fb2c2e9  Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (Oct 16, 2024)
a1c35af  support non merge tp ckpt quantization (Oct 18, 2024)
f8530c0  fix ci (Oct 18, 2024)
4e21fb9  update (Oct 18, 2024)
a602fe5  fix bug (Oct 21, 2024)
55b8639  code refactor (Oct 25, 2024)
3a87734  Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (Oct 25, 2024)
a3073aa  fix lint (Oct 25, 2024)
8a8aca7  fix ci (Oct 25, 2024)
bab5235  Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (Oct 28, 2024)
c3c500d  del old uc.py (Oct 28, 2024)
a45c7f6  fix lint (Oct 28, 2024)
a4a3e23  add mgpu ci (Oct 28, 2024)
2330839  fix ci (Oct 28, 2024)
3fcd471  multi thread loading (Oct 28, 2024)
f57aab5  fix lint (Oct 28, 2024)
50ee148  Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (Oct 29, 2024)
75a1011  fix bug (Nov 5, 2024)
ffd0823  Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (Nov 5, 2024)
4947a8c  refactor code (Nov 7, 2024)
3eaebbb  Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP i… (Nov 19, 2024)
a6b2236  add comment (Nov 19, 2024)
a5d0afa  fix lint (Nov 19, 2024)
fdd92a8  add comment (Nov 19, 2024)
b2b20be  add comment (Nov 19, 2024)
432e97c  fix bug (Nov 20, 2024)
5eb201c  fix bugs when ckpt no quant and no master weight (Nov 21, 2024)
b2bcf16  remove uni-test (Nov 22, 2024)
4 changes: 3 additions & 1 deletion paddlenlp/peft/lora/lora_model.py
@@ -262,7 +262,9 @@ def from_pretrained(cls, model, lora_path, **kwargs):
pre_tensor_parallel_split = True
tp_actions = lora_model._get_tensor_parallel_convert_actions(loaded_keys, is_split=True)
state_dict = load_state_dict(
-shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys
+shard_file,
+tp_actions if pre_tensor_parallel_split else None,
+expected_keys,
)
error_msgs += _load_state_dict_into_model(lora_model.model, state_dict, "")
del state_dict
4 changes: 3 additions & 1 deletion paddlenlp/peft/prefix/prefix_model.py
@@ -333,7 +333,9 @@ def from_pretrained(
pre_tensor_parallel_split = True
tp_actions = prefix_model._get_tensor_parallel_convert_actions(is_split=True)
state_dict = load_state_dict(
-shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys
+shard_file,
+tp_actions if pre_tensor_parallel_split else None,
+expected_keys,
)
error_msgs += _load_state_dict_into_model(prefix_model.prefix_encoder, state_dict, "")
del state_dict
109 changes: 93 additions & 16 deletions paddlenlp/trainer/plugins/unified_checkpoint.py
@@ -48,7 +48,11 @@
)
from paddlenlp.utils.distributed import distributed_allgather, distributed_gather
from paddlenlp.utils.env import (
+BETA1_KEYNAME,
+BETA2_KEYNAME,
LORA_WEIGHTS_NAME,
+MOMENT1_KEYNAME,
+MOMENT2_KEYNAME,
PADDLE_MASTER_WEIGHTS_INDEX_NAME,
PADDLE_MASTER_WEIGHTS_NAME,
PADDLE_OPTIMIZER_INDEX_NAME,
@@ -86,6 +90,7 @@
_traverse_copy_to_shm,
create_meta_dict,
)
+from .unified_checkpoint_quantization import quant_unified_optimizer

FP32_MASTER = "fp32_master_0"
optimizer_scalar_name = [
@@ -115,6 +120,7 @@

SKIP_SAVE_MODEL_WEIGHT = "skip_save_model_weight"
MASTER_WEIGHT_COMPATIBLE = "master_weight_compatible"
+REMOVE_MASTER_WEIGHT = "remove_master_weight"
ASYNC_SAVE = "async_save"
IGNORE_MERGE_OPTIMIZER = "ignore_merge_optimizer"

@@ -149,11 +155,15 @@
self._shared_save_master_weight_flag = multiprocessing.Array("i", 1)
self._shared_save_optimizer_flag = multiprocessing.Array("i", 1)

-def _file_save_async_or_sync(self, state_dict, path, is_sync=True, state_dict_type="model_weight"):
+def _file_save_async_or_sync(
+    self, state_dict, path, is_sync=True, state_dict_type="model_weight", ckpt_quant_stage="O0"
+):
if is_sync:
for k in list(state_dict.keys()):
if isinstance(state_dict[k], paddle.Tensor):
state_dict[k] = state_dict.pop(k).cpu().numpy()

+state_dict = quant_unified_optimizer(state_dict, state_dict_type, ckpt_quant_stage)
safe_save_file(state_dict, path, metadata={"format": "np"})
else:
if state_dict_type == "model_weight":
@@ -221,6 +231,7 @@
self._lock,
state_dict_type,
self.global_rank,
+ckpt_quant_stage,
),
)
self._process_optimizer_weight.start()
@@ -246,6 +257,7 @@
lock,
state_dict_type,
global_rank,
ckpt_quant_stage="O0",
):
shm = shared_memory.SharedMemory(name=shm_name)
while True:
@@ -258,6 +270,9 @@
path = shared_save_path[:].decode("utf-8").rstrip("\x00")
logger.info(f"Start to async save {path}")
state_dict = _read_state_dict_from_shm(meta_dict, shm) # numpy array
+state_dict = quant_unified_optimizer(
+    state_dict, state_dict_type, ckpt_quant_stage
+)  # ckpt quantization
safe_save_file(state_dict, path, {"format": "np"})
del state_dict
saved_signal_path = os.path.join(os.path.dirname(path), f".{state_dict_type}.done.{global_rank}")
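
The hunk above is the async save path: the trainer process copies tensors into shared memory (_traverse_copy_to_shm), and the background save process attaches to the segment, rebuilds the state dict (_read_state_dict_from_shm), now quantizes it, and writes the safetensors file. A self-contained sketch of that shared-memory round trip, NumPy only and with the synchronization flags omitted:

import numpy as np
from multiprocessing import shared_memory

# Producer side: copy an array into a named shared-memory segment
# (a stand-in for _traverse_copy_to_shm in the diff).
arr = np.arange(8, dtype=np.float32)
shm = shared_memory.SharedMemory(create=True, size=arr.nbytes)
np.ndarray(arr.shape, dtype=arr.dtype, buffer=shm.buf)[:] = arr

# Consumer side (the background save process): attach by name and rebuild
# the array (a stand-in for _read_state_dict_from_shm). This is exactly the
# point where the diff inserts quant_unified_optimizer before safe_save_file.
child = shared_memory.SharedMemory(name=shm.name)
view = np.ndarray(arr.shape, dtype=np.float32, buffer=child.buf)
assert (view == arr).all()

child.close()
shm.close()
shm.unlink()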
@@ -356,7 +371,6 @@
if self.args.should_save:
config_to_save.save_pretrained(save_directory)
paddle.device.cuda.empty_cache()

if strtobool(os.getenv("FLAG_LLM_PDC", "False")) and self.args.should_save:
world_size = paddle.distributed.get_world_size()
save_info = {
@@ -433,6 +447,7 @@
path=os.path.join(output_dir, optimizer_name),
is_sync=is_sync_save,
state_dict_type="optimizer_weight",
+ckpt_quant_stage=model.config.ckpt_quant_stage,
)
self._file_save_async_or_sync(
master_weights,
@@ -523,6 +538,7 @@
path=os.path.join(save_directory, shard_optim_file),
is_sync=is_sync_save,
state_dict_type="optimizer_weight",
+ckpt_quant_stage=model.config.ckpt_quant_stage,
)
if master_weight_state_dict is not None:
self._file_save_async_or_sync(
@@ -626,7 +642,10 @@

# save checkpoint
self._file_save_async_or_sync(
-state_dict, path=os.path.join(output_dir, weight_filename), is_sync=True, state_dict_type="model_weight"
+state_dict,
+path=os.path.join(output_dir, weight_filename),
+is_sync=True,
+state_dict_type="model_weight",
)

if isinstance(model_to_save, PrefixModelForCausalLM):
@@ -659,6 +678,11 @@
static_name, type_name = generate_base_static_name(key)
new_name = static2struct_name_mappings[static_name] + "/" + type_name
optim_state_dict[new_name] = optim_state_dict.pop(key)

+if UnifiedCheckpointOption.REMOVE_MASTER_WEIGHT.value in self.args.unified_checkpoint_config:
+    logger.info("Skip master weight saving.")
+    master_weights = None
if master_weights is not None:
for key in list(master_weights.keys()):
master_weights[static2struct_name_mappings[key]] = master_weights.pop(key)
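
The REMOVE_MASTER_WEIGHT branch above drops the fp32 master weights from the optimizer checkpoint entirely. A toy illustration of the save-side effect; the option string is from this PR, while the plain list stands in for args.unified_checkpoint_config:

# Toy illustration, not PaddleNLP API.
unified_checkpoint_config = ["async_save", "remove_master_weight"]

master_weights = {"linear_0.w_0": "fp32 master copy ..."}
if "remove_master_weight" in unified_checkpoint_config:
    # Skip saving fp32 master weights; on resume, update_master_weight_status
    # (near the end of this diff) falls back to loading model weights instead.
    master_weights = None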
@@ -700,6 +724,7 @@
path=os.path.join(output_dir, "optimizer-00001-of-00001.safetensors"),
is_sync=True,
state_dict_type="optimizer_weight",
+ckpt_quant_stage=model.config.ckpt_quant_stage,
)
if master_weights is not None:
self._file_save_async_or_sync(
@@ -800,7 +825,11 @@
tp_actions = model.get_tensor_parallel_convert_actions(model.config, loaded_keys, ignore_error=True)
# Here we use expected_keys to optimize weights loading for pipeline model. Only works for safetensors
state_dict = load_state_dict(
-shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys, device="expected"
+shard_file,
+tp_actions if pre_tensor_parallel_split else None,
+expected_keys,
+device="expected",
+ckpt_quant_stage=model.config.ckpt_quant_stage,
)

if not pre_tensor_parallel_split:
@@ -984,10 +1013,22 @@
tp_actions = mapping_optimizer_tp_actions(tp_actions, expected_keys)

# Here we use expected_keys to optimize weights loading for pipeline model. Only works for safetensors
-state_dict = load_state_dict(shard_file, tp_actions, expected_keys, device="expected")
+state_dict = load_state_dict(
+    shard_file,
+    tp_actions,
+    expected_keys,
+    device="expected",
+    ckpt_quant_stage=model.config.ckpt_quant_stage,
+)
else:
# for pipeline model, we don't need to use tp_actions
-state_dict = load_state_dict(shard_file, None, expected_keys, device="expected")
+state_dict = load_state_dict(
+    shard_file,
+    None,
+    expected_keys,
+    device="expected",
+    ckpt_quant_stage=model.config.ckpt_quant_stage,
+)

returned_state_dict.update(state_dict)
# force memory release
@@ -1000,6 +1041,7 @@
state_dict_master_weight = load_resolved_archive_file(
resolved_archive_file_mw, sharded_metadata_mw, expected_keys_mw, is_master_weights=True
)

# rename optimizer param
for key in list(state_dict_optim.keys()):
key_name = key.split("/")
@@ -1057,6 +1099,11 @@
static_name, type_name = generate_base_static_name(key)
new_name = static2struct_name_mappings[static_name] + "/" + type_name
optim_state_dict[new_name] = optim_state_dict.pop(key)

+if UnifiedCheckpointOption.REMOVE_MASTER_WEIGHT.value in args.unified_checkpoint_config:
+    logger.info("Skip master weight saving.")
+    master_weights = None

if master_weights is not None:
for key in list(master_weights.keys()):
master_weights[static2struct_name_mappings[key]] = master_weights.pop(key)
@@ -1707,7 +1754,9 @@
if len(missing_keys) > 0:
raise ValueError(f"Missing keys: {missing_keys}")

-state_dict = load_state_dict(resolved_archive_file[0], None, expected_keys)
+state_dict = load_state_dict(
+    resolved_archive_file[0], None, expected_keys, ckpt_quant_stage=model.config.ckpt_quant_stage
+)
error_msgs = _load_state_dict_into_model(model, state_dict, "")
del state_dict
gc.collect()
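
These load-side call sites now forward ckpt_quant_stage so the loader knows whether the shard was quantized at save time and must be dequantized back to full precision. The actual scheme lives in unified_checkpoint_quantization.py, which this excerpt does not show; the round trip below is a generic symmetric int8 example of the idea, not the PR's algorithm:

import numpy as np

# Generic per-tensor symmetric int8 round trip; an illustrative stand-in for
# quant_unified_optimizer (save side) and the dequantization load_state_dict
# must apply when ckpt_quant_stage != "O0".
moment = np.random.randn(4, 8).astype(np.float32)

scale = np.abs(moment).max() / 127.0                      # save side
q = np.clip(np.round(moment / scale), -127, 127).astype(np.int8)

recovered = q.astype(np.float32) * scale                  # load side
print("max abs error:", float(np.abs(recovered - moment).max()))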
@@ -1737,9 +1786,13 @@
)
expected_keys_mw = sharded_metadata_mw["all_optimizer_keys"]

-state_dict_optim = load_state_dict(resolved_archive_file[0], None, expected_keys)
+state_dict_optim = load_state_dict(
+    resolved_archive_file[0], None, expected_keys, ckpt_quant_stage=model.config.ckpt_quant_stage
+)
Reviewer comment (Contributor): Same as above, this should not be read from args; the same applies below.

if has_master_weights:
-state_dict_optim_mw = load_state_dict(resolved_archive_file_mw[0], None, expected_keys_mw)
+state_dict_optim_mw = load_state_dict(
+    resolved_archive_file_mw[0], None, expected_keys_mw, ckpt_quant_stage=model.config.ckpt_quant_stage
+)

for key in list(state_dict_optim.keys()):
key_name = key.split("/")
@@ -2008,14 +2061,27 @@
filter_tensor_list = [[] for i in range(tp_size)]

if tp_rank == 0:
+quant = False
+if model_to_save.config.ckpt_quant_stage != "O0":
+    quant = True
tensor_bytes_dict = {}
model_state_dict = get_expected_state_dict(model_to_save)
for (k, v) in state_dict.items():
-model_v = model_state_dict[k.split("/")[0]] if is_optimizer else v
-if hasattr(model_v, "is_distributed") and model_v.is_distributed:
-    tensor_bytes_dict[k] = v.numel().item() * tp_size * dtype_byte_size(v.dtype)
-else:
-    tensor_bytes_dict[k] = v.numel().item() * dtype_byte_size(v.dtype)
+weight_key = k.split("/")[0]
+model_v = model_state_dict[weight_key] if is_optimizer else v
+if not quant or not is_optimizer:
+    if hasattr(model_v, "is_distributed") and model_v.is_distributed:
+        tensor_bytes_dict[k] = v.numel().item() * tp_size * dtype_byte_size(v.dtype)
+    else:
+        tensor_bytes_dict[k] = v.numel().item() * dtype_byte_size(v.dtype)
+else:
+    if weight_key not in tensor_bytes_dict:
+        tensor_bytes_dict[weight_key] = 0
+
+    if hasattr(model_v, "is_distributed") and model_v.is_distributed:
+        tensor_bytes_dict[weight_key] += v.numel().item() * tp_size * dtype_byte_size(v.dtype)
+    else:
+        tensor_bytes_dict[weight_key] += v.numel().item() * dtype_byte_size(v.dtype)
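
With quantization enabled, the sizing pass above keys byte counts by the owning parameter (everything before the "/" in an optimizer key) rather than by each optimizer-state entry, so all of a parameter's states land in the same shard and can be quantized together. A small sketch of that accumulation, with made-up key names and sizes:

# Made-up keys and byte sizes, mirroring the grouping logic above.
entries = [
    ("linear_0.w_0/moment1_0", 4096),
    ("linear_0.w_0/moment2_0", 4096),
    ("linear_0.w_0/beta1_pow_acc_0", 4),
    ("linear_0.w_0/beta2_pow_acc_0", 4),
]

tensor_bytes_dict = {}
for k, nbytes in entries:
    weight_key = k.split("/")[0]
    tensor_bytes_dict[weight_key] = tensor_bytes_dict.get(weight_key, 0) + nbytes

print(tensor_bytes_dict)  # {'linear_0.w_0': 8200}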

filter_tensor_list = []
current_block = []
@@ -2036,7 +2102,14 @@
current_block = []
current_block_size = 0

-current_block.append(key)
+if not quant or not is_optimizer:
+    current_block.append(key)
+else:
+    current_block.append(key + "/" + MOMENT1_KEYNAME)
+    current_block.append(key + "/" + MOMENT2_KEYNAME)
+    current_block.append(key + "/" + BETA1_KEYNAME)
+    current_block.append(key + "/" + BETA2_KEYNAME)
current_block_size += weight_size
total_size += weight_size

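
When a grouped weight key is then assigned to a shard block, it is expanded back into its four Adam state entries via the KEYNAME constants imported at the top of this file's diff. Sketch below; only the constant names appear in this excerpt, so their string values here are assumptions:

# Assumed string values; only the names MOMENT1_KEYNAME etc. are shown above.
MOMENT1_KEYNAME, MOMENT2_KEYNAME = "moment1_0", "moment2_0"
BETA1_KEYNAME, BETA2_KEYNAME = "beta1_pow_acc_0", "beta2_pow_acc_0"

key = "linear_0.w_0"
current_block = [
    key + "/" + suffix
    for suffix in (MOMENT1_KEYNAME, MOMENT2_KEYNAME, BETA1_KEYNAME, BETA2_KEYNAME)
]
print(current_block)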
@@ -2307,7 +2380,10 @@
def update_master_weight_status(args, optimizer, has_master_weight, safe_serialization):
if is_need_master_weight(optimizer, is_fp16_or_bp16=(args.fp16 or args.bf16)):
if not has_master_weight:
-if UnifiedCheckpointOption.MASTER_WEIGHT_COMPATIBLE.value in args.unified_checkpoint_config:
+if (
+    UnifiedCheckpointOption.REMOVE_MASTER_WEIGHT.value in args.unified_checkpoint_config
+    or UnifiedCheckpointOption.MASTER_WEIGHT_COMPATIBLE.value in args.unified_checkpoint_config
+):
index_filename_master_weights = (
PADDLE_WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_WEIGHTS_INDEX_NAME
)
@@ -2319,7 +2395,8 @@
else:
raise ValueError(
"Can't find a valid unified master weight checkpoint,"
f"add '{UnifiedCheckpointOption.MASTER_WEIGHT_COMPATIBLE.value}' into 'unified_checkpoint_config' to "
f"add '{UnifiedCheckpointOption.MASTER_WEIGHT_COMPATIBLE.value}'"
f" or '{UnifiedCheckpointOption.REMOVE_MASTER_WEIGHT.value}' into 'unified_checkpoint_config' to "
"load model checkpoint as master weight"
)
else: