PaddlePaddle · wawltor · Aug 5, 2024 · Jul 30, 2024 · Jul 30, 2024 · Jul 30, 2024
diff --git a/legacy/examples/RLHF/infer_utils.py b/legacy/examples/RLHF/infer_utils.py
@@ -73,9 +73,7 @@ def create_infer_model(model, dtype, set_state=False):
             hcg = dist.fleet.get_hybrid_communicate_group()  # may differ with training
             config.tensor_parallel_degree = hcg.get_model_parallel_world_size()
             config.tensor_parallel_rank = hcg.get_model_parallel_rank()
-            config.weight_only_quant_bits = -1
             config.quant_type = None
-            config.use_cachekv_int8 = False
             config.single_card_ptq = True
             infer_model_cls = getattr(paddlenlp.experimental.transformers, model.__class__.__name__ + "InferenceModel")
             # ori_init_weights = infer_model_cls.init_weights

diff --git a/llm/alignment/ppo/infer_utils.py b/llm/alignment/ppo/infer_utils.py
@@ -73,9 +73,7 @@ def create_infer_model(model, dtype, set_state=False):
             hcg = dist.fleet.get_hybrid_communicate_group()  # may differ with training
             config.tensor_parallel_degree = hcg.get_model_parallel_world_size()
             config.tensor_parallel_rank = hcg.get_model_parallel_rank()
-            config.weight_only_quant_bits = -1
             config.quant_type = None
-            config.use_cachekv_int8 = False
             config.single_card_ptq = True
             infer_model_cls = getattr(paddlenlp.experimental.transformers, model.__class__.__name__ + "InferenceModel")
             # ori_init_weights = infer_model_cls.init_weights

diff --git a/llm/docs/inference.md b/llm/docs/inference.md
@@ -154,8 +154,10 @@ python ./predict/predictor.py  --model_name_or_path ./inference --inference_mode
 
 # PTQ-A8W8静态图推理命令参考
 # 以下环境变量用于开启int8矩阵乘的算法选择以获得更快的推理速度，打开之后第一次执行会执行算法选择从而导致速度较慢。
-export FLAGS_use_autotune=1
-export FLAGS_cublaslt_exhaustive_search_times=10
+# 开启后会在计算int8 matmul时启用cuBLASLt全局搜索找寻最优配置
+export FLAGS_enable_blaslt_global_search=1
+# 开启后会从离线文件中加载int8 matmul配置（使用方式可参考 https://github.com/PaddlePaddle/Paddle/pull/66132 中的描述）
+export FLAGS_cublaslt_device_best_config=/path/to/file
 export FLAGS_cache_inference_while_scope=1
 
 python ./predict/predictor.py  --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static"
@@ -185,7 +187,7 @@ python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --
 python ./predict/predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --dtype float16 --block_attn
 
 # CacheKV 动态量化推理命令参考
-python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn --cachekv_int8
+python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --block_attn --cachekv_int8_type dynamic
 ```
 
 #### 2.4.2 静态图推理
@@ -204,7 +206,7 @@ python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat
 python ./predict/export_model.py --model_name_or_path checkpoints/llama_ptq_ckpts --inference_model --output_path ./inference --dtype float16 --block_attn
 
 # CacheKV 动态量化动转静命令参考
-python ./predict/export_model.py  --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --block_attn --cachekv_int8
+python ./predict/export_model.py  --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --block_attn --cachekv_int8_type dynamic
 ```
 
 **step2：静态图推理**
@@ -226,12 +228,13 @@ export FLAGS_cache_inference_while_scope=1
 
 python ./predict/predictor.py  --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --block_attn
 
-# CacheKV 动态量化8静态图推理命令参考
-python ./predict/predictor.py  --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --cachekv_int8 --block_attn
+# CacheKV 动态量化int8静态图推理命令参考
+python ./predict/predictor.py  --model_name_or_path ./inference --inference_model --dtype "float16" --mode "static" --cachekv_int8_type dynamic --block_attn
 ```
 **Note**：
-1. 使用Weight Only Int8 推理需要额外传入 `quant_type`。
-2. A8W8推理传入的 `model_name_or_path` 为PTQ校准产出的量化模型。
+1. `quant_type`可选的数值有`weight_only_int8`，`weight_only_int4`，`a8w8`，`a8w8c8`。
+2. `a8w8`推理传入的 `model_name_or_path` 为PTQ校准产出的量化模型，需要额外的act和weight的scale校准表。
+3. `cachekv_int8_type`可选`dynamic`和`static`两种，`static`需要额外的cache kv的scale校准表。
 
 
 ## 3. 推理参数介绍
@@ -254,4 +257,4 @@ python ./predict/predictor.py  --model_name_or_path ./inference --inference_mode
 - `inference_model`: 是否使用Inference Model 推理，默认值为 False。
 - `block_attn`: 是否使用Block Attention 推理， 默认值为False。
 - `block_size`: 如果使用Block Attention 推理，指定一个Block可以存储的token数量，默认值为64。
-- `cachekv_int8`: 是否使用cachekv int8量化用于节省显存，默认值为False。
+- `cachekv_int8_type`: 是否使用cachekv int8量化用于节省显存，可选`dynamic`（动态量化）或`static`（静态量化），默认值为None（不使用cachekv int8量化）。
diff --git a/llm/predict/export_model.py b/llm/predict/export_model.py
@@ -57,7 +57,7 @@ def main():
         {
             "dtype": predictor_args.dtype,
             "export_precache": predictor_args.export_precache,
-            "use_cachekv_int8": predictor_args.use_cachekv_int8,
+            "cachekv_int8_type": predictor_args.cachekv_int8_type,
         },
     )
     predictor.model.config.save_pretrained(export_args.output_path)

diff --git a/llm/predict/predictor.py b/llm/predict/predictor.py
@@ -96,8 +96,8 @@ class PredictorArgument:
     )
     inference_model: bool = field(default=False, metadata={"help": "whether use InferenceModel to do generation"})
     quant_type: str = field(
-        default=None,
-        metadata={"help": "Quantization type. Supported values: a8w8, weight_only_int4, weight_only_int8"},
+        default="",
+        metadata={"help": "Quantization type. Supported values: a8w8, a8w8c8, weight_only_int4, weight_only_int8"},
     )
     avx_model: bool = field(
         default=False, metadata={"help": "whether use AvxModel to do generation when using cpu inference"}
@@ -116,9 +116,9 @@ class PredictorArgument:
 
     block_attn: bool = field(default=False, metadata={"help": "whether use block attention"})
     block_size: int = field(default=64, metadata={"help": "the block size for cache_kvs."})
-    cachekv_int8: bool = field(
-        default=False,
-        metadata={"help": "If cachekv_int8 set as `True`, cache kv would be quantized to int8 dynamically. "},
+    cachekv_int8_type: str = field(
+        default=None,
+        metadata={"help": "If cachekv_int8_type set as `dynamic`, cache kv would be quantized to int8 dynamically. If cachekv_int8_type set as `static`, cache kv would be quantized to int8 Statically."},
     )
 
     chat_template: str = field(
@@ -136,10 +136,6 @@ class PredictorArgument:
     def total_max_length(self):
         return self.src_length + self.max_length
 
-    @property
-    def use_cachekv_int8(self):
-        return "dynamic" if self.cachekv_int8 else "None"
-
 
 @dataclass
 class ModelArgument:
@@ -824,7 +820,7 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer):
                 paddle.ones(shape=[config.batch_size, 1, config.src_length, config.src_length], dtype=config.dtype)
             )
 
-        if config.use_cachekv_int8 == "dynamic":
+        if config.cachekv_int8_type == "dynamic":
             self.k_quant_scales = [
                 paddle.zeros([config.batch_size, self.num_attention_heads], dtype="float32")
                 for _ in range(self.num_layers)
@@ -1015,17 +1011,17 @@ def __init__(
         BasePredictor.__init__(self, config, tokenizer)
         BlockInferencePredictorMixin.__init__(self, config, tokenizer)
 
-        if config.use_cachekv_int8 == "dynamic" or config.use_cachekv_int8 == "static":
-            self.cache_kvs = [paddle.zeros(shape, dtype="uint8") for shape in self.cache_kvs_shape]
-        else:
-            self.cache_kvs = [paddle.zeros(shape, dtype=self.dtype) for shape in self.cache_kvs_shape]
+        cachekv_dtype = self.dtype
+        if config.cachekv_int8_type is not None:
+            cachekv_dtype = "uint8"
+        self.cache_kvs = [paddle.zeros(shape, dtype=cachekv_dtype) for shape in self.cache_kvs_shape]
 
         self.model = model
 
         self.init_inputs(config)
         if config.export_precache:
             self.inputs["pre_caches"] = self.pre_caches
-        if config.use_cachekv_int8 == "dynamic":
+        if config.cachekv_int8_type == "dynamic":
             self.inputs["k_quant_scales"] = self.k_quant_scales
             self.inputs["v_quant_scales"] = self.v_quant_scales
             self.inputs["k_dequant_scales"] = self.k_dequant_scales
@@ -1090,23 +1086,19 @@ def __init__(
                 self.inputs["pre_caches_{}".format(i)] = self.pre_caches[i]
 
         self.cache_kvs = {}
-        if config.use_cachekv_int8 == "dynamic" or config.use_cachekv_int8 == "static":
-            for i in range(len(self.cache_kvs_shape) // 2):
-                self.cache_kvs["key_caches_{}".format(i)] = paddle.zeros(self.cache_kvs_shape[2 * i], dtype="uint8")
-                self.cache_kvs["value_caches_{}".format(i)] = paddle.zeros(
-                    self.cache_kvs_shape[2 * i + 1], dtype="uint8"
-                )
-        else:
-            for i in range(len(self.cache_kvs_shape) // 2):
-                self.cache_kvs["key_caches_{}".format(i)] = paddle.zeros(
-                    self.cache_kvs_shape[2 * i], dtype=config.dtype
-                )
-                self.cache_kvs["value_caches_{}".format(i)] = paddle.zeros(
-                    self.cache_kvs_shape[2 * i + 1], dtype=config.dtype
-                )
+        cachekv_dtype = config.dtype
+        if config.cachekv_int8_type is not None:
+            cachekv_dtype = "uint8"
+        for i in range(len(self.cache_kvs_shape) // 2):
+            self.cache_kvs["key_caches_{}".format(i)] = paddle.zeros(
+                self.cache_kvs_shape[2 * i], dtype=cachekv_dtype
+            )
+            self.cache_kvs["value_caches_{}".format(i)] = paddle.zeros(
+                self.cache_kvs_shape[2 * i + 1], dtype=cachekv_dtype
+            )
 
         for i in range(self.num_layers):
-            if self.config.use_cachekv_int8 == "dynamic":
+            if self.config.cachekv_int8_type == "dynamic":
                 self.inputs["k_quant_scales_" + str(i)] = self.k_quant_scales[i]
                 self.inputs["v_quant_scales_" + str(i)] = self.v_quant_scales[i]
                 self.inputs["k_dequant_scales_" + str(i)] = self.k_dequant_scales[i]
@@ -1362,35 +1354,23 @@ def create_predictor(
             config = AutoConfig.from_pretrained(predictor_args.model_name_or_path)
             config.tensor_parallel_degree = tensor_parallel_degree
             config.tensor_parallel_rank = tensor_parallel_rank
-            config.weight_only_quant_bits = -1
-            config.quant_type = None
-            config.model_name_or_path = ""
-            config.use_cachekv_int8 = predictor_args.use_cachekv_int8
+            config.model_name_or_path = predictor_args.model_name_or_path
+            config.quant_type = predictor_args.quant_type
+            config.cachekv_int8_type = predictor_args.cachekv_int8_type
             config.single_card_ptq = True
             if predictor_args.avx_model:
                 config.avx_type = predictor_args.avx_type
 
-            if predictor_args.quant_type is not None:
-                if predictor_args.quant_type.startswith("weight_only_int"):
-                    weight_only_quant_bits = int(predictor_args.quant_type[-1])
-                    config.weight_only_quant_bits = weight_only_quant_bits
-                    config.quant_type = predictor_args.quant_type
-                elif predictor_args.quant_type == "a8w8":
-                    config.quant_type = predictor_args.quant_type
-
-            if config.quantization_config.quant_type is not None and "a8w8" in config.quantization_config.quant_type:
-                config.model_name_or_path = predictor_args.model_name_or_path
+            if config.quantization_config.quant_type is not None:
                 config.quant_type = config.quantization_config.quant_type
+                if "c8" in config.quant_type:
+                    config.cachekv_int8_type = "static"
 
                 ptq_multicards_num = get_ptq_multicards_num(config.model_name_or_path)
                 logger.info(f"PTQ from {ptq_multicards_num} cards, so we will not split")
                 if ptq_multicards_num > 1:
                     config.single_card_ptq = False
 
-                # Turn on GEMM int8 kernel tuning
-                paddle.base.core.enable_autotune()
-                paddle.base.core.update_autotune_status()
-
             if "llama" in config.architectures[0].lower():
                 if model_args.model_type == "llama-img2txt":
                     # we use llama for img2txt.
@@ -1530,7 +1510,6 @@ def create_predictor(
                 if predictor_args.block_attn:
                     config.block_size = predictor_args.block_size
                     config.max_seq_len = predictor_args.total_max_length
-                    config.use_dynamic_cachekv_quant = predictor_args.use_cachekv_int8 == "dynamic"
                     from paddlenlp.experimental.transformers import (
                         LlamaForCausalLMBlockInferenceModel as LlamaInferenceModel,
                     )

diff --git a/paddlenlp/experimental/transformers/bloom/modeling.py b/paddlenlp/experimental/transformers/bloom/modeling.py
@@ -80,11 +80,14 @@
 
         self.embed_dim = config.hidden_size
         self.n_head = config.n_head
+
         self.use_weight_only = False
-        self.weight_only_quant_bits = config.weight_only_quant_bits
-        self.quant_algo = "weight_only_int" + str(self.weight_only_quant_bits)
-        if self.weight_only_quant_bits != -1:
+        if config.quant_type == "weight_only_int8":
+            self.use_weight_only = True
+            self.quant_algo = "weight_only_int8"
+        elif config.quant_type == "weight_only_int4":
             self.use_weight_only = True
+            self.quant_algo = "weight_only_int4"
 
         if self.use_weight_only:
             assert (
@@ -171,7 +174,7 @@
             self.embed_dim,
             self.n_head,
             4 * self.embed_dim,
-            weight_only_quant_bits=self.weight_only_quant_bits,
+            quant_type=config.quant_type,
             activation="gelu",
             num_layers=config.n_layer,
             nranks=config.tensor_parallel_degree,

diff --git a/paddlenlp/experimental/transformers/chatglm/modeling.py b/paddlenlp/experimental/transformers/chatglm/modeling.py
@@ -127,17 +127,12 @@
         self.world_size = 1
 
         self.use_weight_only = False
-        self.weight_only_quant_bits = config.weight_only_quant_bits
-        self.quant_algo = "weight_only_int" + str(self.weight_only_quant_bits)
-        if self.weight_only_quant_bits != -1:
+        if config.quant_type == "weight_only_int8":
             self.use_weight_only = True
-
-        if self.use_weight_only:
-            assert (
-                self.quant_algo == "weight_only_int8" or self.quant_algo == "weight_only_int4"
-            ), "Expected quant_algo equal to 'weight_only_int8' or 'weight_only_int4', but received {}".format(
-                self.quant_algo
-            )
+            self.quant_algo = "weight_only_int8"
+        elif config.quant_type == "weight_only_int4":
+            self.use_weight_only = True
+            self.quant_algo = "weight_only_int4"
 
         try:
             self.current_rank = paddle.distributed.get_rank()
@@ -238,7 +233,7 @@
             config.hidden_size,
             config.num_attention_heads,
             4 * config.hidden_size,
-            weight_only_quant_bits=self.weight_only_quant_bits,
+            quant_type=config.quant_type,
             activation="gelu",
             num_layers=config.num_layers,
             nranks=config.tensor_parallel_degree,

diff --git a/paddlenlp/experimental/transformers/chatglm_v2/modeling.py b/paddlenlp/experimental/transformers/chatglm_v2/modeling.py
@@ -78,17 +78,12 @@
         self.multi_query_group_num = config.multi_query_group_num
 
         self.use_weight_only = False
-        self.weight_only_quant_bits = config.weight_only_quant_bits
-        self.quant_algo = "weight_only_int" + str(self.weight_only_quant_bits)
-        if self.weight_only_quant_bits != -1:
+        if config.quant_type == "weight_only_int8":
             self.use_weight_only = True
-
-        if self.use_weight_only:
-            assert (
-                self.quant_algo == "weight_only_int8" or self.quant_algo == "weight_only_int4"
-            ), "Expected quant_algo equal to 'weight_only_int8' or 'weight_only_int4', but received {}".format(
-                self.quant_algo
-            )
+            self.quant_algo = "weight_only_int8"
+        elif config.quant_type == "weight_only_int4":
+            self.use_weight_only = True
+            self.quant_algo = "weight_only_int4"
 
         ln_scale_attrs = [
             paddle.ParamAttr(name="encoder.layers.{}.input_layernorm.weight".format(i))
@@ -159,7 +154,7 @@
             config.num_attention_heads,
             config.ffn_hidden_size,
             dropout_rate=0.0,
-            weight_only_quant_bits=self.weight_only_quant_bits,
+            quant_type=config.quant_type,
             activation="swiglu",
             normalize_before=True,
             num_layers=config.num_hidden_layers,