Commit 8717c5e: fix codestyle
zhink committed Jun 13, 2024
1 parent 8d67d0f commit 8717c5e
Showing 10 changed files with 36 additions and 13 deletions.
17 changes: 10 additions & 7 deletions llm/predictor.py
@@ -652,7 +652,7 @@ def _create_predictor(self, predictor_args: PredictorArgument):
config.enable_custom_device(predictor_args.device, device_id)
elif predictor_args.device == "xpu":
raise ValueError(
"you should export xpu static model with --block_attn flag and use predictor with --block_attn too"
"you should export xpu static model with --block_attn flag and use predictor with --block_attn too"
"https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/docs/inference.md"
)
else:
@@ -925,7 +925,9 @@ def _preprocess(self, source):
source = [self.tokenizer.apply_chat_template(sentence, tokenize=False) for sentence in source]

for i, text in enumerate(source):
add_special_tokens = self.tokenizer.chat_template is None or isinstance(self.tokenizer, (ChatGLMv2Tokenizer, ChatGLMTokenizer))
add_special_tokens = self.tokenizer.chat_template is None or isinstance(
self.tokenizer, (ChatGLMv2Tokenizer, ChatGLMTokenizer)
)
add_special_tokens = add_special_tokens if not self.benchmark else False
tokens = self.tokenizer(
text,
@@ -1087,10 +1089,9 @@ def _create_predictor(self, predictor_args: PredictorArgument):
config.set_xpu_device_id(device_id)
xpu_config = paddle.inference.XpuConfig()
xpu_config.device_id = device_id
xpu_config.l3_size = 63*1024*1024
xpu_config.l3_autotune_size = 63*1024*1024
xpu_config.l3_size = 63 * 1024 * 1024
xpu_config.l3_autotune_size = 63 * 1024 * 1024
config.set_xpu_config(xpu_config)
config.enable_new_executor()
else:
device_id = int(os.environ.get("FLAGS_selected_gpus", 0))
config.enable_use_gpu(100, device_id)
@@ -1348,7 +1349,7 @@ def create_predictor(
else:
if predictor_args.device == "xpu":
raise ValueError(
"you should run xpu dynamic model with --block_attn flag"
"you should run xpu dynamic model with --block_attn flag"
"https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/docs/inference.md"
)
from paddlenlp.experimental.transformers import (
@@ -1608,7 +1609,9 @@ def predict():

def benchmark(predictor, predictor_args, model_args):
# Just construct a simple benchmark input. We pad input to the src_length.
benchmark_texts = [predictor.tokenizer.pad_token * predictor_args.src_length for _ in range(predictor_args.batch_size)]
benchmark_texts = [
predictor.tokenizer.pad_token * predictor_args.src_length for _ in range(predictor_args.batch_size)
]

batch_benchmark_texts = batchfy_text(benchmark_texts, predictor_args.batch_size)
print("***********Start Benchmark**********")
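The llm/predictor.py hunks above are formatting-only: long lines are wrapped and the L3-cache constants gain spaces around `*`. For reference, a self-contained sketch of the XPU inference configuration assembled in the `_create_predictor` hunk; the model paths and device id below are illustrative placeholders, only the 63 MiB L3-cache values come from the diff.

import paddle

def build_xpu_inference_config(model_file="model.pdmodel", params_file="model.pdiparams", device_id=0):
    # Placeholder paths; the real predictor derives these from PredictorArgument.
    config = paddle.inference.Config(model_file, params_file)
    config.set_xpu_device_id(device_id)

    xpu_config = paddle.inference.XpuConfig()
    xpu_config.device_id = device_id
    # Reserve 63 MiB of on-chip L3 cache and let the runtime autotune how it is used.
    xpu_config.l3_size = 63 * 1024 * 1024
    xpu_config.l3_autotune_size = 63 * 1024 * 1024
    config.set_xpu_config(xpu_config)
    return config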
2 changes: 2 additions & 0 deletions paddlenlp/experimental/transformers/bloom/modeling.py
@@ -219,6 +219,7 @@ def remove_padding(self, input_ids, seq_lens_this_time):
cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time)
token_num = paddle.sum(seq_lens_this_time)
from paddlenlp_ops import get_padding_offset

ids_remove_padding, cum_offsets, padding_offset = get_padding_offset(
input_ids, cum_offsets_now, token_num, seq_lens_this_time
)
@@ -593,6 +594,7 @@ def remove_padding(self, input_ids, seq_lens_this_time):
cum_offsets_now = paddle.cumsum(self.max_seq_len - seq_lens_this_time)
token_num = paddle.sum(seq_lens_this_time)
from paddlenlp_ops import get_padding_offset_v2

ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k = get_padding_offset_v2(
input_ids, cum_offsets_now, token_num, seq_lens_this_time
)
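Both hunks in bloom/modeling.py only add a blank line after the function-local `from paddlenlp_ops import ...` statement, as the formatter requires. For readers following the surrounding logic, the sketch below reimplements the padding-removal bookkeeping in plain paddle; the exact output layout of the `get_padding_offset` custom op is an assumption made for readability, so treat this as documentation rather than a drop-in replacement.

import paddle

def remove_padding_reference(input_ids, seq_lens_this_time):
    # input_ids: [batch, max_len] right-padded token ids
    # seq_lens_this_time: [batch] number of valid tokens per sequence
    max_len_t = paddle.max(seq_lens_this_time)
    cum_offsets_now = paddle.cumsum(max_len_t - seq_lens_this_time)  # accumulated padding per sequence
    token_num = int(paddle.sum(seq_lens_this_time))  # total valid tokens in the batch
    max_len = int(max_len_t)

    ids, padding_offset, removed = [], [], 0
    for i, seq_len in enumerate(seq_lens_this_time.tolist()):
        ids.append(input_ids[i, :seq_len])
        # For every kept token, record how many padded slots precede it in the padded layout.
        padding_offset.extend([removed] * seq_len)
        removed += max_len - seq_len
    ids_remove_padding = paddle.concat(ids)
    assert ids_remove_padding.shape[0] == token_num
    return ids_remove_padding, cum_offsets_now, paddle.to_tensor(padding_offset)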
1 change: 1 addition & 0 deletions paddlenlp/experimental/transformers/chatglm/modeling.py
@@ -273,6 +273,7 @@ def remove_padding(self, input_ids, seq_lens_this_time):
cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time)
token_num = paddle.sum(seq_lens_this_time)
from paddlenlp_ops import get_padding_offset

ids_remove_padding, cum_offsets, padding_offset = get_padding_offset(
input_ids, cum_offsets_now, token_num, seq_lens_this_time
)
1 change: 1 addition & 0 deletions paddlenlp/experimental/transformers/chatglm_v2/modeling.py
@@ -202,6 +202,7 @@ def remove_padding(self, input_ids, seq_lens_this_time):
cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time)
token_num = paddle.sum(seq_lens_this_time)
from paddlenlp_ops import get_padding_offset

ids_remove_padding, cum_offsets, padding_offset = get_padding_offset(
input_ids, cum_offsets_now, token_num, seq_lens_this_time
)
10 changes: 5 additions & 5 deletions paddlenlp/experimental/transformers/fused_transformer_layers.py
@@ -15,7 +15,7 @@

import paddle
import paddle.distributed as dist
from paddle.framework import LayerHelper, in_dynamic_mode, core
from paddle.framework import LayerHelper, core, in_dynamic_mode
from paddle.incubate.nn.functional import (
fused_layer_norm,
fused_rms_norm,
@@ -28,15 +28,15 @@

from paddlenlp.utils.import_utils import is_paddlenlp_ops_available
from paddlenlp.utils.log import logger
from paddlenlp_ops import rebuild_padding_v2


if not is_paddlenlp_ops_available():
logger.warning(
"The paddlenlp_ops package is not installed. you can read the docs and install it by hand, "
"you can refer to: https://github.com/PaddlePaddle/PaddleNLP/blob/develop/csrc/README.md"
)

from paddlenlp_ops import rebuild_padding_v2

if core.is_compiled_with_cuda():
from paddlenlp_ops import (
dequant_int8,
@@ -1350,8 +1350,8 @@ class FusedBlockMultiTransformer(FusedMultiTransformerBase):
def __init__(self, config: FusedMultiTransformerConfig):
super().__init__(config)
if not core.is_compiled_with_cuda():
self.cache_k_per_batch_maxs = paddle.full(shape=[10, 6], fill_value=0, dtype='float32')
self.cache_v_per_batch_maxs = paddle.full(shape=[10, 6], fill_value=0, dtype='float32')
self.cache_k_per_batch_maxs = paddle.full(shape=[10, 6], fill_value=0, dtype="float32")
self.cache_v_per_batch_maxs = paddle.full(shape=[10, 6], fill_value=0, dtype="float32")

def compute_attn(
self,
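The fused_transformer_layers.py changes are also style fixes: the `paddle.framework` import is reordered alphabetically, the dtype strings switch to double quotes, and `from paddlenlp_ops import rebuild_padding_v2` now sits below the availability warning. Condensed from the visible hunk context (the exact surrounding lines are assumptions), the resulting import layout looks like this:

from paddle.framework import LayerHelper, core, in_dynamic_mode  # reordered alphabetically

from paddlenlp.utils.import_utils import is_paddlenlp_ops_available
from paddlenlp.utils.log import logger

if not is_paddlenlp_ops_available():
    # Warn instead of failing at import time when the optional custom-op package is missing.
    logger.warning(
        "The paddlenlp_ops package is not installed. you can read the docs and install it by hand, "
        "you can refer to: https://github.com/PaddlePaddle/PaddleNLP/blob/develop/csrc/README.md"
    )

from paddlenlp_ops import rebuild_padding_v2  # imported after the guard, as in the diff

if core.is_compiled_with_cuda():
    # CUDA-only kernels are imported lazily behind the compile-time check.
    from paddlenlp_ops import dequant_int8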
11 changes: 10 additions & 1 deletion paddlenlp/experimental/transformers/generation_utils.py
@@ -198,6 +198,7 @@ def update_model_kwargs_for_generation(self, cache, just_decoder, next_tokens, e
if cache is None:
next_tokens = paddle.where(just_decoder, paddle.full_like(next_tokens, -1), next_tokens)
from paddlenlp_ops import set_stop_value_multi_ends

next_tokens, model_kwargs["stop_flags"] = set_stop_value_multi_ends(
next_tokens, model_kwargs["stop_flags"], eos_token_id, 2
) # multi ends
@@ -296,6 +297,7 @@ def _post_process_(outputs, top_p, temperature, step_idx_ori, model_kwargs):
else:
step_idx = model_kwargs["step_idx"]
from paddlenlp_ops import set_value_by_flags_and_idx

model_kwargs["stop_flags"] = set_value_by_flags_and_idx(
model_kwargs["pre_ids"],
model_kwargs["tgt_ids"],
@@ -308,6 +310,7 @@ def _post_process_(outputs, top_p, temperature, step_idx_ori, model_kwargs):
logits = logits_processors(model_kwargs["all_input_ids"], logits, decoding_step=step_idx_ori)

from paddlenlp_ops import get_token_penalty_multi_scores

logits = get_token_penalty_multi_scores(
model_kwargs["pre_ids"],
logits,
@@ -319,7 +322,7 @@ def _post_process_(outputs, top_p, temperature, step_idx_ori, model_kwargs):
eos_token_id,
)
logits = logits / temperature

# sample
probs = F.softmax(logits)

@@ -340,6 +343,7 @@ def _post_process_(outputs, top_p, temperature, step_idx_ori, model_kwargs):
model_kwargs["all_input_ids"] = paddle.concat([model_kwargs["all_input_ids"], next_tokens], axis=1)

from paddlenlp_ops import save_with_output

save_with_output(
next_tokens,
batch_idx,
@@ -629,6 +633,7 @@ def _post_process_(
):
step_idx = model_kwargs["step_idx"]
from paddlenlp_ops import set_value_by_flags_and_idx_v2

set_value_by_flags_and_idx_v2(
model_kwargs["pre_ids"],
model_kwargs["input_ids"],
Expand All @@ -643,6 +648,7 @@ def _post_process_(

# pre-process distribution
from paddlenlp_ops import get_token_penalty_multi_scores_v2

logits = get_token_penalty_multi_scores_v2(
model_kwargs["pre_ids"],
logits,
@@ -669,12 +675,14 @@ def _post_process_(
length_cond = paddle.greater_equal(step_idx, model_kwargs["max_dec_len"])
stop_flags = paddle.logical_or(model_kwargs["stop_flags"], length_cond)
from paddlenlp_ops import set_stop_value_multi_ends_v2

set_stop_value_multi_ends_v2(
next_tokens, stop_flags, model_kwargs["seq_lens_this_time"], eos_token_id, model_kwargs["next_tokens"]
) # multi ends
paddle.assign(stop_flags, model_kwargs["stop_flags"])
# update inputs
from paddlenlp_ops import update_inputs

update_inputs(
stop_flags,
model_kwargs["not_need_stop"],
Expand All @@ -687,6 +695,7 @@ def _post_process_(
model_kwargs["is_block_step"],
)
from paddlenlp_ops import save_output

save_output(next_tokens, model_kwargs["not_need_stop"], self.config.tensor_parallel_rank)
return next_tokens

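Most generation_utils.py hunks again insert a blank line after a deferred `from paddlenlp_ops import ...`; the remaining change is the `# sample` comment ahead of `probs = F.softmax(logits)`. As context for that step, the sketch below shows generic temperature scaling followed by top-p (nucleus) sampling over a `[batch_size, vocab_size]` logits tensor; it is a reference implementation, not the fused sampling path PaddleNLP actually calls.

import paddle
import paddle.nn.functional as F

def sample_next_tokens(logits, top_p=0.7, temperature=0.95):
    # Temperature scaling flattens (or sharpens) the distribution before sampling.
    logits = logits / temperature
    probs = F.softmax(logits, axis=-1)

    # Keep the smallest set of highest-probability tokens whose mass reaches top_p.
    sorted_probs = paddle.sort(probs, axis=-1, descending=True)
    sorted_idx = paddle.argsort(probs, axis=-1, descending=True)
    cumulative = paddle.cumsum(sorted_probs, axis=-1)
    keep = cumulative - sorted_probs < top_p  # the top-1 token is always kept
    filtered = paddle.where(keep, sorted_probs, paddle.zeros_like(sorted_probs))
    filtered = filtered / filtered.sum(axis=-1, keepdim=True)

    # Draw one token per sequence from the truncated distribution.
    sampled = paddle.multinomial(filtered, num_samples=1)
    return paddle.take_along_axis(sorted_idx, sampled, axis=-1)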
1 change: 1 addition & 0 deletions paddlenlp/experimental/transformers/gpt/modeling.py
@@ -203,6 +203,7 @@ def remove_padding(self, input_ids, seq_lens_this_time):
cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time)
token_num = paddle.sum(seq_lens_this_time)
from paddlenlp_ops import get_padding_offset

ids_remove_padding, cum_offsets, padding_offset = get_padding_offset(
input_ids, cum_offsets_now, token_num, seq_lens_this_time
)
3 changes: 3 additions & 0 deletions paddlenlp/experimental/transformers/llama/modeling.py
@@ -346,6 +346,7 @@ def remove_padding(self, input_ids, seq_lens_this_time):
cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time)
token_num = paddle.sum(seq_lens_this_time)
from paddlenlp_ops import get_padding_offset

ids_remove_padding, cum_offsets, padding_offset = get_padding_offset(
input_ids, cum_offsets_now, token_num, seq_lens_this_time
)
@@ -433,6 +434,7 @@ def forward(
if not is_decoder and pre_caches is not None:
position_offset = 128
from paddlenlp_ops import fused_get_rotary_embedding

new_rope = fused_get_rotary_embedding(
input_ids, position_ids, self.head_dim_shape_tensor, position_offset, theta, True
)
@@ -825,6 +827,7 @@ def remove_padding(self, input_ids, seq_lens_this_time):
cum_offsets_now = paddle.cumsum(self.max_seq_len - seq_lens_this_time)
token_num = paddle.sum(seq_lens_this_time)
from paddlenlp_ops import get_padding_offset_v2

ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k = get_padding_offset_v2(
input_ids, cum_offsets_now, token_num, seq_lens_this_time
)
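The llama/modeling.py edits are the same blank-line fix after each local custom-op import. One detail worth flagging for the block-attention path: `get_padding_offset_v2` also returns `cu_seqlens_q` and `cu_seqlens_k`, the cumulative sequence-length boundaries of the packed token stream. The sketch below shows the presumed meaning of those boundaries; it is illustrative only, not the custom op.

import paddle

def cu_seqlens_from_lengths(seq_lens_this_time):
    # cu_seqlens[i] is the start offset of sequence i in the packed (padding-free)
    # token stream; the last entry equals the total number of valid tokens.
    zero = paddle.zeros([1], dtype=seq_lens_this_time.dtype)
    return paddle.concat([zero, paddle.cumsum(seq_lens_this_time)])

lens = paddle.to_tensor([3, 5, 2], dtype="int32")
print(cu_seqlens_from_lengths(lens))  # [0, 3, 8, 10]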
1 change: 1 addition & 0 deletions paddlenlp/experimental/transformers/opt/modeling.py
@@ -147,6 +147,7 @@ def remove_padding(self, input_ids, seq_lens_this_time):
cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time)
token_num = paddle.sum(seq_lens_this_time)
from paddlenlp_ops import get_padding_offset

ids_remove_padding, cum_offsets, padding_offset = get_padding_offset(
input_ids, cum_offsets_now, token_num, seq_lens_this_time
)
2 changes: 2 additions & 0 deletions paddlenlp/experimental/transformers/qwen/modeling.py
@@ -239,6 +239,7 @@ def remove_padding(self, input_ids, seq_lens_this_time):
cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time)
token_num = paddle.sum(seq_lens_this_time)
from paddlenlp_ops import get_padding_offset

ids_remove_padding, cum_offsets, padding_offset = get_padding_offset(
input_ids, cum_offsets_now, token_num, seq_lens_this_time
)
@@ -325,6 +326,7 @@ def forward(
position_offset = 128

from paddlenlp_ops import fused_get_rotary_embedding

new_rope = fused_get_rotary_embedding(
input_ids, position_ids, self.head_dim_shape_tensor, position_offset, theta, True
)
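qwen/modeling.py receives the same blank-line fix next to its `fused_get_rotary_embedding` call. For orientation, the sketch below builds a plain-paddle rotary-embedding table of the kind such a call is assumed to produce (standard RoPE with base `theta`, shifted by `position_offset` when prefix caches are present); the fused op's exact output layout may differ.

import paddle

def rotary_embedding_table(position_ids, head_dim, theta=10000.0, position_offset=0):
    # position_ids: [batch, seq_len] integer positions; head_dim: per-head hidden size.
    exponents = paddle.arange(0, head_dim, 2, dtype="float32") / head_dim
    inv_freq = paddle.pow(paddle.to_tensor(theta, dtype="float32"), -exponents)
    positions = position_ids.astype("float32") + position_offset
    freqs = positions.unsqueeze(-1) * inv_freq  # [batch, seq_len, head_dim // 2]
    return paddle.cos(freqs), paddle.sin(freqs)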
