fix llama3 static run
yuanlehome committed Aug 28, 2024
1 parent 34a71c8 commit 10d3e95
Showing 1 changed file with 4 additions and 0 deletions.
llm/predict/predictor.py: 4 additions & 0 deletions
@@ -179,6 +179,7 @@ def _preprocess(self, source):
             source,
             max_length=self.config.src_length,
             truncation=True,
+            return_position_ids=True if not isinstance(self.tokenizer, ChatGLMTokenizer) else False,
             truncation_side="left",
             return_tensors=self.return_tensors,
             padding=True,
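For context, here is a minimal sketch of what the added argument does, outside the commit itself. The model id and prompt are placeholders, and the claim that the static-graph LLaMA 3 export consumes explicit position ids is inferred from the commit message, not stated in the diff:

# Sketch only: ask a PaddleNLP tokenizer to return explicit position ids.
from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")  # placeholder model id
features = tokenizer(
    ["Hello, how are you?"],          # placeholder prompt
    max_length=1024,
    truncation=True,
    truncation_side="left",
    padding=True,
    return_position_ids=True,         # the kwarg this commit enables for non-ChatGLM tokenizers
    return_tensors="pd",
)
# The batch now carries "position_ids" alongside "input_ids", which the
# static-graph LLaMA 3 predictor presumably expects as an explicit input.
print(list(features.keys()))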
@@ -305,6 +306,9 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = N
         inference_config.disable_gpu()
         inference_config.disable_glog_info()
         inference_config.enable_new_executor()
+        # remove `gpu_cpu_map_matmul_v2_to_matmul_pass` to avoid mapping matmul_v2 -> matmul op
+        if config.dtype == "bfloat16":
+            inference_config.delete_pass("gpu_cpu_map_matmul_v2_to_matmul_pass")
         if in_pir_executor_mode():
             inference_config.enable_new_ir()
         if in_cinn_mode():
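And a minimal sketch of the second change, showing how a named IR pass is dropped from a Paddle inference Config before the predictor is created. The model and params paths, and the standalone `dtype` variable, are placeholders for what the predictor reads from `PredictorArgument`:

import paddle.inference as paddle_infer

# Placeholder paths to an exported static-graph model.
config = paddle_infer.Config("llama3_static/model.pdmodel", "llama3_static/model.pdiparams")
config.disable_glog_info()

dtype = "bfloat16"  # stand-in for PredictorArgument.dtype
if dtype == "bfloat16":
    # Keep matmul_v2 ops as-is instead of letting this pass rewrite them to matmul.
    config.delete_pass("gpu_cpu_map_matmul_v2_to_matmul_pass")

predictor = paddle_infer.create_predictor(config)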
