From 10d3e95a896dcd99a857f6afc082a6646566771b Mon Sep 17 00:00:00 2001
From: yuanlehome
Date: Wed, 28 Aug 2024 06:24:51 +0000
Subject: [PATCH] fix llama3 static run

Ask the tokenizer for position ids during preprocessing for all
tokenizers except ChatGLMTokenizer, and, for bfloat16 runs, delete
`gpu_cpu_map_matmul_v2_to_matmul_pass` so that matmul_v2 ops are not
mapped to the legacy matmul op.

---
 llm/predict/predictor.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llm/predict/predictor.py b/llm/predict/predictor.py
index 8c72991998a6..2273c1086a5a 100644
--- a/llm/predict/predictor.py
+++ b/llm/predict/predictor.py
@@ -179,6 +179,7 @@ def _preprocess(self, source):
             source,
             max_length=self.config.src_length,
             truncation=True,
+            return_position_ids=not isinstance(self.tokenizer, ChatGLMTokenizer),
             truncation_side="left",
             return_tensors=self.return_tensors,
             padding=True,
@@ -305,6 +306,9 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None
             inference_config.disable_gpu()
             inference_config.disable_glog_info()
             inference_config.enable_new_executor()
+            # Remove `gpu_cpu_map_matmul_v2_to_matmul_pass` to avoid mapping matmul_v2 -> matmul op.
+            if config.dtype == "bfloat16":
+                inference_config.delete_pass("gpu_cpu_map_matmul_v2_to_matmul_pass")
             if in_pir_executor_mode():
                 inference_config.enable_new_ir()
             if in_cinn_mode():
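
For readers trying the first hunk in isolation, here is a minimal sketch of
what `return_position_ids` does on a PaddleNLP tokenizer. The checkpoint name
and the prompt are illustrative assumptions, not values from predictor.py:

    # Sketch only: the checkpoint name below is an assumption for illustration.
    from paddlenlp.transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
    features = tokenizer(
        "an example prompt",
        max_length=64,
        truncation=True,
        truncation_side="left",
        return_position_ids=True,  # the flag the patch now sets for non-ChatGLM tokenizers
        padding=True,
    )
    # With the flag set, "position_ids" is returned alongside "input_ids",
    # which is presumably what the Llama3 static run was missing.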
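
Similarly, a minimal standalone sketch of the second hunk's pass deletion
against the Paddle Inference API; the model paths and the `dtype` value are
assumptions for the example, not code from predictor.py:

    import paddle.inference as paddle_infer

    dtype = "bfloat16"  # stands in for config.dtype in the patch
    # Hypothetical model files; the real paths come from the predictor config.
    config = paddle_infer.Config("llama3.pdmodel", "llama3.pdiparams")
    config.disable_glog_info()
    if dtype == "bfloat16":
        # Keep matmul_v2 ops intact instead of letting this IR pass
        # rewrite them to the legacy matmul op.
        config.delete_pass("gpu_cpu_map_matmul_v2_to_matmul_pass")
    predictor = paddle_infer.create_predictor(config)

Deleting the pass only on the bfloat16 path leaves the default optimization
pipeline untouched for all other dtypes.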