[Inference] Fix docs and support llama3.2 (#9613)
* fix docs and use fast tokenizer default
* support llama3.2
yuanlehome authored Dec 12, 2024
1 parent 4f57179 commit f1126c1
Showing 7 changed files with 20 additions and 12 deletions.
2 changes: 1 addition & 1 deletion csrc/gpu/append_attn/utils.cuh
@@ -278,7 +278,7 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
        break;                                       \
      }                                              \
      default: {                                     \
-       PD_THROW("not support the head_dim");        \
+       PD_THROW("not support the head_dim: ", head_dim); \
      }                                              \
    }
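The change above is purely diagnostic: when the append-attention dispatch macro meets a head dimension it has no kernel specialization for, the thrown error now reports the offending value. A rough Python analogue of that dispatch-and-fail pattern (the supported set below is hypothetical, for illustration only):

```python
SUPPORTED_HEAD_DIMS = {64, 128}  # hypothetical set; the real one lives in the CUDA dispatch macro


def dispatch_append_attn(head_dim: int) -> str:
    """Select a kernel specialization by head_dim, failing loudly otherwise."""
    if head_dim in SUPPORTED_HEAD_DIMS:
        return f"append_attn_kernel<HeadDim={head_dim}>"  # stand-in for the templated kernel launch
    # Mirrors the PD_THROW change: report which head_dim was unsupported.
    raise ValueError(f"not support the head_dim: {head_dim}")
```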

5 changes: 2 additions & 3 deletions llm/docs/predict/inference.md
@@ -27,9 +27,8 @@ PaddleNLP LLM inference provides an end-to-end experience covering compression, inference, and serving:
High-performance inference implementations have been added to PaddleNLP; the verified models are as follows:
| Models | Example Models |
|--------|----------------|
-|Llama 3.1, Llama 3, Llama 2|`meta-llama/Meta-Llama-3.1-8B`, `meta-llama/Meta-Llama-3.1-8B-Instruct`, `meta-llama/Meta-Llama-3.1-405B`, `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3-8B`, `meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-Guard-3-8B`, `meta-llama/Llama-2-7b`, `meta-llama/Llama-2-7b-chat`, `meta-llama/Llama-2-13b`, `meta-llama/Llama-2-13b-chat`, `meta-llama/Llama-2-70b`, `meta-llama/Llama-2-70b-chat`|
-|Qwen 2|`Qwen/Qwen2-0.5B`, `Qwen/Qwen2-0.5B-Instruct`, `Qwen/Qwen2-1.5B`, `Qwen/Qwen2-1.5B-Instruct`, `Qwen/Qwen2-7B`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-72B`, `Qwen/Qwen2-72B-Instruct`, `Qwen/Qwen2-57B-A14B`, `Qwen/Qwen2-57B-A14B-Instruct`, `Qwen/Qwen2-Math-1.5B-Instruct`|
-|Qwen 2.5|`Qwen/Qwen2.5-7B-Instruct`, `Qwen/Qwen2.5-14B-Instruct`, `Qwen/Qwen2.5-Math-1.5B-Instruct`, `Qwen/Qwen2.5-Coder-1.5B-Instruct`|
+|Llama 3.x, Llama 2|`meta-llama/Llama-3.2-3B-Instruct`, `meta-llama/Meta-Llama-3.1-8B`, `meta-llama/Meta-Llama-3.1-8B-Instruct`, `meta-llama/Meta-Llama-3.1-405B`, `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3-8B`, `meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-Guard-3-8B`, `meta-llama/Llama-2-7b`, `meta-llama/Llama-2-7b-chat`, `meta-llama/Llama-2-13b`, `meta-llama/Llama-2-13b-chat`, `meta-llama/Llama-2-70b`, `meta-llama/Llama-2-70b-chat`|
+|Qwen 2.x|`Qwen/Qwen2-1.5B`, `Qwen/Qwen2-1.5B-Instruct`, `Qwen/Qwen2-7B`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-72B`, `Qwen/Qwen2-72B-Instruct`, `Qwen/Qwen2-57B-A14B`, `Qwen/Qwen2-57B-A14B-Instruct`, `Qwen/Qwen2-Math-1.5B-Instruct`, `Qwen/Qwen2.5-7B-Instruct`, `Qwen/Qwen2.5-14B-Instruct`, `Qwen/Qwen2.5-Math-1.5B-Instruct`, `Qwen/Qwen2.5-Coder-1.5B-Instruct`, `Qwen/Qwen2.5-32B-Instruct`, `Qwen/Qwen2.5-72B-Instruct`|
|Qwen-MoE| `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, `Qwen/Qwen2-57B-A14B`, `Qwen/Qwen2-57B-A14B-Instruct`|
|Mixtral| `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistralai/Mixtral-8x22B-Instruct-v0.1`|
|ChatGLM 3, ChatGLM 2| `THUDM/chatglm3-6b`, `THUDM/chatglm2-6b`|
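For the newly listed `meta-llama/Llama-3.2-3B-Instruct`, a minimal dynamic-graph smoke test might look like the sketch below. This is an assumption-laden illustration using the standard PaddleNLP Auto classes; the high-performance path this table documents is driven through `llm/predict/predictor.py` instead, and the exact `generate()` signature and return layout may differ across PaddleNLP versions.

```python
# A minimal sketch, assuming the standard PaddleNLP Auto classes.
from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-3.2-3B-Instruct"  # newly verified in this commit
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="float16")

inputs = tokenizer("Hello, how are you?", return_tensors="pd")
output_ids, _ = model.generate(**inputs, max_new_tokens=32)  # (ids, scores) tuple assumed
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
```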
1 change: 1 addition & 0 deletions llm/docs/predict/llama.md
@@ -24,6 +24,7 @@
|meta-llama/Meta-Llama-3.1-8B-Instruct|
|meta-llama/Meta-Llama-3.1-70B-Instruct|
|meta-llama/Meta-Llama-3.1-405B-Instruct|
+|meta-llama/Llama-3.2-3B-Instruct|

## Verified pre-quantized models

3 changes: 0 additions & 3 deletions llm/docs/predict/mixtral.md
@@ -45,8 +45,6 @@ python -m paddle.distributed.launch \
    --append_attn

# Static graph inference
-# The following environment variable must be set; otherwise multi-GPU inference will hang
-export FLAGS_dynamic_static_unified_comm=false
export DEVICES=0,1
python -m paddle.distributed.launch \
    --gpus ${DEVICES} \
@@ -86,7 +84,6 @@ python -m paddle.distributed.launch \
    --append_attn

# Static graph inference
-export FLAGS_dynamic_static_unified_comm=false
export DEVICES=0,1
python -m paddle.distributed.launch \
    --gpus ${DEVICES} \
5 changes: 5 additions & 0 deletions llm/docs/predict/qwen.md
@@ -21,6 +21,11 @@
|Qwen/Qwen2-7B-Instruct|
|Qwen/Qwen1.5-MoE-A2.7B-Chat|
|Qwen/Qwen2-57B-A14B-Instruct|
+|Qwen/Qwen2.5-1.5B-Instruct|
+|Qwen/Qwen2.5-7B-Instruct|
+|Qwen/Qwen2.5-14B-Instruct|
+|Qwen/Qwen2.5-32B-Instruct|
+|Qwen/Qwen2.5-72B-Instruct|

## Verified pre-quantized models

4 changes: 1 addition & 3 deletions llm/predict/predictor.py
@@ -1296,9 +1296,7 @@ def create_predictor(
    predictor_args: PredictorArgument,
    model_args: ModelArgument,
):
-    tokenizer = AutoTokenizer.from_pretrained(
-        predictor_args.model_name_or_path,
-    )
+    tokenizer = AutoTokenizer.from_pretrained(predictor_args.model_name_or_path)
    # init chat_template for tokenizer
    llm_utils.init_chat_template(tokenizer, predictor_args.model_name_or_path, predictor_args.chat_template)
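Per the commit message, the tokenizer here is now loaded with the fast implementation by default. Assuming `AutoTokenizer.from_pretrained` exposes a `use_fast` keyword (an assumption; check the signature in your PaddleNLP version), callers that depend on the slow tokenizer would opt out explicitly:

```python
from paddlenlp.transformers import AutoTokenizer

# Default path after this change: fast tokenizer (per the commit message).
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

# Hypothetical opt-out, assuming a use_fast keyword is supported:
slow_tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    use_fast=False,
)
```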

12 changes: 10 additions & 2 deletions paddlenlp/experimental/transformers/llama/modeling.py
@@ -1553,7 +1553,11 @@ class LlamaForCausalLMInferenceModel(GenerationInferenceModel, LlamaPretrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.llama = LlamaInferenceModel(config)
-        self.lm_head = LlamaLMHead(config)
+        if config.tie_word_embeddings:
+            self.lm_head = LlamaLMHead(config, embedding_weights=self.llama.embed_tokens.weight, transpose_y=True)
+            self.tie_weights()
+        else:
+            self.lm_head = LlamaLMHead(config)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
@@ -1731,7 +1735,11 @@ def __init__(self, config):
        self.max_seq_len = config.max_seq_len

        self.llama = LlamaBlockInferenceModel(config)
-        self.lm_head = LlamaLMHead(config)
+        if config.tie_word_embeddings:
+            self.lm_head = LlamaLMHead(config, embedding_weights=self.llama.embed_tokens.weight, transpose_y=True)
+            self.tie_weights()
+        else:
+            self.lm_head = LlamaLMHead(config)

    @classmethod
    def _get_tensor_parallel_mappings(cls, config: LlamaConfig, is_split=True):
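Both hunks make the same change: Llama 3.2 checkpoints tie the input embedding to the output projection, so when `config.tie_word_embeddings` is set, the LM head reuses the embedding matrix and computes logits as `hidden_states @ W_embed^T` instead of allocating a separate weight. A minimal sketch of that tied head, using real paddle ops with illustrative Llama-3.2-3B-like shapes:

```python
import paddle

vocab_size, hidden_size = 128256, 3072  # illustrative Llama-3.2-3B-like shapes
embed_tokens = paddle.nn.Embedding(vocab_size, hidden_size)


def tied_lm_head(hidden_states: paddle.Tensor) -> paddle.Tensor:
    # Equivalent in spirit to LlamaLMHead(config, embedding_weights=embed_tokens.weight,
    # transpose_y=True): logits = hidden_states @ W_embed^T, with no extra parameters.
    return paddle.matmul(hidden_states, embed_tokens.weight, transpose_y=True)


hidden = paddle.randn([1, 8, hidden_size])
logits = tied_lm_head(hidden)
print(logits.shape)  # [1, 8, 128256]
```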
