[New Features] add ranks testing for test_predictor #7800

Merged · 3 commits · Jan 18, 2024
Changes from 2 commits
1 change: 1 addition & 0 deletions llm/utils.py
@@ -467,6 +467,7 @@ def dybatch_preprocess(
            max_length=src_length,
            return_attention_mask=False,
            return_token_type_ids=False,
            add_special_tokens=tokenizer.chat_template is None or isinstance(tokenizer, ChatGLMv2Tokenizer),
        )
        input_ids.append(tokens["input_ids"][0])

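A note on the one-line change above: when a tokenizer carries a chat template, the template is expected to insert special tokens itself, so dybatch_preprocess should not add them a second time; ChatGLMv2 is the exception whose tokenizer always adds its own special tokens. A minimal sketch of the rule, separate from the diff (the standalone helper name is hypothetical):

# Hedged sketch of the add_special_tokens rule above; the helper name is
# hypothetical, ChatGLMv2Tokenizer is the class referenced in llm/utils.py.
from paddlenlp.transformers import ChatGLMv2Tokenizer

def should_add_special_tokens(tokenizer) -> bool:
    # No chat template -> the tokenizer must insert BOS/EOS-style tokens itself.
    # ChatGLMv2 keeps adding its own special tokens even when a template is set.
    return tokenizer.chat_template is None or isinstance(tokenizer, ChatGLMv2Tokenizer)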
2 changes: 2 additions & 0 deletions tests/fixtures/llm/pretrain.yaml
@@ -33,6 +33,7 @@ inference-predict:
    batch_size: 2
    decode_strategy: greedy_search
    dtype: float16
    chat_template: none

inference-to-static:
  default:
@@ -45,3 +46,4 @@ inference-infer:
    batch_size: 2
    decode_strategy: greedy_search
    max_length: 20
    chat_template: none
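For context on chat_template: none, the fixture value flows into the predictor's arguments, where the string none asks the predictor not to apply any chat template. A hedged sketch of reading the updated fixture with plain PyYAML (the default nesting follows the sections shown above; how the sentinel is interpreted lives in the predictor, not here):

# Hedged sketch: reading the fixture updated above.
import yaml

with open("tests/fixtures/llm/pretrain.yaml") as f:
    fixture = yaml.safe_load(f)

# the "default" nesting mirrors the inference-to-static section shown above
predict_config = fixture["inference-predict"]["default"]
assert predict_config["chat_template"] == "none"  # sentinel: do not apply a chat template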
39 changes: 39 additions & 0 deletions tests/llm/test_predictor.py
@@ -32,6 +32,7 @@
    get_path_from_url_with_filelock,
    url_file_exists,
)
from tests.testing_utils import GPUsTesting, require_gpu

from .testing_utils import LLMTest, argv_context_guard, load_test_config

@@ -205,3 +206,41 @@ def test_create_predictor_with_unexpected_length(self):

        with argv_context_guard(config):
            predict()


@parameterized_class(
    ["model_name_or_path", "model_class"],
    [
        ["__internal_testing__/tiny-random-llama", LlamaForCausalLM],
    ],
)
class GPUsPredictorTest(LLMTest, GPUsTesting, unittest.TestCase):
    config_path: str = "./tests/fixtures/llm/predictor.yaml"
    model_name_or_path: str = None
    model_class = None

    def setUp(self) -> None:
        super().setUp()
        self.model_class.from_pretrained(self.model_name_or_path, dtype="float16").save_pretrained(self.output_dir)
        AutoTokenizer.from_pretrained(self.model_name_or_path).save_pretrained(self.output_dir)

    @require_gpu(2)
    def test_predictor(self):
        self.init_dist_env()

        self.run_predictor({"inference_model": True})
        result_0 = self._read_result(os.path.join(self.output_dir, "predict.json"))
        self.run_predictor({"inference_model": False})
        result_1 = self._read_result(os.path.join(self.output_dir, "predict.json"))

        # compare the generation results of the inference (static graph) and dygraph runs
        assert len(result_0) == len(result_1)

        # count tracks half-prefix agreement; full_match tracks agreement over the whole common length
        count, full_match = 0, 0
        for inference_item, no_inference_item in zip(result_0, result_1):
            min_length = min(len(inference_item), len(no_inference_item))
            count += int(inference_item[: min_length // 2] == no_inference_item[: min_length // 2])
            full_match += int(inference_item[:min_length] == no_inference_item[:min_length])

        self.assertGreaterEqual(full_match / len(result_0), 0.25)
        self.assertGreaterEqual(count / len(result_0), 0.4)
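The two assertions check prefix agreement between the static-graph and dygraph runs instead of exact equality, since numerically close logits can make greedy decoding diverge late in a sequence. A toy illustration of the metric on a single pair of outputs:

# Toy illustration of the match metric used in test_predictor above.
inference_item, no_inference_item = "the cat sat on the mat", "the cat sat on the rug"
min_length = min(len(inference_item), len(no_inference_item))  # 22

half_match = inference_item[: min_length // 2] == no_inference_item[: min_length // 2]
full_match = inference_item[:min_length] == no_inference_item[:min_length]
print(half_match, full_match)  # True False: the first halves agree, the tails differ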
34 changes: 34 additions & 0 deletions tests/testing_utils.py
@@ -26,6 +26,7 @@

import numpy as np
import paddle
import paddle.distributed.fleet as fleet
import yaml

from paddlenlp.trainer.argparser import strtobool

@@ -470,3 +471,36 @@ def require_paddle_up_to_2_gpus(test_case):
    import paddle

    return unittest.skipUnless(paddle.device.cuda.device_count() < 3, "test requires 0 or 1 or 2 GPUs")(test_case)


def require_gpu(min_gpus: int = 1):
    """Skip the decorated test unless at least `min_gpus` CUDA devices are visible."""

    def actual_decorator(func):
        # evaluated once, when the decorator is applied at class-definition time
        gpu_count = paddle.device.cuda.device_count()

        if gpu_count < min_gpus:
            return unittest.skip(f"test requires {min_gpus} GPUs")(func)

        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            return result

        return wrapper

    return actual_decorator
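Because gpu_count is read when the decorator is applied, a test demanding more devices than the machine exposes is marked skipped up front rather than failing at run time. A usage sketch (the class and test names are hypothetical placeholders):

# Hedged usage sketch for require_gpu.
import unittest

class TwoGPUSmokeTest(unittest.TestCase):
    @require_gpu(2)
    def test_needs_two_devices(self):
        # runs only when paddle.device.cuda.device_count() >= 2
        ...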


class GPUsTesting(unittest.TestCase):
    def init_dist_env(self, config: dict = None):
        # default layout: pure tensor (model) parallelism across every visible GPU
        world_size = paddle.distributed.get_world_size()
        strategy = fleet.DistributedStrategy()
        hybrid_configs = {
            "dp_degree": 1,
            "mp_degree": world_size,
            "pp_degree": 1,
            "sharding_degree": 1,
        }
        hybrid_configs.update(config or {})
        strategy.hybrid_configs = hybrid_configs

        fleet.init(is_collective=True, strategy=strategy)
        # force creation of the hybrid communicate group so later collectives can use it
        fleet.get_hybrid_communicate_group()
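By default init_dist_env dedicates every visible GPU to tensor (model) parallelism; the config argument overrides individual degrees, with the constraint that the four degrees multiply to the world size. A hedged sketch of a 4-GPU override (class and test names are hypothetical, and the script must run under a distributed launcher such as python -m paddle.distributed.launch --devices 0,1,2,3):

# Hedged sketch: splitting 4 GPUs into 2-way data x 2-way tensor parallelism.
# dp_degree * mp_degree * pp_degree * sharding_degree must equal world_size.
class FourGPUPredictorTest(GPUsTesting):
    @require_gpu(4)
    def test_predictor_dp2_mp2(self):
        self.init_dist_env({"dp_degree": 2, "mp_degree": 2})
        # ... run the predictor as GPUsPredictorTest.test_predictor does above ...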