diff --git a/tests/test_tipc/llm/llama2/N1C8/llama2-13b_dpo_bs16_bf16_tp4_sd2_acc4_dygraph.sh b/tests/test_tipc/llm/llama2/N1C8/llama2-13b_dpo_bs16_bf16_tp4_sd2_acc4_dygraph.sh
new file mode 100644
index 000000000000..c56ca59bbb9f
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/N1C8/llama2-13b_dpo_bs16_bf16_tp4_sd2_acc4_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=meta-llama/Llama-2-13b "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=4 "
+param+="sharding_parallel_degree=2 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=4 "
+param+="run_stage=dpo "
+param+="run_mode=tp4_sd2_acc4_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=llama2-13b_dpo "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/llama2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/llama2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/llama2/N1C8/llama2-13b_lora_bs16_bf16_tp1_sd8_acc1_dygraph.sh b/tests/test_tipc/llm/llama2/N1C8/llama2-13b_lora_bs16_bf16_tp1_sd8_acc1_dygraph.sh
new file mode 100644
index 000000000000..eea7407dc63e
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/N1C8/llama2-13b_lora_bs16_bf16_tp1_sd8_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
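+# Each key=value pair collected in ${param} below is passed as an environment
+# variable to run_benchmark.sh (see the bash -c invocation at the bottom),
+# overriding the ${var:-default} fallbacks in its _set_params function.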
+
+
+param="model_name_or_path=meta-llama/Llama-2-13b "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="sharding_parallel_degree=8 "
+param+="sharding=stage1 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=lora "
+param+="run_mode=tp1_sd8_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=llama2-13b_lora "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/llama2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/llama2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/llama2/N1C8/llama2-13b_sft_bs16_bf16_tp2_sd4_acc2_dygraph.sh b/tests/test_tipc/llm/llama2/N1C8/llama2-13b_sft_bs16_bf16_tp2_sd4_acc2_dygraph.sh
new file mode 100644
index 000000000000..78a0e1a1d7e9
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/N1C8/llama2-13b_sft_bs16_bf16_tp2_sd4_acc2_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=meta-llama/Llama-2-13b "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=2 "
+param+="sharding_parallel_degree=4 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=2 "
+param+="run_stage=sft "
+param+="run_mode=tp2_sd4_acc2_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=llama2-13b_sft "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/llama2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/llama2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/llama2/N1C8/llama2-7b_dpo_bs16_bf16_tp4_sd2_acc1_dygraph.sh b/tests/test_tipc/llm/llama2/N1C8/llama2-7b_dpo_bs16_bf16_tp4_sd2_acc1_dygraph.sh
new file mode 100644
index 000000000000..86906fc58af7
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/N1C8/llama2-7b_dpo_bs16_bf16_tp4_sd2_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=meta-llama/Llama-2-7b "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=4 "
+param+="sharding_parallel_degree=2 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=dpo "
+param+="run_mode=tp4_sd2_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=llama2-7b_dpo "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/llama2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/llama2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/llama2/N1C8/llama2-7b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh b/tests/test_tipc/llm/llama2/N1C8/llama2-7b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh
new file mode 100644
index 000000000000..0332b92f84e3
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/N1C8/llama2-7b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh
@@ -0,0 +1,33 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=meta-llama/Llama-2-7b "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="pipeline_parallel_degree=1 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=lora "
+param+="run_mode=tp1_pp1_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=llama2-7b_lora "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/llama2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/llama2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/llama2/N1C8/llama2-7b_sft_bs16_bf16_tp2_sd4_acc1_dygraph.sh b/tests/test_tipc/llm/llama2/N1C8/llama2-7b_sft_bs16_bf16_tp2_sd4_acc1_dygraph.sh
new file mode 100644
index 000000000000..3174fdc2797e
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/N1C8/llama2-7b_sft_bs16_bf16_tp2_sd4_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=meta-llama/Llama-2-7b "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=2 "
+param+="sharding_parallel_degree=4 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=sft "
+param+="run_mode=tp2_sd4_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=llama2-7b_sft "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/llama2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/llama2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/dpo.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/dpo.json
new file mode 100644
index 000000000000..d2e3e07ac54d
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/dpo.json
@@ -0,0 +1,36 @@
+{
+    "model_name_or_path": "meta-llama/Llama-2-13b",
+    "train_dataset_path": "./data/dpo_benchmark_train/train.json",
+    "dev_dataset_path": "./data/dpo_benchmark_train/dev.json",
+    "output_dir": "./checkpoints/dpo_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 4,
+    "per_device_eval_batch_size": 1,
+    "num_train_epochs": 1,
+    "learning_rate": 1e-06,
+    "warmup_steps": 10,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "max_seq_len": 8192,
+    "max_prompt_len": 4096,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "tensor_parallel_degree": 4,
+    "sharding_parallel_degree": 2,
+    "sharding": "stage2",
+    "use_flash_attention": true,
+    "flash_mask": true,
+    "recompute": true,
+    "recompute_granularity": "full",
+    "benchmark": true,
+    "unified_checkpoint": true,
+    "autotuner_benchmark": false,
+    "beta": 0.1,
+    "loss_type": "sigmoid",
+    "label_smoothing": 0.0
+}
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/lora.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/lora.json
new file mode 100644
index 000000000000..170c59ab42b4
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/lora.json
@@ -0,0 +1,39 @@
+{
+    "model_name_or_path": "meta-llama/Llama-2-13b",
+    "dataset_name_or_path": "./data/sft_benchmark_train/",
+    "output_dir": "./checkpoints/lora_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 8,
+    "eval_accumulation_steps": 16,
+    "num_train_epochs": 1,
+    "learning_rate": 3e-04,
+    "warmup_steps": 30,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "src_length": 4096,
+    "max_length": 8192,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "eval_with_do_generation": false,
+    "metric_for_best_model": "accuracy",
+    "recompute": true,
+    "recompute_granularity": "full",
+    "save_total_limit": 1,
+    "tensor_parallel_degree": 1,
+    "sharding_parallel_degree": 8,
+    "sharding": "stage1",
+    "pipeline_parallel_degree": 1,
+    "lora": true,
+    "unified_checkpoint": true,
+    "benchmark": true,
+    "zero_padding": true,
+    "flash_mask": true,
+    "use_flash_attention": true,
+    "pissa": false
+}
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/sft.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/sft.json
new file mode 100644
index 000000000000..fd538fff6c73
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/sft.json
@@ -0,0 +1,37 @@
+{
+    "model_name_or_path": "meta-llama/Llama-2-13b",
+    "dataset_name_or_path": "./data/sft_benchmark_train/",
+    "output_dir": "./checkpoints/sft_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 2,
+    "per_device_eval_batch_size": 8,
+    "eval_accumulation_steps": 16,
+    "num_train_epochs": 1,
+    "learning_rate": 3e-05,
+    "warmup_steps": 30,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "src_length": 4096,
+    "max_length": 8192,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "eval_with_do_generation": false,
+    "metric_for_best_model": "accuracy",
+    "recompute": true,
+    "recompute_granularity": "full",
+    "save_total_limit": 1,
+    "benchmark": true,
+    "tensor_parallel_degree": 2,
+    "sharding_parallel_degree": 4,
+    "pipeline_parallel_degree": 1,
+    "sharding": "stage2",
+    "zero_padding": true,
+    "flash_mask": true,
+    "unified_checkpoint": true,
+    "use_flash_attention": true
+}
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/dpo.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/dpo.json
new file mode 100644
index 000000000000..cb8d0afdd74a
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/dpo.json
@@ -0,0 +1,36 @@
+{
+    "model_name_or_path": "meta-llama/Llama-2-7b",
+    "train_dataset_path": "./data/dpo_benchmark_train/train.json",
+    "dev_dataset_path": "./data/dpo_benchmark_train/dev.json",
+    "output_dir": "./checkpoints/dpo_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 1,
+    "num_train_epochs": 1,
+    "learning_rate": 1e-06,
+    "warmup_steps": 10,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "max_seq_len": 8192,
+    "max_prompt_len": 4096,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "tensor_parallel_degree": 4,
+    "sharding_parallel_degree": 2,
+    "sharding": "stage2",
+    "use_flash_attention": true,
+    "flash_mask": true,
+    "recompute": true,
+    "recompute_granularity": "full",
+    "benchmark": true,
+    "unified_checkpoint": true,
+    "autotuner_benchmark": false,
+    "beta": 0.1,
+    "loss_type": "sigmoid",
+    "label_smoothing": 0.0
+}
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/lora.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/lora.json
new file mode 100644
index 000000000000..17fa0b8a8474
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/lora.json
@@ -0,0 +1,37 @@
+{
+    "model_name_or_path": "meta-llama/Llama-2-7b",
+    "dataset_name_or_path": "./data/sft_benchmark_train/",
+    "output_dir": "./checkpoints/lora_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 8,
+    "eval_accumulation_steps": 16,
+    "num_train_epochs": 1,
+    "learning_rate": 3e-04,
+    "warmup_steps": 30,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "src_length": 4096,
+    "max_length": 8192,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+ "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "recompute_granularity": "full", + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "lora": true, + "unified_checkpoint": true, + "benchmark": true, + "zero_padding": true, + "flash_mask": true, + "use_flash_attention": true, + "pissa": false + } diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/sft.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/sft.json new file mode 100644 index 000000000000..cd496681ecb6 --- /dev/null +++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/sft.json @@ -0,0 +1,37 @@ +{ + "model_name_or_path": "meta-llama/Llama-2-7b", + "dataset_name_or_path": "./data/sft_benchmark_train/", + "output_dir": "./checkpoints/sft_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 1, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 1, + "learning_rate": 3e-05, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "no", + "save_strategy": "no", + "src_length": 4096, + "max_length": 8192, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "recompute_granularity": "full", + "save_total_limit": 1, + "benchmark": true, + "tensor_parallel_degree": 2, + "sharding_parallel_degree": 4, + "pipeline_parallel_degree": 1, + "sharding": "stage2", + "zero_padding": true, + "flash_mask": true, + "unified_checkpoint": true, + "use_flash_attention": true + } diff --git a/tests/test_tipc/llm/llama2/benchmark_common/prepare.sh b/tests/test_tipc/llm/llama2/benchmark_common/prepare.sh new file mode 100644 index 000000000000..a1b917731589 --- /dev/null +++ b/tests/test_tipc/llm/llama2/benchmark_common/prepare.sh @@ -0,0 +1,35 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+python -m pip install -r ../requirements.txt
+python -m pip install -r ../requirements-dev.txt
+export PYTHONPATH=../:${PYTHONPATH}
+echo ${PYTHONPATH}
+
+
+# install fused_ln custom ops
+cd ../slm/model_zoo/gpt-3/external_ops/
+python setup.py install
+cd -
+
+# install paddlenlp_ops
+cd ../csrc/
+python setup_cuda.py install
+cd -
+
+cd ../llm
+cp -r ../tests/test_tipc/llm/llama2/benchmark_common/benchmark_json ./
+
+wget https://paddlenlp.bj.bcebos.com/llm_benchmark_data/paddle_data.tar.gz
+tar zxvf paddle_data.tar.gz && rm -rf paddle_data.tar.gz
\ No newline at end of file
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/llm/llama2/benchmark_common/run_benchmark.sh
new file mode 100644
index 000000000000..6f6b27849ea9
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/benchmark_common/run_benchmark.sh
@@ -0,0 +1,143 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Test training benchmark for a model.
+# Usage:bash benchmark/run_benchmark.sh ${model_name_or_path} ${per_device_train_batch_size} ${tensor_parallel_degree} ${pipeline_parallel_degree} ${virtual_pp_degree} ${sequence_parallel} ${sharding_parallel_degree} ${sharding} ${recompute} ${run_mode} ${device_num}
+function _set_params(){
+    model_name_or_path=${model_name_or_path:-"Qwen/Qwen2.5-7B"}
+    per_device_train_batch_size=${per_device_train_batch_size:-1}
+    tensor_parallel_degree=${tensor_parallel_degree:-1}
+    pipeline_parallel_degree=${pipeline_parallel_degree:-1}
+    sharding_parallel_degree=${sharding_parallel_degree:-1}
+    sharding=${sharding:-"stage1"}
+    recompute=${recompute:-1}
+    recompute_granularity=${recompute_granularity:-"full"}
+    gradient_accumulation_steps=${gradient_accumulation_steps:-1}
+    run_stage=${run_stage:-"sft"}
+    run_mode=${run_mode:-"DP1-MP1-PP4-mbs1-acc8-recompute"}
+    device_num=${device_num:-"N1C8"}
+    global_batch_size=${global_batch_size:-16}
+    model_item=${model_item:-"qwen-qwen-7b_seqlen2048_pretrain"}
+    max_steps=${max_steps:-150}
+    logging_steps=${logging_steps:-1}
+    base_batch_size=${global_batch_size}
+
+    profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in as a global variable
+    model_repo="PaddleNLP" # (required) name of the model suite
+    speed_unit="tokens/s" # (required) unit of the speed metric
+    skip_steps=0 # (required) number of leading, performance-unstable steps to skip when parsing logs
+    keyword="Effective_Tokens_per_second_per_gpu:" # (required) keyword that marks the performance line when parsing logs
+    is_large_model=True # (optional) defaults to False for ordinary models; set True for large models that report a single ips value
+    convergence_key="loss:" # (optional) keyword that marks the convergence line when parsing logs, e.g. convergence_key="loss:"
+
+    fp_item="bf16"
+    # The lines below are the common execution commands; no changes needed unless the model is special
+    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (required) do not change this format; it is aligned with competitor naming
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[*]}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
+    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
+    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
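+    # Assumed convention: log files are named <model_repo>_<model_name>_<device_num>
+    # so the benchmark framework's analysis scripts can locate them.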
+    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
+    mkdir -p $(dirname ${train_log_file})
+
+    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
+    mkdir -p $(dirname ${profiling_log_file})
+
+    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
+    mkdir -p $(dirname ${speed_log_file})
+
+    OUTPUT_PATH=${run_log_path}/output
+
+}
+
+function _train(){
+    batch_size=${per_device_train_batch_size} # for multi-GPU single-process runs, compute the multi-GPU batch size here in _train
+
+    if [ -d $OUTPUT_PATH ]; then
+        rm -rf $OUTPUT_PATH
+    fi
+    mkdir $OUTPUT_PATH
+
+    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
+
+    if [ ${profiling} = "true" ];then
+        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
+        log_file=${profiling_log_file}
+    else
+        add_options=""
+        log_file=${train_log_file}
+    fi
+
+    case ${run_stage} in
+    sft) train_cmd="run_finetune.py benchmark_json/${model_item%_*}/sft.json" ;;
+    lora) train_cmd="run_finetune.py benchmark_json/${model_item%_*}/lora.json" ;;
+    dpo) train_cmd="alignment/dpo/run_dpo.py benchmark_json/${model_item%_*}/dpo.json" ;;
+    *) echo "choose run_stage(sft | lora | dpo)"; exit 1;
+    esac
+
+    if [ ${PADDLE_TRAINER_ID} ]
+    then
+        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
+    else
+        PADDLE_RANK_OPTION=""
+    fi
+    # The lines below are the common execution commands; no changes needed unless the model is special
+    case ${device_num} in
+    N1C1) echo "Run with: device_num=${device_num}, run_mode=${run_mode}, run_stage=${run_stage}"
+        train_cmd="python -u -m paddle.distributed.launch --log_dir=./mylog --gpus=0 ${PADDLE_RANK_OPTION}\
+            ${train_cmd}"
+        workerlog_id=0
+        ;;
+    *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}, run_stage=${run_stage}"
+        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
+            ${train_cmd}"
+        workerlog_id=0
+        ;;
+    esac
+    cd ../llm/
+    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
+    python -c "import paddlenlp"
+    if [[ ${model_name_or_path} =~ "CE" ]];then # CE accuracy run: no execution time limit
+        ${train_cmd} > ${log_file} 2>&1
+        train_exit_code=$? # keep the training command's exit status
+    else
+        timeout 30m ${train_cmd} > ${log_file} 2>&1
+        train_exit_code=$? # keep the exit status before the log post-processing below overwrites $?
+        # echo ${train_cmd}
+        Effective_Tokens_per_second=`cat ${log_file} | grep -E 'Effective_Tokens_per_second|Effective tokens per second:' \
+            |awk -F': ' '{print $2}' |awk -F' ' '{print $1}'`
+        num_gpu=$(echo "$device_num" | sed 's/^.*C//')
+        ips=$(awk -v a="$Effective_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}')
+        echo "Effective_Tokens_per_second_per_gpu: ${ips}" >> ${log_file}
+    fi
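+    # The Effective_Tokens_per_second value above is read back from the training
+    # log and divided by the GPU count parsed from device_num (e.g. N1C8 -> 8),
+    # producing the per-GPU keyword line that _set_params declares.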
+    if [ ${train_exit_code} -ne 0 ];then
+        echo -e "${model_name}, FAIL"
+    else
+        echo -e "${model_name}, SUCCESS"
+    fi
+    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${device_num} != "N1C1" -a -d mylog ]; then
+        case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog
+        cp -r ${case_path}/mylog/workerlog.* ./mylog/
+    fi
+}
+
+export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
+
+source ${BENCHMARK_ROOT}/scripts/run_model.sh # run_model.sh parses performance data from benchmark-conformant logs with analysis.py; comment this out to only produce training logs without parsing, but re-enable it before submitting
+_set_params $@
+#_train # uncomment to only produce training logs without parsing
+_run # _run is defined in run_model.sh and calls _train; comment this out to only produce training logs without parsing, but re-enable it before submitting
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_dpo_bs16_bf16_tp4_sd2_acc4_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_dpo_bs16_bf16_tp4_sd2_acc4_dygraph.sh
new file mode 100644
index 000000000000..4eb207e828fd
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_dpo_bs16_bf16_tp4_sd2_acc4_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-14B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=4 "
+param+="sharding_parallel_degree=2 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=4 "
+param+="run_stage=dpo "
+param+="run_mode=tp4_sd2_acc4_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-14b_dpo "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_lora_bs16_bf16_tp1_sd8_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_lora_bs16_bf16_tp1_sd8_acc1_dygraph.sh
new file mode 100644
index 000000000000..d040cecacef0
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_lora_bs16_bf16_tp1_sd8_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
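+# run_mode encodes the parallel layout set below: tp1 = tensor_parallel_degree 1,
+# sd8 = sharding_parallel_degree 8, acc1 = gradient_accumulation_steps 1.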
+
+
+param="model_name_or_path=Qwen/Qwen2.5-14B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="sharding_parallel_degree=8 "
+param+="sharding=stage1 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=lora "
+param+="run_mode=tp1_sd8_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-14b_lora "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_sft_bs16_bf16_tp2_sd4_acc2_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_sft_bs16_bf16_tp2_sd4_acc2_dygraph.sh
new file mode 100644
index 000000000000..ce078b50f1be
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_sft_bs16_bf16_tp2_sd4_acc2_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-14B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=2 "
+param+="sharding_parallel_degree=4 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=2 "
+param+="run_stage=sft "
+param+="run_mode=tp2_sd4_acc2_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-14b_sft "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_dpo_bs16_bf16_tp1_sd1_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_dpo_bs16_bf16_tp1_sd1_acc1_dygraph.sh
new file mode 100644
index 000000000000..191020daebf6
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_dpo_bs16_bf16_tp1_sd1_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
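+# recompute is disabled below, presumably because the 1.5B model's activations
+# fit in device memory without recomputation (the 7B and 14B configs enable it).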
+
+
+param="model_name_or_path=Qwen/Qwen2.5-1.5B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="sharding_parallel_degree=1 "
+param+="sharding=stage2 "
+param+="recompute=false "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=dpo "
+param+="run_mode=tp1_sd1_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-1_5b_dpo "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh
new file mode 100644
index 000000000000..830ec3c4ca09
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh
@@ -0,0 +1,33 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-1.5B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="pipeline_parallel_degree=1 "
+param+="recompute=false "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=lora "
+param+="run_mode=tp1_pp1_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-1_5b_lora "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_sft_bs16_bf16_tp1_pp1_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_sft_bs16_bf16_tp1_pp1_acc1_dygraph.sh
new file mode 100644
index 000000000000..9bb0a9a1bfd0
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_sft_bs16_bf16_tp1_pp1_acc1_dygraph.sh
@@ -0,0 +1,33 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-1.5B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="pipeline_parallel_degree=1 "
+param+="recompute=false "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=sft "
+param+="run_mode=tp1_pp1_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-1_5b_sft "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_dpo_bs16_bf16_tp4_sd2_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_dpo_bs16_bf16_tp4_sd2_acc1_dygraph.sh
new file mode 100644
index 000000000000..ac37b6e86fbb
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_dpo_bs16_bf16_tp4_sd2_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-7B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=4 "
+param+="sharding_parallel_degree=2 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=dpo "
+param+="run_mode=tp4_sd2_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-7b_dpo "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh
new file mode 100644
index 000000000000..bc0a6ae685ae
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh
@@ -0,0 +1,33 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-7B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="pipeline_parallel_degree=1 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=lora "
+param+="run_mode=tp1_pp1_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-7b_lora "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_sft_bs16_bf16_tp2_sd4_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_sft_bs16_bf16_tp2_sd4_acc1_dygraph.sh
new file mode 100644
index 000000000000..76cb9fa7e2e5
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_sft_bs16_bf16_tp2_sd4_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-7B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=2 "
+param+="sharding_parallel_degree=4 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=sft "
+param+="run_mode=tp2_sd4_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-7b_sft "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/dpo.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/dpo.json
new file mode 100644
index 000000000000..0580e1e47ff8
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/dpo.json
@@ -0,0 +1,36 @@
+{
+    "model_name_or_path": "Qwen/Qwen2.5-14B",
+    "train_dataset_path": "./data/dpo_benchmark_train/train.json",
+    "dev_dataset_path": "./data/dpo_benchmark_train/dev.json",
+    "output_dir": "./checkpoints/dpo_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 4,
+    "per_device_eval_batch_size": 1,
+    "num_train_epochs": 1,
+    "learning_rate": 1e-06,
+    "warmup_steps": 10,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "max_seq_len": 8192,
+    "max_prompt_len": 4096,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "tensor_parallel_degree": 4,
+    "sharding_parallel_degree": 2,
+    "sharding": "stage2",
+    "use_flash_attention": true,
+    "flash_mask": true,
+    "recompute": true,
+    "recompute_granularity": "full",
+    "benchmark": true,
+    "unified_checkpoint": true,
"autotuner_benchmark":false, + "beta": 0.1, + "loss_type": "sigmoid", + "label_smoothing": 0.0 + } diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/lora.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/lora.json new file mode 100644 index 000000000000..a3106cfcce59 --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/lora.json @@ -0,0 +1,39 @@ +{ + "model_name_or_path": "Qwen/Qwen2.5-14B", + "dataset_name_or_path": "./data/sft_benchmark_train/", + "output_dir": "./checkpoints/lora_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 1, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 1, + "learning_rate": 3e-04, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "no", + "save_strategy": "no", + "src_length": 4096, + "max_length": 8192, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "recompute_granularity": "full", + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "sharding_parallel_degree": 8, + "sharding": "stage1", + "pipeline_parallel_degree": 1, + "lora": true, + "unified_checkpoint": true, + "benchmark": true, + "zero_padding": true, + "flash_mask": true, + "use_flash_attention": true, + "pissa": false + } diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/sft.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/sft.json new file mode 100644 index 000000000000..9ff6c10456c0 --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/sft.json @@ -0,0 +1,37 @@ +{ + "model_name_or_path": "Qwen/Qwen2.5-14B", + "dataset_name_or_path": "./data/sft_benchmark_train/", + "output_dir": "./checkpoints/sft_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 2, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 1, + "learning_rate": 3e-05, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "no", + "save_strategy": "no", + "src_length": 4096, + "max_length": 8192, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "recompute_granularity": "full", + "save_total_limit": 1, + "benchmark": true, + "tensor_parallel_degree": 2, + "sharding_parallel_degree": 4, + "pipeline_parallel_degree": 1, + "sharding": "stage2", + "zero_padding": true, + "flash_mask": true, + "unified_checkpoint": true, + "use_flash_attention": true + } diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/dpo.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/dpo.json new file mode 100644 index 000000000000..9fac0129d1d9 --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/dpo.json @@ -0,0 +1,36 @@ +{ + "model_name_or_path": "Qwen/Qwen2.5-1.5B", + "train_dataset_path": "./data/dpo_benchmark_train/train.json", + "dev_dataset_path": "./data/dpo_benchmark_train/dev.json", + "output_dir": "./checkpoints/dpo_ckpts", + "per_device_train_batch_size": 1, + 
"gradient_accumulation_steps": 1, + "per_device_eval_batch_size": 1, + "num_train_epochs": 1, + "learning_rate": 1e-06, + "warmup_steps": 10, + "logging_steps": 1, + "evaluation_strategy": "no", + "save_strategy": "no", + "max_seq_len": 8192, + "max_prompt_len": 4096, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "tensor_parallel_degree": 1, + "sharding_parallel_degree": 1, + "sharding": "stage2", + "use_flash_attention": true, + "flash_mask": true, + "recompute": false, + "recompute_granularity": "full", + "benchmark": true, + "unified_checkpoint": true, + "autotuner_benchmark":false, + "beta": 0.1, + "loss_type": "sigmoid", + "label_smoothing": 0.0 + } diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/lora.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/lora.json new file mode 100644 index 000000000000..a1a2acf64a0e --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/lora.json @@ -0,0 +1,37 @@ +{ + "model_name_or_path": "Qwen/Qwen2.5-1.5B", + "dataset_name_or_path": "./data/sft_benchmark_train/", + "output_dir": "./checkpoints/lora_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 1, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 1, + "learning_rate": 3e-04, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "no", + "save_strategy": "no", + "src_length": 4096, + "max_length": 8192, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": false, + "recompute_granularity": "full", + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "lora": true, + "unified_checkpoint": true, + "benchmark": true, + "zero_padding": true, + "flash_mask": true, + "use_flash_attention": true, + "pissa": false + } diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/sft.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/sft.json new file mode 100644 index 000000000000..4ff7267dcc5c --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/sft.json @@ -0,0 +1,37 @@ +{ + "model_name_or_path": "Qwen/Qwen2.5-1.5B", + "dataset_name_or_path": "./data/sft_benchmark_train/", + "output_dir": "./checkpoints/sft_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 1, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 1, + "learning_rate": 3e-05, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "no", + "save_strategy": "no", + "src_length": 4096, + "max_length": 8192, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": false, + "recompute_granularity": "full", + "save_total_limit": 1, + "benchmark": true, + "tensor_parallel_degree": 1, + "sharding_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "sharding": "stage2", + "zero_padding": true, + "flash_mask": true, + "unified_checkpoint": true, + "use_flash_attention": true + } diff --git 
new file mode 100644
index 000000000000..f47f63ef8c11
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/dpo.json
@@ -0,0 +1,36 @@
+{
+    "model_name_or_path": "Qwen/Qwen2.5-7B",
+    "train_dataset_path": "./data/dpo_benchmark_train/train.json",
+    "dev_dataset_path": "./data/dpo_benchmark_train/dev.json",
+    "output_dir": "./checkpoints/dpo_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 1,
+    "num_train_epochs": 1,
+    "learning_rate": 1e-06,
+    "warmup_steps": 10,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "max_seq_len": 8192,
+    "max_prompt_len": 4096,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "tensor_parallel_degree": 4,
+    "sharding_parallel_degree": 2,
+    "sharding": "stage2",
+    "use_flash_attention": true,
+    "flash_mask": true,
+    "recompute": true,
+    "recompute_granularity": "full",
+    "benchmark": true,
+    "unified_checkpoint": true,
+    "autotuner_benchmark": false,
+    "beta": 0.1,
+    "loss_type": "sigmoid",
+    "label_smoothing": 0.0
+}
diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/lora.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/lora.json
new file mode 100644
index 000000000000..de1169384551
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/lora.json
@@ -0,0 +1,37 @@
+{
+    "model_name_or_path": "Qwen/Qwen2.5-7B",
+    "dataset_name_or_path": "./data/sft_benchmark_train/",
+    "output_dir": "./checkpoints/lora_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 8,
+    "eval_accumulation_steps": 16,
+    "num_train_epochs": 1,
+    "learning_rate": 3e-04,
+    "warmup_steps": 30,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "src_length": 4096,
+    "max_length": 8192,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "eval_with_do_generation": false,
+    "metric_for_best_model": "accuracy",
+    "recompute": true,
+    "recompute_granularity": "full",
+    "save_total_limit": 1,
+    "tensor_parallel_degree": 1,
+    "pipeline_parallel_degree": 1,
+    "lora": true,
+    "unified_checkpoint": true,
+    "benchmark": true,
+    "zero_padding": true,
+    "flash_mask": true,
+    "use_flash_attention": true,
+    "pissa": false
+}
diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/sft.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/sft.json
new file mode 100644
index 000000000000..af7987f4b9b1
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/sft.json
@@ -0,0 +1,37 @@
+{
+    "model_name_or_path": "Qwen/Qwen2.5-7B",
+    "dataset_name_or_path": "./data/sft_benchmark_train/",
+    "output_dir": "./checkpoints/sft_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 8,
+    "eval_accumulation_steps": 16,
+    "num_train_epochs": 1,
+    "learning_rate": 3e-05,
+    "warmup_steps": 30,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "src_length": 4096,
"max_length": 8192, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "recompute_granularity": "full", + "save_total_limit": 1, + "benchmark": true, + "tensor_parallel_degree": 2, + "sharding_parallel_degree": 4, + "pipeline_parallel_degree": 1, + "sharding": "stage2", + "zero_padding": true, + "flash_mask": true, + "unified_checkpoint": true, + "use_flash_attention": true + } diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/prepare.sh b/tests/test_tipc/llm/qwen2_5/benchmark_common/prepare.sh new file mode 100644 index 000000000000..416ded186efb --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/prepare.sh @@ -0,0 +1,35 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +python -m pip install -r ../requirements.txt +python -m pip install -r ../requirements-dev.txt +export PYTHONPATH=../:${PYTHONPATH} +echo ${PYTHONPATH} + + +# install fused_ln custom ops +cd ../slm/model_zoo/gpt-3/external_ops/ +python setup.py install +cd - + +# install paddlenlp_ops +cd ../csrc/ +python setup_cuda.py install +cd - + +cd ../llm +cp -r ../tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json ./ + +wget https://paddlenlp.bj.bcebos.com/llm_benchmark_data/paddle_data.tar.gz +tar zxvf paddle_data.tar.gz && rm -rf paddle_data.tar.gz \ No newline at end of file diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh b/tests/test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh new file mode 100644 index 000000000000..6f6b27849ea9 --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Test training benchmark for a model. 
+function _set_params(){
+    model_name_or_path=${model_name_or_path:-"Qwen/Qwen2.5-7B"}
+    per_device_train_batch_size=${per_device_train_batch_size:-1}
+    tensor_parallel_degree=${tensor_parallel_degree:-1}
+    pipeline_parallel_degree=${pipeline_parallel_degree:-1}
+    sharding_parallel_degree=${sharding_parallel_degree:-1}
+    sharding=${sharding:-"stage1"}
+    recompute=${recompute:-1}
+    recompute_granularity=${recompute_granularity:-"full"}
+    gradient_accumulation_steps=${gradient_accumulation_steps:-1}
+    run_stage=${run_stage:-"sft"}
+    run_mode=${run_mode:-"DP1-MP1-PP4-mbs1-acc8-recompute"}
+    device_num=${device_num:-"N1C8"}
+    global_batch_size=${global_batch_size:-16}
+    model_item=${model_item:-"qwen-qwen-7b_seqlen2048_pretrain"}
+    max_steps=${max_steps:-150}
+    logging_steps=${logging_steps:-1}
+    base_batch_size=${global_batch_size}
+
+    profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in as a global variable
+    model_repo="PaddleNLP" # (required) name of the model suite
+    speed_unit="tokens/s" # (required) unit of the speed metric
+    skip_steps=0 # (required) number of leading, performance-unstable steps to skip when parsing logs
+    keyword="Effective_Tokens_per_second_per_gpu:" # (required) keyword that marks the performance line when parsing logs
+    is_large_model=True # (optional) defaults to False for ordinary models; set True for large models that report a single ips value
+    convergence_key="loss:" # (optional) keyword that marks the convergence line when parsing logs, e.g. convergence_key="loss:"
+
+    fp_item="bf16"
+    # The lines below are the common execution commands; no changes needed unless the model is special
+    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (required) do not change this format; it is aligned with competitor naming
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[*]}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
+    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
+    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
+    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
+    mkdir -p $(dirname ${train_log_file})
+
+    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
+    mkdir -p $(dirname ${profiling_log_file})
+
+    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
+    mkdir -p $(dirname ${speed_log_file})
+
+    OUTPUT_PATH=${run_log_path}/output
+
+}
+
+function _train(){
+    batch_size=${per_device_train_batch_size} # for multi-GPU single-process runs, compute the multi-GPU batch size here in _train
+
+    if [ -d $OUTPUT_PATH ]; then
+        rm -rf $OUTPUT_PATH
+    fi
+    mkdir $OUTPUT_PATH
+
+    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
+
+    if [ ${profiling} = "true" ];then
+        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
+        log_file=${profiling_log_file}
+    else
+        add_options=""
+        log_file=${train_log_file}
+    fi
+
+    case ${run_stage} in
+    sft) train_cmd="run_finetune.py benchmark_json/${model_item%_*}/sft.json" ;;
+    lora) train_cmd="run_finetune.py benchmark_json/${model_item%_*}/lora.json" ;;
+    dpo) train_cmd="alignment/dpo/run_dpo.py benchmark_json/${model_item%_*}/dpo.json" ;;
+    *) echo "choose run_stage(sft | lora | dpo)"; exit 1;
+    esac
+
+    if [ ${PADDLE_TRAINER_ID} ]
+    then
+        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
+    else
+        PADDLE_RANK_OPTION=""
+    fi
+    # The lines below are the common execution commands; no changes needed unless the model is special
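+    # N1C1 launches on a single GPU; any other device_num value falls through
+    # to the 8-GPU launch branch below.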
run_stage=${run_stage}" + train_cmd="python -u -m paddle.distributed.launch --log_dir=./mylog --gpus=0 ${PADDLE_RANK_OPTION}\ + ${train_cmd}" + workerlog_id=0 + ;; + *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}, run_stage=${run_stage}" + train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ + ${train_cmd}" + workerlog_id=0 + ;; + esac + cd ../llm/ + echo "train_cmd: ${train_cmd} log_file: ${log_file}" + python -c "import paddlenlp" + if [[ ${model_name_or_path} =~ "CE" ]];then # CE精度-不限制执行时间 + ${train_cmd} > ${log_file} 2>&1 + else + timeout 30m ${train_cmd} > ${log_file} 2>&1 + # echo ${train_cmd} + Effective_Tokens_per_second=`cat ${log_file} | grep -E 'Effective_Tokens_per_second|Effective tokens per second:' \ + |awk -F': ' '{print $2}' |awk -F' ' '{print $1}'` + num_gpu=$(echo "$device_num" | sed 's/^.*C//') + ips=$(awk -v a="$Effective_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}') + echo "Effective_Tokens_per_second_per_gpu: ${ips}" >> ${log_file} + fi + if [ $? -ne 0 ];then + echo -e "${model_name}, FAIL" + else + echo -e "${model_name}, SUCCESS" + fi + #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + if [ ${device_num} != "N1C1" -a -d mylog ]; then + case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog + cp -r ${case_path}/mylog/workerlog.* ./mylog/ + fi +} + +export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH + +source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 +_set_params $@ +#_train # 如果只产出训练log,不解析,可取消注释 +_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开