diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py
index ffb2b0c885b6..b027b1178247 100644
--- a/paddlenlp/trainer/training_args.py
+++ b/paddlenlp/trainer/training_args.py
@@ -1039,7 +1039,10 @@ def __post_init__(self):
             strategy = fleet.DistributedStrategy()
             assert self.data_parallel_config == "", "data_parallle_config is not supported in hybrid parallel"
             if self.pipeline_parallel_degree > 1:
-                pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
+                if " " in self.pipeline_parallel_config:
+                    pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
+                else:
+                    pipeline_parallel_config = set(self.pipeline_parallel_config.split(","))
                 for x in pipeline_parallel_config:
                     if len(x) > 0:
                         if x not in [
@@ -1222,7 +1225,10 @@ def is_segment_parallel_supported():
             strategy.hybrid_configs = hybrid_configs
 
             if self.sharding_parallel_degree > 1:
-                sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
+                if " " in self.sharding_parallel_config:
+                    sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
+                else:
+                    sharding_parallel_config = set(self.sharding_parallel_config.split(","))
                 for x in sharding_parallel_config:
                     if len(x) > 0:
                         if x not in [
@@ -1378,7 +1384,10 @@ def is_segment_parallel_supported():
 
             # navie-pp: pipeline_parallel_degree > 1 and gradient_accumulation_steps == 1
             if self.pipeline_parallel_degree > 1 and self.gradient_accumulation_steps > 1:
-                pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
+                if " " in self.pipeline_parallel_config:
+                    pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
+                else:
+                    pipeline_parallel_config = set(self.pipeline_parallel_config.split(","))
                 for x in pipeline_parallel_config:
                     if len(x) > 0:
                         if x not in [
@@ -1464,7 +1473,10 @@ def is_segment_parallel_supported():
             elif ShardingOption.FULL_SHARD in self.sharding:
                 sharding.stage = 3
 
-            sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
+            if " " in self.sharding_parallel_config:
+                sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
+            else:
+                sharding_parallel_config = set(self.sharding_parallel_config.split(","))
             for x in sharding_parallel_config:
                 if len(x) > 0:
                     if x not in [
diff --git a/scripts/distribute/ci_case_dy.sh b/scripts/distribute/ci_case_dy.sh
index b75900b9fa3f..90ae3de18d4f 100644
--- a/scripts/distribute/ci_case_dy.sh
+++ b/scripts/distribute/ci_case_dy.sh
@@ -22,7 +22,7 @@
 export root_path=/workspace/PaddleNLP
 export gpt_case_path=$root_path/legacy/model_zoo/gpt-3
 export gpt_data_path=/fleetx_data
-export llm_gpt_case_path=$root_path/llm/gpt-3
+export llm_gpt_case_path=$root_path/llm
 export llm_gpt_data_path=/llm_gpt_data
 
 unset CUDA_VISIBLE_DEVICES
diff --git a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh
index 919913ebdf21..b80fcb253163 100644
--- a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh
+++ b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh
@@ -33,15 +33,12 @@ else
     master_ip=${ip_lists[0]}
     rank=$PADDLE_TRAINER_ID
     echo $master_ip $rank
-    if [ $rank == 0 ]; then
-        net=$(netstat -anp | grep :2379 | grep "LISTEN")
-        if [ ${#net} == 0 ]; then
-            apt-get install -y --allow-downgrades etcd
-            nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
-            ps -ef |grep etcd
-        fi
-    else
-        sleep 5
-    fi
+    #多机任务在每台机器上都启动服务,保证同步,否则多机运行会报错
+    net=$(netstat -anp | grep :2379 | grep "LISTEN")
+    if [ ${#net} == 0 ]; then
+        apt-get install -y --allow-downgrades etcd
+        nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
+        ps -ef |grep etcd
+    fi
     sleep 5
 fi
diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh
index 3e51c5dc6fb7..3e9ba0d6e185 100644
--- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh
+++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh
@@ -18,9 +18,10 @@ python -m pip install -r ../requirements-dev.txt
 # install fused_ln custom ops
 cd ../legacy/model_zoo/gpt-3/external_ops/
 python setup.py install
+cd -
 
 # install tool_helpers
-cd ../../../../llm/llama
+cd ../llm
 python -m pip install tool_helpers
 
 rm -rf data && mkdir data
@@ -32,7 +33,7 @@ mv llama_openwebtext_100k.idx ./data
 
 # mv autoconfig
 rm -rf autoconfig
-cp -r ../../tests/test_tipc/auto_tuner/autoconfig ./
+cp -r ../tests/test_tipc/auto_tuner/autoconfig ./
 
 if [ -z "$1" ]; then
     echo "单机任务"
@@ -43,15 +44,12 @@ else
     master_ip=${ip_lists[0]}
     rank=$PADDLE_TRAINER_ID
     echo $master_ip $rank
-    if [ $rank == 0 ]; then
-        net=$(netstat -anp | grep :2379 | grep "LISTEN")
-        if [ ${#net} == 0 ]; then
-            apt-get install -y --allow-downgrades etcd
-            nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
-            ps -ef |grep etcd
-        fi
-    else
-        sleep 5
-    fi
+    #多机任务在每台机器上都启动服务,保证同步,否则多机运行会报错
+    net=$(netstat -anp | grep :2379 | grep "LISTEN")
+    if [ ${#net} == 0 ]; then
+        apt-get install -y --allow-downgrades etcd
+        nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
+        ps -ef |grep etcd
+    fi
     sleep 5
 fi
diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh
index c0394a0bd681..6fad1dc36540 100644
--- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh
@@ -97,7 +97,7 @@ function _train(){
             --auto_tuner_json ${autoconfig_json_file} run_pretrain.py ${modle_json_file}"
             ;;
     esac
-    cd ../llm/llama
+    cd ../llm
     if [[ ${model_item} =~ "resume" ]];then
         cp $PWD/autoconfig/resume.csv $PWD/autoconfig/llama7b_pretrain_history.csv
     fi
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/prepare.sh
index 9807869fb9e6..f170bc54b6c1 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/prepare.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/prepare.sh
@@ -28,7 +28,7 @@ python -m pip install -r ../requirements.txt
 python -m pip install -r ../requirements-dev.txt
 
 # get data
-cd ../llm/gpt-3
+cd ../llm
 rm -rf data
 mkdir data
 wget -O data/gpt2-en-mmap.bin https://paddlenlp.bj.bcebos.com/datasets/PDC_DATASETS/PRETRAIN/openwebtext2/gpt/mmap/gpt2-en-mmap.bin
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh
index fa05504c743b..315489b3111f 100755
--- a/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh
@@ -178,7 +178,7 @@ function _train(){
             workerlog_id=0
             ;;
     esac
-    cd ../llm/gpt-3
+    cd ../llm
    echo "train_cmd: ${train_cmd}  log_file: ${log_file}"
    if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间
        ${train_cmd} > ${log_file} 2>&1
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
index 388b179e6905..e3ebf51a1908 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
@@ -18,9 +18,10 @@ python -m pip install -r ../requirements-dev.txt
 # install fused_ln custom ops
 cd ../legacy/model_zoo/gpt-3/external_ops/
 python setup.py install
+cd -
 
 # install tool_helpers
-cd ../../../../llm/llama
+cd ../llm/
 python -m pip install tool_helpers
 
 wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh
index 6e4862bf3c43..549a7a98f5a7 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh
@@ -175,7 +175,7 @@ function _train(){
             workerlog_id=0
             ;;
     esac
-    cd ../llm/llama
+    cd ../llm
    echo "train_cmd: ${train_cmd}  log_file: ${log_file}"
    python -c "import paddlenlp"
    if [[ ${model_name_or_path} =~ "CE" ]];then # CE精度-不限制执行时间
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh
index 9405521c7b3f..9b2f06c9f434 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh
@@ -18,9 +18,10 @@ python -m pip install -r ../requirements-dev.txt
 # install fused_ln custom ops
 cd ../legacy/model_zoo/gpt-3/external_ops/
 python setup.py install
+cd -
 
 # install tool_helpers
-cd ../../../../llm/llama
+cd ../llm/
 python -m pip install tool_helpers
 
 # download data
@@ -36,16 +37,13 @@ ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' '))
 master_ip=${ip_lists[0]}
 rank=$PADDLE_TRAINER_ID
 echo $master_ip $rank
-if [ $rank == 0 ]; then
-    net=$(netstat -anp | grep "2379 " | grep "LISTEN")
-    if [ ${#net} == 0 ]; then
-        nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
-        ps -ef |grep etcd
-    fi
-else
-    sleep 5
-fi
+#多机任务在每台机器上都启动服务,保证同步,否则多机运行会报错
+net=$(netstat -anp | grep ":2379" | grep "LISTEN")
+if [ ${#net} == 0 ]; then
+    nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
+    ps -ef |grep etcd
+fi
 
 # mv autoconfig
 rm -rf auto_config_*
-cp -r ../../tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_* ./
+cp -r ../tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_* ./
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh
index 7ddcab91e056..9afb2a0902c8 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh
@@ -128,7 +128,7 @@ function _train(){
             ./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-auto_tuner.json"
             ;;
     esac
-    cd ../llm/llama
+    cd ../llm
    rm -rf ./auto_config_${MODEL_TYPE}/*GBS*
    rm -rf ./auto_config_${MODEL_TYPE}/*auto_tuner.log
    rm -rf ./auto_config_${MODEL_TYPE}/*csv
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP1-PP4-sharding8-mbs1-acc4.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP1-PP4-sharding8-mbs1-acc4.sh
new file mode 100644
index 000000000000..001ccbf0a07f
--- /dev/null
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP1-PP4-sharding8-mbs1-acc4.sh
@@ -0,0 +1,48 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=qwen/qwen-14b "
+param+="per_device_train_batch_size=1 "
+param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=1 "
+param+="pipeline_parallel_degree=4 "
+param+="virtual_pp_degree=1 "
+param+="sequence_parallel=0 "
+param+="sharding_parallel_degree=8 "
+param+="sharding=stage1 "
+param+="recompute=0 "
+param+="recompute_granularity=full_attn "
+param+="run_mode=MP1-PP4-sharding8-mbs1-acc4 "
+param+="device_num=N4C32 "
+param+="global_batch_size=32 "
+param+="model_item=qwen-qwen-14b_seqlen4096_pretrain "
+param+="max_steps=100 "
+param+="gradient_accumulation_steps=4 "
+param+="pp_recompute_interval=1 "
+param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, "
+#多机新添加的参数
+param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads, "
+param+="max_seq_length=4096 "
+param+="min_learning_rate=0.000005 "
+param+="save_steps=5000 "
+param+="eval_steps=1000 "
+param+="scale_loss=1024 "
+param+="sharding_parallel_config=split_param,enable_stage1_overlap, "
+
+
+cd ./tests
+bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-72b_seqlen4096_pretrain_bs32_bf16_MP8-PP4-sharding1-mbs1-acc32.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-72b_seqlen4096_pretrain_bs32_bf16_MP8-PP4-sharding1-mbs1-acc32.sh
new file mode 100644
index 000000000000..3bbaee756bde
--- /dev/null
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-72b_seqlen4096_pretrain_bs32_bf16_MP8-PP4-sharding1-mbs1-acc32.sh
@@ -0,0 +1,48 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=qwen/qwen-72b "
+param+="per_device_train_batch_size=1 "
+param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=8 "
+param+="pipeline_parallel_degree=4 "
+param+="virtual_pp_degree=4 "
+param+="sequence_parallel=1 "
+param+="sharding_parallel_degree=1 "
+param+="sharding=stage1 "
+param+="recompute=0 "
+param+="recompute_granularity=full_attn "
+param+="run_mode=MP8-PP4-sharding1-mbs1-acc32 "
+param+="device_num=N4C32 "
+param+="global_batch_size=32 "
+param+="model_item=qwen-qwen-72b_seqlen4096_pretrain "
+param+="max_steps=100 "
+param+="gradient_accumulation_steps=32 "
+param+="pp_recompute_interval=1 "
+param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, "
+#多机新添加的参数
+param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads, "
+param+="max_seq_length=4096 "
+param+="min_learning_rate=0.000005 "
+param+="save_steps=5000 "
+param+="eval_steps=1000 "
+param+="scale_loss=1024 "
+param+="sharding_parallel_config=split_param,enable_stage1_overlap, "
+
+
+cd ./tests
+bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs2-acc1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs2-acc1.sh
new file mode 100644
index 000000000000..69882b131a52
--- /dev/null
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs2-acc1.sh
@@ -0,0 +1,48 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=qwen/qwen-7b "
+param+="per_device_train_batch_size=2 "
+param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=2 "
+param+="pipeline_parallel_degree=1 "
+param+="virtual_pp_degree=1 "
+param+="sequence_parallel=1 "
+param+="sharding_parallel_degree=16 "
+param+="sharding=stage1 "
+param+="recompute=0 "
+param+="recompute_granularity=full_attn "
+param+="run_mode=MP2-PP1-sharding16-mbs2-acc1 "
+param+="device_num=N4C32 "
+param+="global_batch_size=32 "
+param+="model_item=qwen-qwen-7b_seqlen4096_pretrain "
+param+="max_steps=100 "
+param+="gradient_accumulation_steps=1 "
+param+="pp_recompute_interval=1 "
+param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
+#多机新添加的参数
+param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads "
+param+="max_seq_length=4096 "
+param+="min_learning_rate=0.000005 "
+param+="save_steps=5000 "
+param+="eval_steps=1000 "
+param+="scale_loss=1024 "
+param+="sharding_parallel_config=split_param,enable_stage1_overlap "
+
+
+cd ./tests
+bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh
index bf6952c135ca..deca6e05b6fc 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh
@@ -20,9 +20,10 @@ python -m pip install tiktoken
 # install fused_ln custom ops
 cd ../legacy/model_zoo/gpt-3/external_ops/
 python setup.py install
+cd -
 
 # install tool_helpers
-cd ../../../../llm/qwen
+cd ../llm/
 python -m pip install tool_helpers
 
 wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh
index a2b40044b5f9..2b7c5682d1ee 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh
@@ -38,6 +38,17 @@ function _set_params(){
     pipeline_parallel_config=${pipeline_parallel_config:-""}
     recompute_use_reentrant=${recompute_use_reentrant:-"true"}
     recompute_granularity=${recompute_granularity:-"full"}
+    #添加多机脚本时需要提取下面的不同参数
+    max_seq_length=${max_seq_length:-2048}
+    min_learning_rate=${min_learning_rate:-0.000001}
+    save_steps=${save_steps:-50000}
+    eval_steps=${eval_steps:-1001}
+    scale_loss=${scale_loss:-1024}
+    sharding_parallel_config=${sharding_parallel_config:-"split_param enable_stage1_overlap"}
+    # #替换pipeline_parallel_config和sharding_parallel_config的参数为空格表示
+    # pipeline_parallel_config=$(echo $pipeline_parallel_config | tr ',' ' ')
+    # sharding_parallel_config=$(echo $sharding_parallel_config | tr ',' ' ')
+
 
     base_batch_size=${global_batch_size}
 
@@ -46,6 +57,14 @@ function _set_params(){
     speed_unit="tokens/s"         # (必选)速度指标单位
     skip_steps=0                  # (必选)解析日志,跳过模型前几个性能不稳定的step
     keyword="ips:"                # (必选)解析日志,筛选出性能数据所在行的关键字
+    is_large_model=True
+    #添加多机脚本时直接取最后10步的平均值就可以
+    if [[ $device_num == "N4C32" ]]; then
+        skip_steps=10             # (必选)解析日志,跳过模型前几个性能不稳定的step
+        keyword="interval_tokens_per_second_per_device:"    # (必选)解析日志,筛选出性能数据所在行的关键字
+        is_large_model=False
+        model_mode=5
+    fi
 
     convergence_key="loss:"       # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
 
@@ -68,7 +87,7 @@ function _set_params(){
     mkdir -p $(dirname ${speed_log_file})
 
     OUTPUT_PATH=${run_log_path}/output
-    is_large_model=True
+
 }
 
 function _train(){
@@ -99,18 +118,32 @@ function _train(){
     # fi
 
     if [ "${pipeline_parallel_config}" != "" ]; then
+        #pipeline_parallel_config_args="--pipeline_parallel_config \"${pipeline_parallel_config}\""
         pipeline_parallel_config_args="--pipeline_parallel_config ${pipeline_parallel_config}"
     else
         pipeline_parallel_config_args=""
     fi
+    #qwen的多机需要额外设置的参数
+    if [ "${device_num}" == "N4C32" ]; then
+        # diff_args="--sequence_parallel ${sequence_parallel} --sharding_parallel_degree ${sharding_parallel_degree} \
+        #     --scale_loss 1024 --recompute_granularity ${recompute_granularity} \
+        #     --sharding_parallel_config \"${sharding_parallel_config}\""
+        diff_args="--sequence_parallel ${sequence_parallel} --sharding_parallel_degree ${sharding_parallel_degree} \
+            --scale_loss 1024 --recompute_granularity ${recompute_granularity} \
+            --sharding_parallel_config ${sharding_parallel_config}"
+    else
+        diff_args="--recompute_use_reentrant ${recompute_use_reentrant} \
+            --skip_memory_metrics 0 \
+            --data_cache ./data_cache"
+    fi
 
     use_pure_fp16=False
     train_cmd="--model_name_or_path ${model_name_or_path} \
               --tokenizer_name_or_path ${model_name_or_path} \
-              --input_dir ./qwen/data \
+              --input_dir ./data \
              --output_dir ./output \
              --split 949,50,1 \
-              --max_seq_length 2048 \
+              --max_seq_length ${max_seq_length} \
              --per_device_train_batch_size ${per_device_train_batch_size} \
              --gradient_accumulation_steps ${gradient_accumulation_steps} \
              --use_flash_attention 1 \
@@ -123,15 +156,15 @@ function _train(){
              --virtual_pp_degree ${virtual_pp_degree} \
              --pp_recompute_interval ${pp_recompute_interval} \
              --learning_rate 0.00001 \
-              --min_learning_rate 0.000001 \
+              --min_learning_rate ${min_learning_rate} \
              --max_steps ${max_steps} \
-              --save_steps 50000 \
+              --save_steps ${save_steps} \
              --weight_decay 0.01 \
              --warmup_ratio 0.01 \
              --max_grad_norm 1.0 \
              --logging_steps 1 \
              --dataloader_num_workers 1 \
-              --eval_steps 1001 \
+              --eval_steps ${eval_steps} \
              --sharding ${sharding} \
              --disable_tqdm true \
              --continue_training 0 \
@@ -142,10 +175,7 @@ function _train(){
              --fuse_attention_qkv true \
              --fuse_attention_ffn true \
              --tensor_parallel_config ${tensor_parallel_config} ${pipeline_parallel_config_args} \
-              --recompute ${recompute} \
-              --recompute_use_reentrant ${recompute_use_reentrant} \
-              --skip_memory_metrics 0 \
-              --data_cache ./data_cache"
+              --recompute ${recompute} ${diff_args}"
 
     if [ ${PADDLE_TRAINER_ID} ]
     then
diff --git a/tests/test_tipc/dygraph/moe/gpt/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/moe/gpt/benchmark_common/prepare.sh
index e511447a8af1..7d7e059c9594 100644
--- a/tests/test_tipc/dygraph/moe/gpt/benchmark_common/prepare.sh
+++ b/tests/test_tipc/dygraph/moe/gpt/benchmark_common/prepare.sh
@@ -1,4 +1,4 @@
-cd ../examples/language_model/moe/data_tools/
+cd ../legacy/model_zoo/moe/data_tools/
 sed -i "s/python3/python3.10/g" Makefile
 cd -
 
@@ -8,7 +8,7 @@ python3 -m pip install -r ../requirements.txt #-i https://pypi.tuna.tsinghua.edu
 python3 -m pip install pybind11 regex sentencepiece tqdm visualdl #-i https://mirror.baidu.com/pypi/simple
 python3 -m pip install --upgrade paddlenlp
 # get data
-cd ../examples/language_model/moe/dygraph/
+cd ../legacy/model_zoo/moe/dygraph/
 rm -rf data
 mkdir data && cd data
 wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt2/train.data.json_ids.npz
diff --git a/tests/test_tipc/dygraph/moe/gpt/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/moe/gpt/benchmark_common/run_benchmark.sh
index a38eb67b47ec..ab5797d7270a 100644
--- a/tests/test_tipc/dygraph/moe/gpt/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/moe/gpt/benchmark_common/run_benchmark.sh
@@ -147,7 +147,7 @@ function _train(){
             ;;
         *) echo "choose run_mode "; exit 1;
     esac
-    cd ../examples/language_model/moe/dygraph/
+    cd ../legacy/model_zoo/moe/dygraph/
    echo "train_cmd: ${train_cmd}  log_file: ${log_file}"
    python -c "import paddlenlp"
    if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间
diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh
index 697d5d1d92e0..cd0f05c27314 100644
--- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh
+++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh
@@ -18,13 +18,13 @@ python -m pip install -r ../requirements-dev.txt
 # install fused_ln custom ops
 cd ../legacy/model_zoo/gpt-3/external_ops/
 python setup.py install
+cd -
 
 # install tool_helpers
-cd ../../../../llm/llama
+cd ../llm/auto_parallel/llama
 python -m pip install tool_helpers
 
 # download data
-cd auto_parallel
 wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
 wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
 mkdir data
diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
index 16126a9198cc..03b2175d9465 100644
--- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
@@ -132,7 +132,7 @@ function _train(){
             ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json"
             ;;
     esac
-    cd ../llm/llama/auto_parallel/
+    cd ../llm/auto_parallel/llama
     # rm -rf ./auto_config_${MODEL_TYPE}/*GBS*
     # rm -rf ./auto_config_${MODEL_TYPE}/*auto_tuner.log
     # rm -rf ./auto_config_${MODEL_TYPE}/*csv
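
Illustration (not part of the patch): the training_args.py hunks above let pipeline_parallel_config and sharding_parallel_config be given either space-separated or comma-separated, which is what allows the new N4C32 scripts to pass comma-separated lists such as split_param,enable_stage1_overlap, through environment variables. A minimal standalone Python sketch of that parsing rule follows; parse_parallel_config is a hypothetical helper name used only for this example, while the patch itself inlines the logic in __post_init__.

# Standalone sketch of the comma-or-space splitting introduced above.
def parse_parallel_config(config: str) -> set:
    # Prefer space-separated values; otherwise fall back to commas,
    # so "a b c" and "a,b,c" produce the same set of options.
    if " " in config:
        return set(config.split(" "))
    return set(config.split(","))


if __name__ == "__main__":
    assert parse_parallel_config("enable_delay_scale_loss enable_release_grads") == parse_parallel_config(
        "enable_delay_scale_loss,enable_release_grads"
    )
    # A trailing comma (as in the N4C32 scripts) only yields an empty entry,
    # which the callers already skip via their `if len(x) > 0` check.
    print(parse_parallel_config("split_param,enable_stage1_overlap,"))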