fix benchmark dir because of PR#8627 #8649

Merged · 15 commits · Jun 26, 2024
20 changes: 16 additions & 4 deletions paddlenlp/trainer/training_args.py
@@ -1039,7 +1039,10 @@
strategy = fleet.DistributedStrategy()
assert self.data_parallel_config == "", "data_parallle_config is not supported in hybrid parallel"
if self.pipeline_parallel_degree > 1:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
if " " in self.pipeline_parallel_config:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
else:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(","))
for x in pipeline_parallel_config:
if len(x) > 0:
if x not in [
@@ -1222,7 +1225,10 @@
strategy.hybrid_configs = hybrid_configs

if self.sharding_parallel_degree > 1:
sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
if " " in self.sharding_parallel_config:
sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
else:
sharding_parallel_config = set(self.sharding_parallel_config.split(","))
for x in sharding_parallel_config:
if len(x) > 0:
if x not in [
@@ -1378,7 +1384,10 @@

# navie-pp: pipeline_parallel_degree > 1 and gradient_accumulation_steps == 1
if self.pipeline_parallel_degree > 1 and self.gradient_accumulation_steps > 1:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
if " " in self.pipeline_parallel_config:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
else:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(","))
for x in pipeline_parallel_config:
if len(x) > 0:
if x not in [
@@ -1464,7 +1473,10 @@
elif ShardingOption.FULL_SHARD in self.sharding:
sharding.stage = 3

sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
if " " in self.sharding_parallel_config:
sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
else:
sharding_parallel_config = set(self.sharding_parallel_config.split(","))
for x in sharding_parallel_config:
if len(x) > 0:
if x not in [
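
All four hunks above make the same change: the pipeline and sharding parallel-config strings used to be split on spaces only, and now fall back to commas when the string contains no space, presumably so that the comma-separated pipeline_parallel_config and sharding_parallel_config values used by the benchmark scripts below parse as well. A minimal sketch of the new parsing behaviour (a standalone illustration, not the exact PaddleNLP code):

def parse_parallel_config(config: str) -> set:
    """Split a pipeline/sharding parallel-config string into option flags.

    Older configs were space-separated; the benchmark configs touched here
    use comma-separated values, so a space anywhere in the string selects
    the legacy delimiter and commas are used otherwise.
    """
    delimiter = " " if " " in config else ","
    return set(config.split(delimiter))

# Both styles parse; a trailing delimiter only adds an empty element,
# which the caller skips with the existing "if len(x) > 0" check.
assert parse_parallel_config("enable_delay_scale_loss enable_release_grads") == {"enable_delay_scale_loss", "enable_release_grads"}
assert parse_parallel_config("enable_delay_scale_loss,enable_release_grads,") == {"enable_delay_scale_loss", "enable_release_grads", ""}
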
2 changes: 1 addition & 1 deletion scripts/distribute/ci_case_dy.sh
@@ -22,7 +22,7 @@ export root_path=/workspace/PaddleNLP
export gpt_case_path=$root_path/legacy/model_zoo/gpt-3
export gpt_data_path=/fleetx_data

export llm_gpt_case_path=$root_path/llm/gpt-3
export llm_gpt_case_path=$root_path/llm
export llm_gpt_data_path=/llm_gpt_data

unset CUDA_VISIBLE_DEVICES
@@ -33,15 +33,12 @@ else
master_ip=${ip_lists[0]}
rank=$PADDLE_TRAINER_ID
echo $master_ip $rank
if [ $rank == 0 ]; then
net=$(netstat -anp | grep :2379 | grep "LISTEN")
if [ ${#net} == 0 ]; then
apt-get install -y --allow-downgrades etcd
nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
ps -ef |grep etcd
fi
else
sleep 5
fi
# For multi-node jobs, start the etcd service on every machine so they stay in sync; otherwise the multi-node run fails
net=$(netstat -anp | grep :2379 | grep "LISTEN")
if [ ${#net} == 0 ]; then
apt-get install -y --allow-downgrades etcd
nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
ps -ef |grep etcd
fi
sleep 5
fi
@@ -18,9 +18,10 @@ python -m pip install -r ../requirements-dev.txt
# install fused_ln custom ops
cd ../legacy/model_zoo/gpt-3/external_ops/
python setup.py install
cd -

# install tool_helpers
cd ../../../../llm/llama
cd ../llm
python -m pip install tool_helpers

rm -rf data && mkdir data
@@ -32,7 +33,7 @@ mv llama_openwebtext_100k.idx ./data

# mv autoconfig
rm -rf autoconfig
cp -r ../../tests/test_tipc/auto_tuner/autoconfig ./
cp -r ../tests/test_tipc/auto_tuner/autoconfig ./

if [ -z "$1" ]; then
echo "单机任务"
@@ -43,15 +44,12 @@ else
master_ip=${ip_lists[0]}
rank=$PADDLE_TRAINER_ID
echo $master_ip $rank
if [ $rank == 0 ]; then
net=$(netstat -anp | grep :2379 | grep "LISTEN")
if [ ${#net} == 0 ]; then
apt-get install -y --allow-downgrades etcd
nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
ps -ef |grep etcd
fi
else
sleep 5
fi
# For multi-node jobs, start the etcd service on every machine so they stay in sync; otherwise the multi-node run fails
net=$(netstat -anp | grep :2379 | grep "LISTEN")
if [ ${#net} == 0 ]; then
apt-get install -y --allow-downgrades etcd
nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
ps -ef |grep etcd
fi
sleep 5
fi
@@ -97,7 +97,7 @@ function _train(){
--auto_tuner_json ${autoconfig_json_file} run_pretrain.py ${modle_json_file}"
;;
esac
cd ../llm/llama
cd ../llm
if [[ ${model_item} =~ "resume" ]];then
cp $PWD/autoconfig/resume.csv $PWD/autoconfig/llama7b_pretrain_history.csv
fi
@@ -28,7 +28,7 @@
python -m pip install -r ../requirements.txt
python -m pip install -r ../requirements-dev.txt
# get data
cd ../llm/gpt-3
cd ../llm
rm -rf data
mkdir data
wget -O data/gpt2-en-mmap.bin https://paddlenlp.bj.bcebos.com/datasets/PDC_DATASETS/PRETRAIN/openwebtext2/gpt/mmap/gpt2-en-mmap.bin
@@ -178,7 +178,7 @@ function _train(){
workerlog_id=0
;;
esac
cd ../llm/gpt-3
cd ../llm
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间
${train_cmd} > ${log_file} 2>&1
@@ -18,9 +18,10 @@ python -m pip install -r ../requirements-dev.txt
# install fused_ln custom ops
cd ../legacy/model_zoo/gpt-3/external_ops/
python setup.py install
cd -

# install tool_helpers
cd ../../../../llm/llama
cd ../llm/
python -m pip install tool_helpers

wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
@@ -175,7 +175,7 @@ function _train(){
workerlog_id=0
;;
esac
cd ../llm/llama
cd ../llm
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
python -c "import paddlenlp"
if [[ ${model_name_or_path} =~ "CE" ]];then # CE精度-不限制执行时间
@@ -18,9 +18,10 @@ python -m pip install -r ../requirements-dev.txt
# install fused_ln custom ops
cd ../legacy/model_zoo/gpt-3/external_ops/
python setup.py install
cd -

# install tool_helpers
cd ../../../../llm/llama
cd ../llm/
python -m pip install tool_helpers

# download data
@@ -36,16 +37,13 @@ ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' '))
master_ip=${ip_lists[0]}
rank=$PADDLE_TRAINER_ID
echo $master_ip $rank
if [ $rank == 0 ]; then
net=$(netstat -anp | grep "2379 " | grep "LISTEN")
if [ ${#net} == 0 ]; then
nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
ps -ef |grep etcd
fi
else
sleep 5
fi
# For multi-node jobs, start the etcd service on every machine so they stay in sync; otherwise the multi-node run fails
net=$(netstat -anp | grep ":2379" | grep "LISTEN")
if [ ${#net} == 0 ]; then
nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
ps -ef |grep etcd
fi

# mv autoconfig
rm -rf auto_config_*
cp -r ../../tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_* ./
cp -r ../tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_* ./
@@ -128,7 +128,7 @@ function _train(){
./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-auto_tuner.json"
;;
esac
cd ../llm/llama
cd ../llm
rm -rf ./auto_config_${MODEL_TYPE}/*GBS*
rm -rf ./auto_config_${MODEL_TYPE}/*auto_tuner.log
rm -rf ./auto_config_${MODEL_TYPE}/*csv
@@ -0,0 +1,48 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


param="model_name_or_path=qwen/qwen-14b "
param+="per_device_train_batch_size=1 "
param+="data_parallel_degree=1 "
param+="tensor_parallel_degree=1 "
param+="pipeline_parallel_degree=4 "
param+="virtual_pp_degree=1 "
param+="sequence_parallel=0 "
param+="sharding_parallel_degree=8 "
param+="sharding=stage1 "
param+="recompute=0 "
param+="recompute_granularity=full_attn "
param+="run_mode=MP1-PP4-sharding8-mbs1-acc4 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="model_item=qwen-qwen-14b_seqlen4096_pretrain "
param+="max_steps=100 "
param+="gradient_accumulation_steps=4 "
param+="pp_recompute_interval=1 "
param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, "
# Parameters newly added for multi-node runs
param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads, "
param+="max_seq_length=4096 "
param+="min_learning_rate=0.000005 "
param+="save_steps=5000 "
param+="eval_steps=1000 "
param+="scale_loss=1024 "
param+="sharding_parallel_config=split_param,enable_stage1_overlap, "


cd ./tests
bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh

bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
@@ -0,0 +1,48 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


param="model_name_or_path=qwen/qwen-72b "
param+="per_device_train_batch_size=1 "
param+="data_parallel_degree=1 "
param+="tensor_parallel_degree=8 "
param+="pipeline_parallel_degree=4 "
param+="virtual_pp_degree=4 "
param+="sequence_parallel=1 "
param+="sharding_parallel_degree=1 "
param+="sharding=stage1 "
param+="recompute=0 "
param+="recompute_granularity=full_attn "
param+="run_mode=MP8-PP4-sharding1-mbs1-acc32 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="model_item=qwen-qwen-72b_seqlen4096_pretrain "
param+="max_steps=100 "
param+="gradient_accumulation_steps=32 "
param+="pp_recompute_interval=1 "
param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, "
# Parameters newly added for multi-node runs
param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads, "
param+="max_seq_length=4096 "
param+="min_learning_rate=0.000005 "
param+="save_steps=5000 "
param+="eval_steps=1000 "
param+="scale_loss=1024 "
param+="sharding_parallel_config=split_param,enable_stage1_overlap, "


cd ./tests
bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh

bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
@@ -0,0 +1,48 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


param="model_name_or_path=qwen/qwen-7b "
param+="per_device_train_batch_size=2 "
param+="data_parallel_degree=1 "
param+="tensor_parallel_degree=2 "
param+="pipeline_parallel_degree=1 "
param+="virtual_pp_degree=1 "
param+="sequence_parallel=1 "
param+="sharding_parallel_degree=16 "
param+="sharding=stage1 "
param+="recompute=0 "
param+="recompute_granularity=full_attn "
param+="run_mode=MP2-PP1-sharding16-mbs2-acc1 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="model_item=qwen-qwen-7b_seqlen4096_pretrain "
param+="max_steps=100 "
param+="gradient_accumulation_steps=1 "
param+="pp_recompute_interval=1 "
param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
# Parameters newly added for multi-node runs
param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads "
param+="max_seq_length=4096 "
param+="min_learning_rate=0.000005 "
param+="save_steps=5000 "
param+="eval_steps=1000 "
param+="scale_loss=1024 "
param+="sharding_parallel_config=split_param,enable_stage1_overlap "


cd ./tests
bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh

bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
@@ -20,9 +20,10 @@ python -m pip install tiktoken
# install fused_ln custom ops
cd ../legacy/model_zoo/gpt-3/external_ops/
python setup.py install
cd -

# install tool_helpers
cd ../../../../llm/qwen
cd ../llm/
python -m pip install tool_helpers

wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy