diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py
index ffb2b0c885b6..b027b1178247 100644
--- a/paddlenlp/trainer/training_args.py
+++ b/paddlenlp/trainer/training_args.py
@@ -1039,7 +1039,10 @@ def __post_init__(self):
             strategy = fleet.DistributedStrategy()
             assert self.data_parallel_config == "", "data_parallle_config is not supported in hybrid parallel"
             if self.pipeline_parallel_degree > 1:
-                pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
+                if " " in self.pipeline_parallel_config:
+                    pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
+                else:
+                    pipeline_parallel_config = set(self.pipeline_parallel_config.split(","))
                 for x in pipeline_parallel_config:
                     if len(x) > 0:
                         if x not in [
@@ -1222,7 +1225,10 @@ def is_segment_parallel_supported():
             strategy.hybrid_configs = hybrid_configs
 
             if self.sharding_parallel_degree > 1:
-                sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
+                if " " in self.sharding_parallel_config:
+                    sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
+                else:
+                    sharding_parallel_config = set(self.sharding_parallel_config.split(","))
                 for x in sharding_parallel_config:
                     if len(x) > 0:
                         if x not in [
@@ -1378,7 +1384,10 @@ def is_segment_parallel_supported():
 
             # navie-pp: pipeline_parallel_degree > 1 and gradient_accumulation_steps == 1
             if self.pipeline_parallel_degree > 1 and self.gradient_accumulation_steps > 1:
-                pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
+                if " " in self.pipeline_parallel_config:
+                    pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
+                else:
+                    pipeline_parallel_config = set(self.pipeline_parallel_config.split(","))
                 for x in pipeline_parallel_config:
                     if len(x) > 0:
                         if x not in [
@@ -1464,7 +1473,10 @@ def is_segment_parallel_supported():
             elif ShardingOption.FULL_SHARD in self.sharding:
                 sharding.stage = 3
 
-            sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
+            if " " in self.sharding_parallel_config:
+                sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
+            else:
+                sharding_parallel_config = set(self.sharding_parallel_config.split(","))
             for x in sharding_parallel_config:
                 if len(x) > 0:
                     if x not in [
diff --git a/scripts/distribute/ci_case_dy.sh b/scripts/distribute/ci_case_dy.sh
index b75900b9fa3f..90ae3de18d4f 100644
--- a/scripts/distribute/ci_case_dy.sh
+++ b/scripts/distribute/ci_case_dy.sh
@@ -22,7 +22,7 @@
 export root_path=/workspace/PaddleNLP
 export gpt_case_path=$root_path/legacy/model_zoo/gpt-3
 export gpt_data_path=/fleetx_data
-export llm_gpt_case_path=$root_path/llm/gpt-3
+export llm_gpt_case_path=$root_path/llm
 export llm_gpt_data_path=/llm_gpt_data
 
 unset CUDA_VISIBLE_DEVICES
diff --git a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh
index 919913ebdf21..b80fcb253163 100644
--- a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh
+++ b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh
@@ -33,15 +33,12 @@ else
     master_ip=${ip_lists[0]}
     rank=$PADDLE_TRAINER_ID
     echo $master_ip $rank
-    if [ $rank == 0 ]; then
-        net=$(netstat -anp | grep :2379 | grep "LISTEN")
-        if [ ${#net} == 0 ]; then
-            apt-get install -y --allow-downgrades etcd
-            nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
-            ps -ef |grep etcd
-        fi
-    else
-        sleep 5
-    fi
+    #多机任务在每台机器上都启动服务,保证同步,否则多机运行会报错
+    net=$(netstat -anp | grep :2379 | grep "LISTEN")
+    if [ ${#net} == 0 ]; then
+        apt-get install -y --allow-downgrades etcd
+        nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
+        ps -ef |grep etcd
+    fi
     sleep 5
 fi
diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh
index 3e51c5dc6fb7..3e9ba0d6e185 100644
--- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh
+++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh
@@ -18,9 +18,10 @@ python -m pip install -r ../requirements-dev.txt
 # install fused_ln custom ops
 cd ../legacy/model_zoo/gpt-3/external_ops/
 python setup.py install
+cd -
 
 # install tool_helpers
-cd ../../../../llm/llama
+cd ../llm
 python -m pip install tool_helpers
 
 rm -rf data && mkdir data
@@ -32,7 +33,7 @@ mv llama_openwebtext_100k.idx ./data
 
 # mv autoconfig
 rm -rf autoconfig
-cp -r ../../tests/test_tipc/auto_tuner/autoconfig ./
+cp -r ../tests/test_tipc/auto_tuner/autoconfig ./
 
 if [ -z "$1" ]; then
     echo "单机任务"
@@ -43,15 +44,12 @@ else
     master_ip=${ip_lists[0]}
     rank=$PADDLE_TRAINER_ID
     echo $master_ip $rank
-    if [ $rank == 0 ]; then
-        net=$(netstat -anp | grep :2379 | grep "LISTEN")
-        if [ ${#net} == 0 ]; then
-            apt-get install -y --allow-downgrades etcd
-            nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
-            ps -ef |grep etcd
-        fi
-    else
-        sleep 5
-    fi
+    #多机任务在每台机器上都启动服务,保证同步,否则多机运行会报错
+    net=$(netstat -anp | grep :2379 | grep "LISTEN")
+    if [ ${#net} == 0 ]; then
+        apt-get install -y --allow-downgrades etcd
+        nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
+        ps -ef |grep etcd
+    fi
     sleep 5
 fi
diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh
index c0394a0bd681..6fad1dc36540 100644
--- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh
@@ -97,7 +97,7 @@ function _train(){
             --auto_tuner_json ${autoconfig_json_file} run_pretrain.py ${modle_json_file}"
             ;;
     esac
-    cd ../llm/llama
+    cd ../llm
     if [[ ${model_item} =~ "resume" ]];then
         cp $PWD/autoconfig/resume.csv $PWD/autoconfig/llama7b_pretrain_history.csv
     fi
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/prepare.sh
index 9807869fb9e6..f170bc54b6c1 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/prepare.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/prepare.sh
@@ -28,7 +28,7 @@ python -m pip install -r ../requirements.txt
 python -m pip install -r ../requirements-dev.txt
 
 # get data
-cd ../llm/gpt-3
+cd ../llm
 rm -rf data
 mkdir data
 wget -O data/gpt2-en-mmap.bin https://paddlenlp.bj.bcebos.com/datasets/PDC_DATASETS/PRETRAIN/openwebtext2/gpt/mmap/gpt2-en-mmap.bin
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh
index fa05504c743b..315489b3111f 100755
--- a/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/ce_gpt/benchmark_common/run_benchmark.sh
@@ -178,7 +178,7 @@ function _train(){
             workerlog_id=0
             ;;
     esac
-    cd ../llm/gpt-3
+    cd ../llm
    echo "train_cmd: ${train_cmd}  log_file: ${log_file}"
    if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间
        ${train_cmd} > ${log_file} 2>&1
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
index 388b179e6905..e3ebf51a1908 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/prepare.sh
@@ -18,9 +18,10 @@ python -m pip install -r ../requirements-dev.txt
 # install fused_ln custom ops
 cd ../legacy/model_zoo/gpt-3/external_ops/
 python setup.py install
+cd -
 
 # install tool_helpers
-cd ../../../../llm/llama
+cd ../llm/
 python -m pip install tool_helpers
 
 wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh
index 6e4862bf3c43..549a7a98f5a7 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh
@@ -175,7 +175,7 @@ function _train(){
             workerlog_id=0
             ;;
     esac
-    cd ../llm/llama
+    cd ../llm
    echo "train_cmd: ${train_cmd}  log_file: ${log_file}"
    python -c "import paddlenlp"
    if [[ ${model_name_or_path} =~ "CE" ]];then # CE精度-不限制执行时间
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh
index 9405521c7b3f..9b2f06c9f434 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/prepare.sh
@@ -18,9 +18,10 @@ python -m pip install -r ../requirements-dev.txt
 # install fused_ln custom ops
 cd ../legacy/model_zoo/gpt-3/external_ops/
 python setup.py install
+cd -
 
 # install tool_helpers
-cd ../../../../llm/llama
+cd ../llm/
 python -m pip install tool_helpers
 
 # download data
@@ -36,16 +37,13 @@ ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' '))
 master_ip=${ip_lists[0]}
 rank=$PADDLE_TRAINER_ID
 echo $master_ip $rank
-if [ $rank == 0 ]; then
-    net=$(netstat -anp | grep "2379 " | grep "LISTEN")
-    if [ ${#net} == 0 ]; then
-        nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
-        ps -ef |grep etcd
-    fi
-else
-    sleep 5
-fi
+#多机任务在每台机器上都启动服务,保证同步,否则多机运行会报错
+net=$(netstat -anp | grep ":2379" | grep "LISTEN")
+if [ ${#net} == 0 ]; then
+    nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
+    ps -ef |grep etcd
+fi
 
 # mv autoconfig
 rm -rf auto_config_*
-cp -r ../../tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_* ./
+cp -r ../tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_* ./
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh
index 7ddcab91e056..9afb2a0902c8 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh
@@ -128,7 +128,7 @@ function _train(){
             ./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-auto_tuner.json"
             ;;
     esac
-    cd ../llm/llama
+    cd ../llm
    rm -rf ./auto_config_${MODEL_TYPE}/*GBS*
    rm -rf ./auto_config_${MODEL_TYPE}/*auto_tuner.log
    rm -rf ./auto_config_${MODEL_TYPE}/*csv
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP1-PP4-sharding8-mbs1-acc4.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP1-PP4-sharding8-mbs1-acc4.sh
new file mode 100644
index 000000000000..001ccbf0a07f
--- /dev/null
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP1-PP4-sharding8-mbs1-acc4.sh
@@ -0,0 +1,48 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=qwen/qwen-14b "
+param+="per_device_train_batch_size=1 "
+param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=1 "
+param+="pipeline_parallel_degree=4 "
+param+="virtual_pp_degree=1 "
+param+="sequence_parallel=0 "
+param+="sharding_parallel_degree=8 "
+param+="sharding=stage1 "
+param+="recompute=0 "
+param+="recompute_granularity=full_attn "
+param+="run_mode=MP1-PP4-sharding8-mbs1-acc4 "
+param+="device_num=N4C32 "
+param+="global_batch_size=32 "
+param+="model_item=qwen-qwen-14b_seqlen4096_pretrain "
+param+="max_steps=100 "
+param+="gradient_accumulation_steps=4 "
+param+="pp_recompute_interval=1 "
+param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, "
+#多机新添加的参数
+param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads, "
+param+="max_seq_length=4096 "
+param+="min_learning_rate=0.000005 "
+param+="save_steps=5000 "
+param+="eval_steps=1000 "
+param+="scale_loss=1024 "
+param+="sharding_parallel_config=split_param,enable_stage1_overlap, "
+
+
+cd ./tests
+bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-72b_seqlen4096_pretrain_bs32_bf16_MP8-PP4-sharding1-mbs1-acc32.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-72b_seqlen4096_pretrain_bs32_bf16_MP8-PP4-sharding1-mbs1-acc32.sh
new file mode 100644
index 000000000000..3bbaee756bde
--- /dev/null
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-72b_seqlen4096_pretrain_bs32_bf16_MP8-PP4-sharding1-mbs1-acc32.sh
@@ -0,0 +1,48 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=qwen/qwen-72b "
+param+="per_device_train_batch_size=1 "
+param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=8 "
+param+="pipeline_parallel_degree=4 "
+param+="virtual_pp_degree=4 "
+param+="sequence_parallel=1 "
+param+="sharding_parallel_degree=1 "
+param+="sharding=stage1 "
+param+="recompute=0 "
+param+="recompute_granularity=full_attn "
+param+="run_mode=MP8-PP4-sharding1-mbs1-acc32 "
+param+="device_num=N4C32 "
+param+="global_batch_size=32 "
+param+="model_item=qwen-qwen-72b_seqlen4096_pretrain "
+param+="max_steps=100 "
+param+="gradient_accumulation_steps=32 "
+param+="pp_recompute_interval=1 "
+param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, "
+#多机新添加的参数
+param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads, "
+param+="max_seq_length=4096 "
+param+="min_learning_rate=0.000005 "
+param+="save_steps=5000 "
+param+="eval_steps=1000 "
+param+="scale_loss=1024 "
+param+="sharding_parallel_config=split_param,enable_stage1_overlap, "
+
+
+cd ./tests
+bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs2-acc1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs2-acc1.sh
new file mode 100644
index 000000000000..69882b131a52
--- /dev/null
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs2-acc1.sh
@@ -0,0 +1,48 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=qwen/qwen-7b "
+param+="per_device_train_batch_size=2 "
+param+="data_parallel_degree=1 "
+param+="tensor_parallel_degree=2 "
+param+="pipeline_parallel_degree=1 "
+param+="virtual_pp_degree=1 "
+param+="sequence_parallel=1 "
+param+="sharding_parallel_degree=16 "
+param+="sharding=stage1 "
+param+="recompute=0 "
+param+="recompute_granularity=full_attn "
+param+="run_mode=MP2-PP1-sharding16-mbs2-acc1 "
+param+="device_num=N4C32 "
+param+="global_batch_size=32 "
+param+="model_item=qwen-qwen-7b_seqlen4096_pretrain "
+param+="max_steps=100 "
+param+="gradient_accumulation_steps=1 "
+param+="pp_recompute_interval=1 "
+param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
+#多机新添加的参数
+param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads "
+param+="max_seq_length=4096 "
+param+="min_learning_rate=0.000005 "
+param+="save_steps=5000 "
+param+="eval_steps=1000 "
+param+="scale_loss=1024 "
+param+="sharding_parallel_config=split_param,enable_stage1_overlap "
+
+
+cd ./tests
+bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh
index bf6952c135ca..deca6e05b6fc 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh
@@ -20,9 +20,10 @@ python -m pip install tiktoken
 # install fused_ln custom ops
 cd ../legacy/model_zoo/gpt-3/external_ops/
 python setup.py install
+cd -
 
 # install tool_helpers
-cd ../../../../llm/qwen
+cd ../llm/
 python -m pip install tool_helpers
 
 wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh
index a2b40044b5f9..2b7c5682d1ee 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh
@@ -38,6 +38,17 @@ function _set_params(){
     pipeline_parallel_config=${pipeline_parallel_config:-""}
     recompute_use_reentrant=${recompute_use_reentrant:-"true"}
     recompute_granularity=${recompute_granularity:-"full"}
+    #添加多机脚本时需要提取下面的不同参数
+    max_seq_length=${max_seq_length:-2048}
+    min_learning_rate=${min_learning_rate:-0.000001}
+    save_steps=${save_steps:-50000}
+    eval_steps=${eval_steps:-1001}
+    scale_loss=${scale_loss:-1024}
+    sharding_parallel_config=${sharding_parallel_config:-"split_param enable_stage1_overlap"}
+    # #替换pipeline_parallel_config和sharding_parallel_config的参数为空格表示
+    # pipeline_parallel_config=$(echo $pipeline_parallel_config | tr ',' ' ')
+    # sharding_parallel_config=$(echo $sharding_parallel_config | tr ',' ' ')
+
 
     base_batch_size=${global_batch_size}
 
@@ -46,6 +57,14 @@ function _set_params(){
     speed_unit="tokens/s"         # (必选)速度指标单位
     skip_steps=0                  # (必选)解析日志,跳过模型前几个性能不稳定的step
     keyword="ips:"                # (必选)解析日志,筛选出性能数据所在行的关键字
+    is_large_model=True
+    #添加多机脚本时直接取最后10步的平均值就可以
+    if [[ $device_num == "N4C32" ]]; then
+        skip_steps=10             # (必选)解析日志,跳过模型前几个性能不稳定的step
+        keyword="interval_tokens_per_second_per_device:"    # (必选)解析日志,筛选出性能数据所在行的关键字
+        is_large_model=False
+        model_mode=5
+    fi
 
     convergence_key="loss:"       # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
 
@@ -68,7 +87,7 @@ function _set_params(){
     mkdir -p $(dirname ${speed_log_file})
 
     OUTPUT_PATH=${run_log_path}/output
-    is_large_model=True
+
 }
 
 function _train(){
@@ -99,18 +118,32 @@ function _train(){
     # fi
 
     if [ "${pipeline_parallel_config}" != "" ]; then
+        #pipeline_parallel_config_args="--pipeline_parallel_config \"${pipeline_parallel_config}\""
         pipeline_parallel_config_args="--pipeline_parallel_config ${pipeline_parallel_config}"
     else
         pipeline_parallel_config_args=""
     fi
+    #qwen的多机需要额外设置的参数
+    if [ "${device_num}" == "N4C32" ]; then
+        # diff_args="--sequence_parallel ${sequence_parallel} --sharding_parallel_degree ${sharding_parallel_degree} \
+        #     --scale_loss 1024 --recompute_granularity ${recompute_granularity} \
+        #     --sharding_parallel_config \"${sharding_parallel_config}\""
+        diff_args="--sequence_parallel ${sequence_parallel} --sharding_parallel_degree ${sharding_parallel_degree} \
+            --scale_loss 1024 --recompute_granularity ${recompute_granularity} \
+            --sharding_parallel_config ${sharding_parallel_config}"
+    else
+        diff_args="--recompute_use_reentrant ${recompute_use_reentrant} \
+            --skip_memory_metrics 0 \
+            --data_cache ./data_cache"
+    fi
 
     use_pure_fp16=False
     train_cmd="--model_name_or_path ${model_name_or_path} \
               --tokenizer_name_or_path ${model_name_or_path} \
-              --input_dir ./qwen/data \
+              --input_dir ./data \
              --output_dir ./output \
              --split 949,50,1 \
-              --max_seq_length 2048 \
+              --max_seq_length ${max_seq_length} \
              --per_device_train_batch_size ${per_device_train_batch_size} \
              --gradient_accumulation_steps ${gradient_accumulation_steps} \
              --use_flash_attention 1 \
@@ -123,15 +156,15 @@ function _train(){
              --virtual_pp_degree ${virtual_pp_degree} \
              --pp_recompute_interval ${pp_recompute_interval} \
              --learning_rate 0.00001 \
-              --min_learning_rate 0.000001 \
+              --min_learning_rate ${min_learning_rate} \
              --max_steps ${max_steps} \
-              --save_steps 50000 \
+              --save_steps ${save_steps} \
              --weight_decay 0.01 \
              --warmup_ratio 0.01 \
              --max_grad_norm 1.0 \
              --logging_steps 1 \
              --dataloader_num_workers 1 \
-              --eval_steps 1001 \
+              --eval_steps ${eval_steps} \
              --sharding ${sharding} \
              --disable_tqdm true \
              --continue_training 0 \
@@ -142,10 +175,7 @@ function _train(){
              --fuse_attention_qkv true \
              --fuse_attention_ffn true \
              --tensor_parallel_config ${tensor_parallel_config} ${pipeline_parallel_config_args} \
-              --recompute ${recompute} \
-              --recompute_use_reentrant ${recompute_use_reentrant} \
-              --skip_memory_metrics 0 \
-              --data_cache ./data_cache"
+              --recompute ${recompute} ${diff_args}"
 
     if [ ${PADDLE_TRAINER_ID} ]
     then
diff --git a/tests/test_tipc/dygraph/moe/gpt/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/moe/gpt/benchmark_common/prepare.sh
index e511447a8af1..7d7e059c9594 100644
--- a/tests/test_tipc/dygraph/moe/gpt/benchmark_common/prepare.sh
+++ b/tests/test_tipc/dygraph/moe/gpt/benchmark_common/prepare.sh
@@ -1,4 +1,4 @@
-cd ../examples/language_model/moe/data_tools/
+cd ../legacy/model_zoo/moe/data_tools/
 sed -i "s/python3/python3.10/g" Makefile
 cd -
 
@@ -8,7 +8,7 @@ python3 -m pip install -r ../requirements.txt #-i https://pypi.tuna.tsinghua.edu
 python3 -m pip install pybind11 regex sentencepiece tqdm visualdl #-i https://mirror.baidu.com/pypi/simple
 python3 -m pip install --upgrade paddlenlp
 # get data
-cd ../examples/language_model/moe/dygraph/
+cd ../legacy/model_zoo/moe/dygraph/
 rm -rf data
 mkdir data && cd data
 wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt2/train.data.json_ids.npz
diff --git a/tests/test_tipc/dygraph/moe/gpt/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/moe/gpt/benchmark_common/run_benchmark.sh
index a38eb67b47ec..ab5797d7270a 100644
--- a/tests/test_tipc/dygraph/moe/gpt/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/moe/gpt/benchmark_common/run_benchmark.sh
@@ -147,7 +147,7 @@ function _train(){
             ;;
         *) echo "choose run_mode "; exit 1;
     esac
-    cd ../examples/language_model/moe/dygraph/
+    cd ../legacy/model_zoo/moe/dygraph/
    echo "train_cmd: ${train_cmd}  log_file: ${log_file}"
    python -c "import paddlenlp"
    if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间
diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh
index 697d5d1d92e0..cd0f05c27314 100644
--- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh
+++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh
@@ -18,13 +18,13 @@ python -m pip install -r ../requirements-dev.txt
 # install fused_ln custom ops
 cd ../legacy/model_zoo/gpt-3/external_ops/
 python setup.py install
+cd -
 
 # install tool_helpers
-cd ../../../../llm/llama
+cd ../llm/auto_parallel/llama
 python -m pip install tool_helpers
 
 # download data
-cd auto_parallel
 wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
 wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
 mkdir data
diff --git a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
index 16126a9198cc..03b2175d9465 100644
--- a/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/static/auto_parallel/llama2/benchmark_common/run_benchmark.sh
@@ -132,7 +132,7 @@ function _train(){
             ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json"
             ;;
     esac
-    cd ../llm/llama/auto_parallel/
+    cd ../llm/auto_parallel/llama
     # rm -rf ./auto_config_${MODEL_TYPE}/*GBS*
     # rm -rf ./auto_config_${MODEL_TYPE}/*auto_tuner.log
     # rm -rf ./auto_config_${MODEL_TYPE}/*csv
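
Illustration (not part of the patch): the training_args.py hunks above let pipeline_parallel_config and sharding_parallel_config be given either space-separated or comma-separated, which is what allows the new N4C32 scripts to pass comma-separated lists such as split_param,enable_stage1_overlap, through environment variables. A minimal standalone Python sketch of that parsing rule follows; parse_parallel_config is a hypothetical helper name used only for this example, while the patch itself inlines the logic in __post_init__.

# Standalone sketch of the comma-or-space splitting introduced above.
def parse_parallel_config(config: str) -> set:
    # Prefer space-separated values; otherwise fall back to commas,
    # so "a b c" and "a,b,c" produce the same set of options.
    if " " in config:
        return set(config.split(" "))
    return set(config.split(","))


if __name__ == "__main__":
    assert parse_parallel_config("enable_delay_scale_loss enable_release_grads") == parse_parallel_config(
        "enable_delay_scale_loss,enable_release_grads"
    )
    # A trailing comma (as in the N4C32 scripts) only yields an empty entry,
    # which the callers already skip via their `if len(x) > 0` check.
    print(parse_parallel_config("split_param,enable_stage1_overlap,"))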