From 6a21aa996c095c7539ea71cdbfb3ab24666143e7 Mon Sep 17 00:00:00 2001
From: heavyrain_lzy <1528794076@qq.com>
Date: Fri, 23 Feb 2024 15:21:13 +0800
Subject: [PATCH] fix dataloader bug and add ci for static

---
 paddlenlp/trainer/auto_trainer.py  |  7 ++-
 scripts/distribute/ci_case_auto.sh | 70 ++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/paddlenlp/trainer/auto_trainer.py b/paddlenlp/trainer/auto_trainer.py
index c6d6f2962daf..386228122c0b 100644
--- a/paddlenlp/trainer/auto_trainer.py
+++ b/paddlenlp/trainer/auto_trainer.py
@@ -92,9 +92,12 @@ def _get_meshes_for_loader(self):
         def _get_mesh(pp_idx=0):
             return self.global_mesh.get_mesh_with_dim("pp")[pp_idx]
 
+        # Note(lizhiyu): If the values returned by `DataLoader` have the format `[images, labels]`,
+        # an error may occur here.
         meshes = []
-        for pp_idx in range(self.args.pipeline_parallel_degree):
-            meshes.append(_get_mesh(pp_idx))
+        meshes.append(_get_mesh(0))
+        if self.args.pipeline_parallel_degree > 1:
+            meshes.append(_get_mesh(self.args.pipeline_parallel_degree - 1))
         return meshes
 
     def _wrap_for_dist_loader(self, train_dataloader):
diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh
index 93e58b38f6cb..5b8a0bd0caf2 100644
--- a/scripts/distribute/ci_case_auto.sh
+++ b/scripts/distribute/ci_case_auto.sh
@@ -50,6 +50,8 @@ function llama_case_list_auto() {
     llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2
     llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2
 
+    llama_dygraph2static_auto_bs8_fp16_DP2-MP2-PP2
+
     llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1
     llama_static_auto_recompute_bs16_fp32_DP2-MP1-PP1
     llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP1
@@ -1509,6 +1511,74 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run end ==========="
 }
+
+function llama_dygraph2static_auto_bs8_fp16_DP2-MP2-PP2() {
+    echo "=========== $FUNCNAME run begin ==========="
+    export PYTHONPATH=$root_path/:$PYTHONPATH
+    export FLAGS_call_stack_level=3
+    export NVIDIA_TF32_OVERRIDE=0
+
+    task_name="llama_auto_bs8_fp16_dp2mp2pp2"
+    case_out_dir="output/$task_name"
+    case_log_dir="output/$task_name""_log"
+    rm -rf $case_out_dir
+    rm -rf $case_log_dir
+
+    python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" --log_dir $case_log_dir run_pretrain_auto.py \
+        --model_type "llama" \
+        --model_name_or_path "facebook/llama-7b" \
+        --tokenizer_name_or_path "facebook/llama-7b" \
+        --input_dir "./data" \
+        --output_dir $case_out_dir \
+        --split 949,50,1 \
+        --max_seq_length 2048 \
+        --hidden_size 1024 \
+        --intermediate_size 3072 \
+        --num_hidden_layers 8 \
+        --num_attention_heads 32 \
+        --per_device_train_batch_size 1 \
+        --per_device_eval_batch_size 4 \
+        --gradient_accumulation_steps 4 \
+        --use_flash_attention 0 \
+        --use_fused_rms_norm 0 \
+        --fp16 1 \
+        --fp16_opt_level "O2" \
+        --amp_master_grad 1 \
+        --scale_loss 1024 \
+        --pipeline_parallel_degree 2 \
+        --tensor_parallel_degree 2 \
+        --sharding_parallel_degree 1 \
+        --learning_rate 0.0001 \
+        --min_learning_rate 0.00001 \
+        --max_steps 10 \
+        --save_steps 5000 \
+        --weight_decay 0.01 \
+        --warmup_ratio 0.01 \
+        --logging_steps 1 \
+        --dataloader_num_workers 1 \
+        --sharding "" \
+        --eval_steps 1000000 \
+        --disable_tqdm true \
+        --continue_training 0 \
+        --recompute 0 \
+        --do_train \
+        --do_eval \
+        --device "gpu" \
+        --data_impl "mmap" \
+        --enable_auto_parallel 1 \
+        --to_static 1 \
+        --max_grad_norm 1.0 \
+        >>${log_path}/$FUNCNAME 2>&1
+    loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+    ips=-1
+    mem=-1
+    echo "result: loss=$loss ips=$ips mem=$mem"
+    loss_base=9.38341904
+    ips_base=-1
+    mem_base=-1
+    check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+    echo "=========== $FUNCNAME run end ==========="
+}
 ############ case end ############
 
 function check_result() {
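
The auto_trainer.py hunk above changes which meshes the distributed dataloader receives: instead of one mesh per pipeline stage, it now gets the first stage mesh and, when pipeline_parallel_degree > 1, the last stage mesh (presumably because the loader yields only two values, such as inputs and labels, that need placing). A minimal standalone Python sketch of that selection logic; select_loader_meshes and stage_meshes are hypothetical stand-ins for _get_meshes_for_loader and the sub-meshes returned by self.global_mesh.get_mesh_with_dim("pp"):

    def select_loader_meshes(stage_meshes, pipeline_parallel_degree):
        # The first pipeline stage always receives a mesh (e.g. for the input ids).
        meshes = [stage_meshes[0]]
        # With pipeline parallelism enabled, the last stage also gets a mesh
        # (e.g. for the labels consumed by the loss on the final stage).
        if pipeline_parallel_degree > 1:
            meshes.append(stage_meshes[pipeline_parallel_degree - 1])
        return meshes

    # Example usage: with pp degree 2 the loader sees the stage-0 and stage-1
    # meshes; with pp degree 1 it sees only the single stage-0 mesh.
    print(select_loader_meshes(["mesh_pp0", "mesh_pp1"], 2))  # ['mesh_pp0', 'mesh_pp1']
    print(select_loader_meshes(["mesh_pp0"], 1))              # ['mesh_pp0']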