Skip to content

Commit

Permalink
Fix dataloader bug and add CI for static mode
Browse files Browse the repository at this point in the history
  • Loading branch information
heavyrain-lzy committed Feb 23, 2024
1 parent 6a6a9fe commit 6a21aa9
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 2 deletions.
7 changes: 5 additions & 2 deletions paddlenlp/trainer/auto_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,12 @@ def _get_meshes_for_loader(self):
def _get_mesh(pp_idx=0):
return self.global_mesh.get_mesh_with_dim("pp")[pp_idx]

# Note(lizhiyu): If the values returned by `DataLoader` have the format `[images, labels]`,
# an error may occur here.
meshes = []
for pp_idx in range(self.args.pipeline_parallel_degree):
meshes.append(_get_mesh(pp_idx))
meshes.append(_get_mesh(0))
if self.args.pipeline_parallel_degree > 1:
meshes.append(_get_mesh(self.args.pipeline_parallel_degree - 1))

Check warning on line 100 in paddlenlp/trainer/auto_trainer.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/trainer/auto_trainer.py#L98-L100

Added lines #L98 - L100 were not covered by tests
return meshes

def _wrap_for_dist_loader(self, train_dataloader):
Expand Down
70 changes: 70 additions & 0 deletions scripts/distribute/ci_case_auto.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ function llama_case_list_auto() {
llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2
llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2

llama_dygraph2static_auto_bs8_fp16_DP2-MP2-PP2

llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1
llama_static_auto_recompute_bs16_fp32_DP2-MP1-PP1
llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP1
Expand Down Expand Up @@ -1509,6 +1511,74 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

function llama_dygraph2static_auto_bs8_fp16_DP2-MP2-PP2() {
    # CI case: LLaMA auto-parallel pretraining converted from dygraph to static
    # graph (--to_static 1), fp16 O2, DP2-MP2-PP2 across 8 GPUs.
    # Runs 10 steps and compares the final loss against a recorded baseline.
    echo "=========== $FUNCNAME run begin ==========="
    export PYTHONPATH=$root_path/:$PYTHONPATH
    export FLAGS_call_stack_level=3
    # NOTE(review): presumably TF32 is disabled so the loss baseline is
    # reproducible across GPU generations — confirm with the CI owners.
    export NVIDIA_TF32_OVERRIDE=0

    task_name="llama_auto_bs8_fp16_dp2mp2pp2"
    case_out_dir="output/$task_name"
    case_log_dir="output/${task_name}_log"
    # Start from a clean slate so stale checkpoints/logs cannot mask failures.
    rm -rf $case_out_dir $case_log_dir

    python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" --log_dir $case_log_dir run_pretrain_auto.py \
        --model_type "llama" \
        --model_name_or_path "facebook/llama-7b" \
        --tokenizer_name_or_path "facebook/llama-7b" \
        --input_dir "./data" \
        --output_dir $case_out_dir \
        --split 949,50,1 \
        --max_seq_length 2048 \
        --hidden_size 1024 \
        --intermediate_size 3072 \
        --num_hidden_layers 8 \
        --num_attention_heads 32 \
        --per_device_train_batch_size 1 \
        --per_device_eval_batch_size 4 \
        --gradient_accumulation_steps 4 \
        --use_flash_attention 0 \
        --use_fused_rms_norm 0 \
        --fp16 1 \
        --fp16_opt_level "O2" \
        --amp_master_grad 1 \
        --scale_loss 1024 \
        --pipeline_parallel_degree 2 \
        --tensor_parallel_degree 2 \
        --sharding_parallel_degree 1 \
        --learning_rate 0.0001 \
        --min_learning_rate 0.00001 \
        --max_steps 10 \
        --save_steps 5000 \
        --weight_decay 0.01 \
        --warmup_ratio 0.01 \
        --logging_steps 1 \
        --dataloader_num_workers 1 \
        --sharding "" \
        --eval_steps 1000000 \
        --disable_tqdm true \
        --continue_training 0 \
        --recompute 0 \
        --do_train \
        --do_eval \
        --device "gpu" \
        --data_impl "mmap" \
        --enable_auto_parallel 1 \
        --to_static 1 \
        --max_grad_norm 1.0 \
        >>${log_path}/$FUNCNAME 2>&1

    # Pull the loss of the final step (max_steps=10, so 'global_step: 10' is
    # unique) out of rank 0's log; ips/mem are not tracked for this case.
    loss=$(grep 'global_step: 10' $case_log_dir/workerlog.0 | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}')
    ips=-1
    mem=-1
    echo "result: loss=$loss ips=$ips mem=$mem"
    loss_base=9.38341904
    ips_base=-1
    mem_base=-1
    check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
    echo "=========== $FUNCNAME run end ==========="
}
############ case end ############

function check_result() {
Expand Down

0 comments on commit 6a21aa9

Please sign in to comment.