From f75ea54671f3656d6c685ebd65861c930cd85cdf Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Thu, 26 Sep 2024 06:20:57 +0000 Subject: [PATCH 1/5] add ut --- scripts/distribute/ci_case_auto.sh | 166 +++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh index 91ca0dffdec9..06b4eeff9c35 100755 --- a/scripts/distribute/ci_case_auto.sh +++ b/scripts/distribute/ci_case_auto.sh @@ -58,6 +58,7 @@ function llama_case_list_auto() { llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP2-SP llama_align_dygraph_dy2st_pir_auto_grad_merge_bs2_fp32_DP1-MP1-PP1 + llama_pir_auto_fuse_ffn_attention_qkv_MP2 } function llm_gpt_case_list_auto() { @@ -1068,6 +1069,171 @@ function llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP() { echo "=========== $FUNCNAME run end ===========" } +function llama_pir_auto_fuse_ffn_attention_qkv_MP2() { + echo "=========== $FUNCNAME run begin ===========" + export PYTHONPATH=$root_path/:$PYTHONPATH + export FLAGS_call_stack_level=3 + export FLAGS_max_inplace_grad_add=100 + export FLAGS_cudnn_deterministic=1 + export NVIDIA_TF32_OVERRIDE=0 + export FLAGS_embedding_deterministic=1 + export FLAGS_flash_attn_version=v1 + export PARALLEL_CROSS_ENTROPY=true + export FLAGS_enable_auto_parallel_align_mode=1 + + export FLAGS_enable_pir_api=1 + export FLAGS_enable_fused_ffn_qkv_pass=1 + + echo "---- run llama-7b with fused_ffn_qkv_pass and save ckpt ----" + auto_task_name="llama_pir_auto_fuse_ffn_attention_qkv_MP2" + auto_case_out_dir="auto_output/$auto_task_name" + auto_case_log_dir="auto_output/$auto_task_name""_log" + rm -rf $auto_case_out_dir + rm -rf $auto_case_log_dir + + python -u -m paddle.distributed.launch \ + --gpus "0,1" \ + --log_dir $auto_case_log_dir \ + run_pretrain_auto.py \ + --model_name_or_path "facebook/llama-7b" \ + --tokenizer_name_or_path "facebook/llama-7b" \ + --input_dir "./data" \ + --output_dir $auto_case_out_dir \ + --split 949,50,1 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --warmup_steps 30 \ + --max_grad_norm 0.0 \ + --learning_rate 3e-05 \ + --min_learning_rate 3e-06 \ + --max_steps 5 \ + --logging_steps 1 \ + --eval_steps 1000 \ + --save_steps 3 \ + --continue_training 0 \ + --do_train true \ + --do_eval false \ + --do_predict false \ + --disable_tqdm true \ + --skip_profile_timer true \ + --save_total_limit 2 \ + --device gpu \ + --disable_tqdm true \ + --dataloader_num_workers 1 \ + --distributed_dataloader 0 \ + --enable_auto_parallel 1 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --per_device_eval_batch_size 2 \ + --recompute false \ + --recompute_use_reentrant true \ + --recompute_granularity full \ + --pp_recompute_interval 0 \ + --bf16 1 \ + --fp16_opt_level "O2" \ + --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \ + --amp_custom_white_list "lookup_table" "lookup_table_v2" \ + --amp_master_grad true \ + --fuse_attention_ffn false \ + --fuse_attention_qkv false \ + --use_flash_attention true \ + --use_fused_rope true \ + --use_fused_rms_norm true \ + --max_seq_length 4096 \ + --sequence_parallel false \ + --pipeline_parallel_degree 1 \ + --sharding_parallel_degree 1 \ + --tensor_parallel_degree 2 \ + --virtual_pp_degree 1 \ + --pipeline_schedule_mode "VPP" \ + --sharding "" \ + --to_static 1 \ + --num_hidden_layers 2 \ + >>${log_path}/$FUNCNAME 2>&1 + + auto_loss=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 5' | 
awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + auto_ips=-1 + auto_mem=-1 + echo "auto result: step 5 loss=$auto_loss ips=$auto_ips mem=$auto_mem" + + echo "---- resune llama-7b with fused_ffn_qkv_pass from checkpoint3 ----" + resume_auto_task_name="resume_llama_pir_auto_fuse_ffn_attention_qkv_MP2" + resume_auto_case_out_dir="auto_output/$resume_auto_task_name" + resume_auto_case_log_dir="auto_output/$resume_auto_task_name""_log" + rm -rf $resume_auto_case_out_dir + rm -rf $resume_auto_case_log_dir + + python -u -m paddle.distributed.launch \ + --gpus "0,1" \ + --log_dir $resume_auto_case_log_dir \ + run_pretrain_auto.py \ + --model_name_or_path "facebook/llama-7b" \ + --tokenizer_name_or_path "facebook/llama-7b" \ + --input_dir "./data" \ + --output_dir $auto_case_out_dir \ + --split 949,50,1 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --warmup_steps 30 \ + --max_grad_norm 0.0 \ + --learning_rate 3e-05 \ + --min_learning_rate 3e-06 \ + --max_steps 5 \ + --logging_steps 1 \ + --eval_steps 1000 \ + --save_steps 3 \ + --continue_training 0 \ + --do_train true \ + --do_eval false \ + --do_predict false \ + --disable_tqdm true \ + --skip_profile_timer true \ + --save_total_limit 2 \ + --device gpu \ + --disable_tqdm true \ + --dataloader_num_workers 1 \ + --distributed_dataloader 0 \ + --enable_auto_parallel 1 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --per_device_eval_batch_size 2 \ + --recompute false \ + --recompute_use_reentrant true \ + --recompute_granularity full \ + --pp_recompute_interval 0 \ + --bf16 1 \ + --fp16_opt_level "O2" \ + --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \ + --amp_custom_white_list "lookup_table" "lookup_table_v2" \ + --amp_master_grad true \ + --fuse_attention_ffn false \ + --fuse_attention_qkv false \ + --use_flash_attention true \ + --use_fused_rope true \ + --use_fused_rms_norm true \ + --max_seq_length 4096 \ + --sequence_parallel false \ + --pipeline_parallel_degree 1 \ + --sharding_parallel_degree 1 \ + --tensor_parallel_degree 2 \ + --virtual_pp_degree 1 \ + --pipeline_schedule_mode "VPP" \ + --sharding "" \ + --to_static 1 \ + --num_hidden_layers 2 \ + --resume_from_checkpoint "auto_output/llama_pir_auto_fuse_ffn_attention_qkv_MP2/checkpoint-3" \ + >>${log_path}/$FUNCNAME 2>&1 + + + resume_auto_loss=`cat $resume_auto_task_name/workerlog.0 | grep 'global_step: 5' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + resume_auto_ips=-1 + resume_auto_mem=-1 + echo "resume auto result: step 5 loss=$resume_auto_loss ips=$resume_auto_ips mem=$resume_auto_mem" + + check_result $FUNCNAME ${auto_ips} ${resume_auto_loss} ${auto_ips} ${resume_auto_ips} ${auto_mem} ${resume_auto_mem} + echo "=========== $FUNCNAME run end ===========" +} + function llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP2-SP() { echo "=========== $FUNCNAME run begin ===========" export PYTHONPATH=$root_path/:$PYTHONPATH From b6284303352e8bda936b015a6589ff7ed90ebdd5 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Thu, 26 Sep 2024 09:50:58 +0000 Subject: [PATCH 2/5] add ut --- scripts/distribute/ci_case_auto.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh index 06b4eeff9c35..994a41d141ab 100755 --- a/scripts/distribute/ci_case_auto.sh +++ b/scripts/distribute/ci_case_auto.sh @@ -1129,14 +1129,14 @@ function llama_pir_auto_fuse_ffn_attention_qkv_MP2() { --recompute_use_reentrant true \ 
--recompute_granularity full \
     --pp_recompute_interval 0 \
-    --bf16 1 \
+    --bf16 0 \
     --fp16_opt_level "O2" \
     --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
     --amp_custom_white_list "lookup_table" "lookup_table_v2" \
-    --amp_master_grad true \
+    --amp_master_grad false \
     --fuse_attention_ffn false \
     --fuse_attention_qkv false \
-    --use_flash_attention true \
+    --use_flash_attention false \
     --use_fused_rope true \
     --use_fused_rms_norm true \
     --max_seq_length 4096 \
@@ -1201,14 +1201,14 @@ function llama_pir_auto_fuse_ffn_attention_qkv_MP2() {
     --recompute_use_reentrant true \
     --recompute_granularity full \
     --pp_recompute_interval 0 \
-    --bf16 1 \
+    --bf16 0 \
     --fp16_opt_level "O2" \
     --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
     --amp_custom_white_list "lookup_table" "lookup_table_v2" \
-    --amp_master_grad true \
+    --amp_master_grad false \
     --fuse_attention_ffn false \
     --fuse_attention_qkv false \
-    --use_flash_attention true \
+    --use_flash_attention false \
     --use_fused_rope true \
     --use_fused_rms_norm true \
     --max_seq_length 4096 \

From 84c0c3ce57c0f4043ea6b59bfa1c14ef7645a987 Mon Sep 17 00:00:00 2001
From: zhangbo9674
Date: Wed, 9 Oct 2024 02:07:51 +0000
Subject: [PATCH 3/5] solve conflict

---
 scripts/distribute/ci_case_auto.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh
index cb82a68c6649..bbb820b60b15 100755
--- a/scripts/distribute/ci_case_auto.sh
+++ b/scripts/distribute/ci_case_auto.sh
@@ -52,13 +52,12 @@ function llama_case_list_auto() {
     # llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP2
     # llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP2-VPP2-Sharding2_stage2
     # llama_static_auto_recompute_bs16_fp16_DP2-MP2-PP2-VPP2-Sharding2_stage2
-
+    llama_pir_auto_fuse_ffn_attention_qkv_MP2
     llama_align_dygraph_dy2st_auto_bs2_bf16_DP2-MP1-PP1
     llama_convert_hybrid_ckpt_to_auto_parallel_bs2_fp32_DP2-MP1-PP1
     llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP
     llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP2-SP
     llama_align_dygraph_dy2st_pir_auto_grad_merge_bs2_fp32_DP1-MP1-PP1
-    llama_pir_auto_fuse_ffn_attention_qkv_MP2
     llama_align_dy2st_fthenb_and_vpp_auto_bs2_fp32_DP1-MP1-PP4
     llama_align_dygraph_dy2st_pir_auto_pp_bs2_bf16_DP1-MP1-PP4
 }

From 54ab1c364360a9dd18dcabec9cbac2db40940bdd Mon Sep 17 00:00:00 2001
From: zhangbo9674
Date: Wed, 9 Oct 2024 07:13:31 +0000
Subject: [PATCH 4/5] solve conflict

---
 scripts/distribute/ci_case_auto.sh | 81 ++----------------------------
 1 file changed, 4 insertions(+), 77 deletions(-)

diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh
index bbb820b60b15..fd0c6d5ba5c2 100755
--- a/scripts/distribute/ci_case_auto.sh
+++ b/scripts/distribute/ci_case_auto.sh
@@ -1086,7 +1086,6 @@ function llama_pir_auto_fuse_ffn_attention_qkv_MP2() {
     export FLAGS_enable_pir_api=1
     export FLAGS_enable_fused_ffn_qkv_pass=1
 
-    echo "---- run llama-7b with fused_ffn_qkv_pass and save ckpt ----"
     auto_task_name="llama_pir_auto_fuse_ffn_attention_qkv_MP2"
     auto_case_out_dir="auto_output/$auto_task_name"
     auto_case_log_dir="auto_output/$auto_task_name""_log"
@@ -1157,82 +1156,10 @@ function llama_pir_auto_fuse_ffn_attention_qkv_MP2() {
     auto_ips=-1
     auto_mem=-1
     echo "auto result: step 5 loss=$auto_loss ips=$auto_ips mem=$auto_mem"
-
-    echo "---- resune llama-7b with fused_ffn_qkv_pass from checkpoint3 ----"
-    resume_auto_task_name="resume_llama_pir_auto_fuse_ffn_attention_qkv_MP2"
-    
resume_auto_case_out_dir="auto_output/$resume_auto_task_name" - resume_auto_case_log_dir="auto_output/$resume_auto_task_name""_log" - rm -rf $resume_auto_case_out_dir - rm -rf $resume_auto_case_log_dir - - python -u -m paddle.distributed.launch \ - --gpus "0,1" \ - --log_dir $resume_auto_case_log_dir \ - run_pretrain_auto.py \ - --model_name_or_path "facebook/llama-7b" \ - --tokenizer_name_or_path "facebook/llama-7b" \ - --input_dir "./data" \ - --output_dir $auto_case_out_dir \ - --split 949,50,1 \ - --weight_decay 0.01 \ - --warmup_ratio 0.01 \ - --warmup_steps 30 \ - --max_grad_norm 0.0 \ - --learning_rate 3e-05 \ - --min_learning_rate 3e-06 \ - --max_steps 5 \ - --logging_steps 1 \ - --eval_steps 1000 \ - --save_steps 3 \ - --continue_training 0 \ - --do_train true \ - --do_eval false \ - --do_predict false \ - --disable_tqdm true \ - --skip_profile_timer true \ - --save_total_limit 2 \ - --device gpu \ - --disable_tqdm true \ - --dataloader_num_workers 1 \ - --distributed_dataloader 0 \ - --enable_auto_parallel 1 \ - --per_device_train_batch_size 1 \ - --gradient_accumulation_steps 1 \ - --per_device_eval_batch_size 2 \ - --recompute false \ - --recompute_use_reentrant true \ - --recompute_granularity full \ - --pp_recompute_interval 0 \ - --bf16 0 \ - --fp16_opt_level "O2" \ - --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \ - --amp_custom_white_list "lookup_table" "lookup_table_v2" \ - --amp_master_grad false \ - --fuse_attention_ffn false \ - --fuse_attention_qkv false \ - --use_flash_attention false \ - --use_fused_rope true \ - --use_fused_rms_norm true \ - --max_seq_length 4096 \ - --sequence_parallel false \ - --pipeline_parallel_degree 1 \ - --sharding_parallel_degree 1 \ - --tensor_parallel_degree 2 \ - --virtual_pp_degree 1 \ - --pipeline_schedule_mode "VPP" \ - --sharding "" \ - --to_static 1 \ - --num_hidden_layers 2 \ - --resume_from_checkpoint "auto_output/llama_pir_auto_fuse_ffn_attention_qkv_MP2/checkpoint-3" \ - >>${log_path}/$FUNCNAME 2>&1 - - - resume_auto_loss=`cat $resume_auto_task_name/workerlog.0 | grep 'global_step: 5' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` - resume_auto_ips=-1 - resume_auto_mem=-1 - echo "resume auto result: step 5 loss=$resume_auto_loss ips=$resume_auto_ips mem=$resume_auto_mem" - - check_result $FUNCNAME ${auto_ips} ${resume_auto_loss} ${auto_ips} ${resume_auto_ips} ${auto_mem} ${resume_auto_mem} + loss_base=10.21024895 + ips_base=-1 + mem_base=-1 + check_result $FUNCNAME ${auto_loss} ${loss_base} ${auto_ips} ${ips_base} ${auto_mem} ${mem_base} echo "=========== $FUNCNAME run end ===========" } From 3fc1af433a428a6f5408930a9c743944874d6fec Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Thu, 10 Oct 2024 02:07:30 +0000 Subject: [PATCH 5/5] fix --- scripts/distribute/ci_case_auto.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh index fd0c6d5ba5c2..d73b1452f652 100755 --- a/scripts/distribute/ci_case_auto.sh +++ b/scripts/distribute/ci_case_auto.sh @@ -1159,7 +1159,10 @@ function llama_pir_auto_fuse_ffn_attention_qkv_MP2() { loss_base=10.21024895 ips_base=-1 mem_base=-1 - check_result $FUNCNAME ${auto_loss} ${loss_base} ${auto_ips} ${ips_base} ${auto_mem} ${mem_base} + if [ $IS_A100 -ne 0 ];then + loss_base=10.27925682 + fi + check_result $FUNCNAME ${loss_base} ${auto_loss} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem} echo "=========== $FUNCNAME run end ===========" }
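
Note: the pass/fail logic these commits converge on is to scrape the step-5 training loss out of the launcher log with grep/awk and hand it to check_result together with a hard-coded baseline (PATCH 5/5 adds a separate baseline for A100 machines and puts the baseline before the observed value in the argument list). The sketch below isolates that pattern so it can be exercised outside CI; the sample log line and demo.log path are made up for illustration, and the inline comparison is a simplified stand-in for the real check_result helper, which is defined elsewhere in ci_case_auto.sh along with the IS_A100 flag.

#!/usr/bin/env bash
# Hedged sketch of the baseline check used by llama_pir_auto_fuse_ffn_attention_qkv_MP2.
# demo.log and its contents are hypothetical; the real case reads auto_output/..._log/workerlog.0.

# A made-up trainer log line whose format matches what the grep/awk pipeline expects.
echo "global_step: 5, loss: 10.21024895, lr: 3e-05" > demo.log

# Same extraction as the patch: take the text after "loss: " up to the next comma.
auto_loss=$(grep 'global_step: 5' demo.log | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}')
auto_ips=-1   # throughput and memory are not checked for this case
auto_mem=-1

# Baselines as set in PATCH 5/5: one default value, overridden on A100 hosts.
loss_base=10.21024895
ips_base=-1
mem_base=-1
IS_A100=${IS_A100:-0}   # in CI this flag is provided by the surrounding script
if [ "$IS_A100" -ne 0 ]; then
    loss_base=10.27925682
fi

# Simplified stand-in for check_result: fail when the observed loss differs from the baseline.
if [ "$auto_loss" != "$loss_base" ]; then
    echo "FAIL: loss=$auto_loss expected=$loss_base"
    exit 1
fi
echo "PASS: loss=$auto_loss ips=$auto_ips mem=$auto_mem"

If the run dies before global_step 5, the grep finds nothing, auto_loss stays empty, and the comparison fails, which is the behavior the CI gate wants.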