From 23d1c0f8a5c5c365202978b378f51d4f6957a3d9 Mon Sep 17 00:00:00 2001 From: blacksheep-Aristotle Date: Tue, 17 Dec 2024 14:25:52 +0800 Subject: [PATCH 1/8] fix qwen&baichaun&gpt ci error --- .../transformers/qwen/modeling_3D_auto.py | 33 +++++++++++++++---- ...bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh} | 2 +- .../baichuan2/benchmark_common/prepare.sh | 1 + .../gpt3/benchmark_common/prepare.sh | 2 +- .../pretrain-gpt3_13b.json | 2 +- .../qwen/benchmark_common/prepare.sh | 1 + .../qwen/benchmark_common/run_benchmark.sh | 6 ++-- .../pretrain-qwen_14b.json} | 0 8 files changed, 34 insertions(+), 13 deletions(-) rename tests/test_tipc/static/auto_parallel/baichuan2/N4C32/{meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh => meta-llama-baichuan-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh} (97%) rename tests/test_tipc/static/auto_parallel/qwen/{pretrain_config_qwen2_14b/pretrain-qwen2_14b.json => pretrain_config_qwen_14b/pretrain-qwen_14b.json} (100%) diff --git a/paddlenlp/transformers/qwen/modeling_3D_auto.py b/paddlenlp/transformers/qwen/modeling_3D_auto.py index ef26a29bbd67..46d48a9896df 100644 --- a/paddlenlp/transformers/qwen/modeling_3D_auto.py +++ b/paddlenlp/transformers/qwen/modeling_3D_auto.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import math +import os import warnings from functools import partial from typing import List @@ -84,6 +84,17 @@ def get_triangle_upper_mask(x, mask=None): return mask +def enable_fuse_ffn_qkv_pass(): + if os.getenv("FLAGS_enable_fused_ffn_qkv_pass") in [ + "True", + "true", + "1", + ]: + return True + else: + return False + + attention_cnt = 0 @@ -304,12 +315,20 @@ def __init__(self, config, ipp=None): super().__init__() ff_dim_in = config.intermediate_size // 2 self.fuse_attention_ffn = config.fuse_attention_ffn - self.w1 = nn.Linear(config.hidden_size, ff_dim_in, bias_attr=not config.no_bias) - self.w2 = nn.Linear(config.hidden_size, ff_dim_in, bias_attr=not config.no_bias) - self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias_attr=not config.no_bias) self.ipp = ipp - self.w1.weight = dist.shard_tensor(self.w1.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(1)]) - self.w2.weight = dist.shard_tensor(self.w2.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(1)]) + if config.fuse_attention_ffn and not enable_fuse_ffn_qkv_pass(): + self.gate_up_fused_proj = nn.Linear(config.hidden_size, ff_dim_in * 2, bias_attr=not config.no_bias) + self.gate_up_fused_proj.weight = dist.shard_tensor( + self.gate_up_fused_proj.weight, + get_mesh(self.ipp), + [dist.Replicate(), dist.Shard(1)], + ) + else: + self.w1 = nn.Linear(config.hidden_size, ff_dim_in, bias_attr=not config.no_bias) + self.w2 = nn.Linear(config.hidden_size, ff_dim_in, bias_attr=not config.no_bias) + self.w1.weight = dist.shard_tensor(self.w1.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(1)]) + self.w2.weight = dist.shard_tensor(self.w2.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(1)]) + self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias_attr=not config.no_bias) self.c_proj.weight = dist.shard_tensor( self.c_proj.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(0)] ) @@ -321,7 +340,7 @@ def forward(self, hidden_states): # a2 = self.w2(hidden_states) # intermediate_parallel = a1 * F.silu(a2) # down - if 
self.fuse_attention_ffn: + if self.fuse_attention_ffn and not enable_fuse_ffn_qkv_pass(): intermediate_parallel = swiglu(self.gate_up_fused_proj(hidden_states)) else: intermediate_parallel = swiglu(self.w2(hidden_states), self.w1(hidden_states)) diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh similarity index 97% rename from tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh rename to tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh index 4adc579b79de..41f6f9ded6a1 100644 --- a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh +++ b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh @@ -14,7 +14,7 @@ param="model_item=baichuan-inc-baichaun-2-13b_pretrain " param+="run_mode=DP1_MP2_PP4_1F1B_Sharding8_Stage2 " -param+="device_num=N4C32 " +param+="device_num=N1C8 " param+="global_batch_size=32 " param+="nnodes=4 " param+="model_type=baichuan2_13b " diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/prepare.sh index 90f4ce0363eb..7aeac7cfabaa 100644 --- a/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/prepare.sh +++ b/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/prepare.sh @@ -27,6 +27,7 @@ python -m pip install fast_dataindex # download data wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz +rm -rf data mkdir data mv llama_openwebtext_100k_ids.npy ./data mv llama_openwebtext_100k_idx.npz ./data diff --git a/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/prepare.sh index a7be57632792..df14f1e10816 100644 --- a/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/prepare.sh +++ b/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/prepare.sh @@ -27,7 +27,7 @@ python -m pip install fast_dataindex # download data wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz - +rm -rf data mkdir data mv gpt_en_dataset_300m_ids.npy ./data mv gpt_en_dataset_300m_idx.npz ./data diff --git a/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json b/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json index 129f55a349e2..fdcf2b9580be 100644 --- a/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json +++ b/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json @@ -3,7 +3,7 @@ "tokenizer_name_or_path": "gpt3-13B-en", "to_static": true, "enable_auto_parallel": 1, - "input_dir": "../llama_data", + "input_dir": "./data", "output_dir": "./checkpoints/gpt_pretrain_ckpts", 
"split": "949,50,1", "max_seq_length": 4096, diff --git a/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/prepare.sh index 7239135a96c5..aa048413d9bd 100644 --- a/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/prepare.sh +++ b/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/prepare.sh @@ -27,6 +27,7 @@ python -m pip install fast_dataindex # download data wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz +rm -rf data mkdir data mv llama_openwebtext_100k_ids.npy ./data mv llama_openwebtext_100k_idx.npz ./data diff --git a/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh index 41471ef5b21f..704081a29214 100644 --- a/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh @@ -173,17 +173,17 @@ function _train(){ N1C8) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ --nnodes 1 --nproc_per_node 8 \ - --log_dir mylog run_pretrain_auto.py \ + --log_dir mylog run_pretrain_3D_auto.py \ ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json" ;; N4C32) echo "Run with: device_num=${device_num} run_mode=${run_mode}" train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ - --log_dir mylog run_pretrain_auto.py \ + --log_dir mylog run_pretrain_3D_auto.py \ ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json" ;; *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ - --log_dir mylog run_pretrain_auto.py \ + --log_dir mylog run_pretrain_3D_auto.py \ ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json" ;; esac diff --git a/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen2_14b/pretrain-qwen2_14b.json b/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json similarity index 100% rename from tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen2_14b/pretrain-qwen2_14b.json rename to tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json From 1997808c3806afb454c4864621a4392c8f77315b Mon Sep 17 00:00:00 2001 From: blacksheep-Aristotle Date: Tue, 17 Dec 2024 14:28:32 +0800 Subject: [PATCH 2/8] fix qwen&baichaun&gpt ci error --- ...retrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh index 41f6f9ded6a1..4adc579b79de 100644 --- a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh +++ b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh @@ -14,7 +14,7 @@ param="model_item=baichuan-inc-baichaun-2-13b_pretrain " 
param+="run_mode=DP1_MP2_PP4_1F1B_Sharding8_Stage2 " -param+="device_num=N1C8 " +param+="device_num=N4C32 " param+="global_batch_size=32 " param+="nnodes=4 " param+="model_type=baichuan2_13b " From 96cf93abc0fcf74f6f8383c4555eb4d7d8ecdb68 Mon Sep 17 00:00:00 2001 From: blacksheep-Aristotle Date: Tue, 17 Dec 2024 16:00:15 +0800 Subject: [PATCH 3/8] fix qwen&baichaun&gpt ci error --- ...bf16_DP1_MP4_PP2_VPP1_Sharding4_Stage1.sh} | 2 +- .../pretrain-baichun2_13b-config.json | 6 +-- .../gpt/N4C32/gpt3_bs16_fp16_DP4-MP2-PP4.sh | 30 ++++++++++++ .../gpt/benchmark_common/run_benchmark.sh | 7 ++- ...bs128_bf16_MP2-PP4-sharding4-mbs1-acc32.sh | 49 +++++++++++++++++++ ...bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh} | 2 +- ...bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh} | 2 +- 7 files changed, 90 insertions(+), 8 deletions(-) rename tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/{baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh => baichuan-inc-Baichun2-13b_pretrain_bs128_bf16_DP1_MP4_PP2_VPP1_Sharding4_Stage1.sh} (96%) create mode 100644 tests/test_tipc/dygraph/hybrid_parallelism/gpt/N4C32/gpt3_bs16_fp16_DP4-MP2-PP4.sh create mode 100644 tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs128_bf16_MP2-PP4-sharding4-mbs1-acc32.sh rename tests/test_tipc/static/auto_parallel/baichuan2/N4C32/{meta-llama-baichuan-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh => meta-llama-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh} (96%) rename tests/test_tipc/static/auto_parallel/qwen/N4C32/{qwen-14b_pretrain_dy2st_bs32_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh => qwen-14b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh} (96%) diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs128_bf16_DP1_MP4_PP2_VPP1_Sharding4_Stage1.sh similarity index 96% rename from tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh rename to tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs128_bf16_DP1_MP4_PP2_VPP1_Sharding4_Stage1.sh index e4cb99e2eedc..5bcdf0bad512 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs128_bf16_DP1_MP4_PP2_VPP1_Sharding4_Stage1.sh @@ -15,7 +15,7 @@ param="model_item=baichuan-inc-Baichun2-13b_pretrain " param+="run_mode=DP1_MP4_PP1_VPP1_Sharding8_Stage1 " param+="device_num=N4C32 " -param+="global_batch_size=32 " +param+="global_batch_size=128 " param+="nnodes=4 " param+="model_type=baichun2_13b " diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json index 307b9e9a6775..71248c6cd44a 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json +++ b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json @@ -5,11 +5,11 @@ "output_dir": 
"./output/baichun2-13b_pretrain_ckpts", "split": "949,50,1", "max_seq_length": 4096, - "gradient_accumulation_steps": 2, + "gradient_accumulation_steps": 32, "tensor_parallel_degree": 4, - "pipeline_parallel_degree": 1, + "pipeline_parallel_degree": 2, "virtual_pp_degree": 1, - "sequence_parallel": 1, + "sequence_parallel": 0, "sharding_parallel_degree": 8, "sharding": "stage1", "pipeline_parallel_config": "enable_sharding_comm_overlap enable_release_grads ", diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/gpt/N4C32/gpt3_bs16_fp16_DP4-MP2-PP4.sh b/tests/test_tipc/dygraph/hybrid_parallelism/gpt/N4C32/gpt3_bs16_fp16_DP4-MP2-PP4.sh new file mode 100644 index 000000000000..9dcba4ba71dc --- /dev/null +++ b/tests/test_tipc/dygraph/hybrid_parallelism/gpt/N4C32/gpt3_bs16_fp16_DP4-MP2-PP4.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model_item=gpt3 +dp_degree=4 +mp_degree=2 +pp_degree=4 +bs_item=16 +fp_item=bf16 +run_mode=DP4-MP2-PP4 +device_num=N4C32 + +model=gpt +micro_bs=4 +# get data +cd ./tests +bash ./test_tipc/dygraph/hybrid_parallelism/${model}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/hybrid_parallelism/${model}/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/gpt/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/gpt/benchmark_common/run_benchmark.sh index bb8b442133c5..a2af40e30e1c 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/gpt/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/gpt/benchmark_common/run_benchmark.sh @@ -114,7 +114,10 @@ function _train(){ --use_recompute ${use_recompute}\ --sharding_stage ${sharding_stage}\ --sharding_offload ${sharding_offload}\ - --fuse_transformer True" + --fuse_transformer True \ + --sharding_parallel_config 'enable_stage1_tensor_fusion enable_stage1_overlap' \ + --tensor_parallel_config 'enable_mp_async_allreduce' \ + --pipeline_parallel_config 'enable_sharding_comm_overlap enable_dp_comm_overlap enable_overlap_p2p_comm' " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" @@ -141,7 +144,7 @@ function _train(){ run_pretrain.py ${train_cmd}" workerlog_id=0 ;; - DP1-MP2-PP4|DP1-MP4-PP2|DP2-MP2-PP2|DP2-MP8-PP2|DP4-MP8-PP1|DP1-MP8-PP4) echo "run run_mode: ${run_mode}" + DP1-MP2-PP4|DP1-MP4-PP2|DP2-MP2-PP2|DP2-MP8-PP2|DP4-MP8-PP1|DP4-MP2-PP4|DP1-MP8-PP4) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ run_pretrain.py ${train_cmd}" workerlog_id=0 diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs128_bf16_MP2-PP4-sharding4-mbs1-acc32.sh 
b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs128_bf16_MP2-PP4-sharding4-mbs1-acc32.sh new file mode 100644 index 000000000000..0bf37fd2ac62 --- /dev/null +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs128_bf16_MP2-PP4-sharding4-mbs1-acc32.sh @@ -0,0 +1,49 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +param="model_name_or_path=qwen/qwen-14b " +param+="per_device_train_batch_size=1 " +param+="data_parallel_degree=1 " +param+="tensor_parallel_degree=2 " +param+="pipeline_parallel_degree=4 " +param+="virtual_pp_degree=5 " +param+="sequence_parallel=0 " +param+="sharding_parallel_degree=4 " +param+="sharding=stage1 " +param+="recompute=0 " +param+="recompute_granularity=full_attn " +param+="run_mode=MP2-PP4-sharding4-mbs1-acc32 " +param+="device_num=N4C32 " +param+="global_batch_size=128 " +param+="model_item=qwen-qwen-14b_seqlen4096_pretrain " +param+="max_steps=200 " +param+="logging_steps=5 " +param+="gradient_accumulation_steps=32 " +param+="pp_recompute_interval=1 " +param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, " +#多机新添加的参数 +param+="pipeline_parallel_config=enable_sharding_comm_overlap,enable_release_grads,enable_dp_comm_overlap,enable_overlap_p2p_comm " +param+="max_seq_length=4096 " +param+="min_learning_rate=0.000005 " +param+="save_steps=5000 " +param+="eval_steps=1000 " +param+="scale_loss=1024 " +param+="sharding_parallel_config=split_param,enable_stage1_overlap,enable_stage1_allgather_overlap " + + +cd ./tests +bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh + +bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh" diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh similarity index 96% rename from tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh rename to tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh index 4adc579b79de..7fa6b08e795b 100644 --- a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh +++ b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/meta-llama-baichuan-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP4_PP2_1F1B_Sharding4_Stage1.sh @@ -15,7 +15,7 @@ param="model_item=baichuan-inc-baichaun-2-13b_pretrain " param+="run_mode=DP1_MP2_PP4_1F1B_Sharding8_Stage2 " param+="device_num=N4C32 " -param+="global_batch_size=32 " +param+="global_batch_size=128 
" param+="nnodes=4 " param+="model_type=baichuan2_13b " diff --git a/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs32_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh similarity index 96% rename from tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs32_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh rename to tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh index be88c3dcddb9..a52d8a10e34a 100644 --- a/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs32_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh +++ b/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs128_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh @@ -15,7 +15,7 @@ param="model_item=qwen-2-14b_pretrain_dy2st " param+="run_mode=DP1_MP2_PP4_1F1B_Sharding4_Stage1 " param+="device_num=N4C32 " -param+="global_batch_size=32 " +param+="global_batch_size=128 " param+="nnodes=4 " param+="model_type=qwen_14b " From 41a7c7a6e9444dfc72ab6b0569e2b0a2b3829e80 Mon Sep 17 00:00:00 2001 From: blacksheep-Aristotle Date: Tue, 17 Dec 2024 16:26:03 +0800 Subject: [PATCH 4/8] fix qwen&baichaun&gpt ci error --- ...train_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename tests/test_tipc/static/auto_parallel/gpt3/N4C32/{meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh => meta-llama-Llama-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh} (96%) diff --git a/tests/test_tipc/static/auto_parallel/gpt3/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/gpt3/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh similarity index 96% rename from tests/test_tipc/static/auto_parallel/gpt3/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh rename to tests/test_tipc/static/auto_parallel/gpt3/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh index 3fe89367fb88..2ab99d620712 100644 --- a/tests/test_tipc/static/auto_parallel/gpt3/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh +++ b/tests/test_tipc/static/auto_parallel/gpt3/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh @@ -15,7 +15,7 @@ param="model_item=gpt3-13b_pretrain_dy2st " param+="run_mode=DP1_MP2_PP4_1F1B_Sharding4_Stage1 " param+="device_num=N4C32 " -param+="global_batch_size=32 " +param+="global_batch_size=128 " param+="nnodes=4 " param+="model_type=gpt3_13b " From ae7518cd1fa7eba964b155182d0baf0bf149ee03 Mon Sep 17 00:00:00 2001 From: blacksheep-Aristotle Date: Tue, 17 Dec 2024 16:31:30 +0800 Subject: [PATCH 5/8] fix qwen&baichaun&gpt ci error --- ...retrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/test_tipc/static/auto_parallel/gpt3/N4C32/{meta-llama-Llama-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh => gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh} (100%) diff --git a/tests/test_tipc/static/auto_parallel/gpt3/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh 
b/tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh similarity index 100% rename from tests/test_tipc/static/auto_parallel/gpt3/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh rename to tests/test_tipc/static/auto_parallel/gpt3/N4C32/gpt-3-13b_pretrain_dy2st_bs128_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh From 272423699c2c8ae6326c0a75b573232b9a494630 Mon Sep 17 00:00:00 2001 From: blacksheep-Aristotle Date: Tue, 17 Dec 2024 20:42:40 +0800 Subject: [PATCH 6/8] add gpt_dy ce --- .../qwen/run_pretrain_3D_auto.py | 1 + paddlenlp/transformers/gpt/configuration.py | 1 + ..._bf16_DP1_MP2_PP4_VPP1_Sharding4_Stage1.sh | 25 ++++ .../pretrain-gpt3_13b-config.json | 45 ++++++ .../gpt3/benchmark_common/prepare.sh | 37 +++++ .../gpt3/benchmark_common/run_benchmark.sh | 134 ++++++++++++++++++ 6 files changed, 243 insertions(+) create mode 100644 tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP1_Sharding4_Stage1.sh create mode 100644 tests/test_tipc/dygraph/hybrid_parallelism/gpt3/auto_config_gpt3_13b/pretrain-gpt3_13b-config.json create mode 100644 tests/test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/prepare.sh create mode 100644 tests/test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/run_benchmark.sh diff --git a/llm/auto_parallel/qwen/run_pretrain_3D_auto.py b/llm/auto_parallel/qwen/run_pretrain_3D_auto.py index a476f68cc5c9..05726f688cc8 100644 --- a/llm/auto_parallel/qwen/run_pretrain_3D_auto.py +++ b/llm/auto_parallel/qwen/run_pretrain_3D_auto.py @@ -356,6 +356,7 @@ def get_train_data_file(args): class PretrainingTrainer(AutoTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.is_pretraining = True def _wrap_for_dist_loader(self, train_dataloader): dist_loader = super()._wrap_for_dist_loader(train_dataloader) diff --git a/paddlenlp/transformers/gpt/configuration.py b/paddlenlp/transformers/gpt/configuration.py index ad4730a48541..2d3e3fc70feb 100644 --- a/paddlenlp/transformers/gpt/configuration.py +++ b/paddlenlp/transformers/gpt/configuration.py @@ -86,6 +86,7 @@ "eol_token_id": 198, }, "gpt3-13B-en": { # 13B + "architectures": ["GPTForCausalLM"], "vocab_size": 50304, "hidden_size": 5120, "num_hidden_layers": 40, diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP1_Sharding4_Stage1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP1_Sharding4_Stage1.sh new file mode 100644 index 000000000000..a0e988a50cb5 --- /dev/null +++ b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/N4C32/gpt3-13b_pretrain_bs128_bf16_DP1_MP2_PP4_VPP1_Sharding4_Stage1.sh @@ -0,0 +1,25 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +param="model_item=gpt3-13b_pretrain " +param+="run_mode=DP1_MP2_PP4_VPP5_Sharding4_Stage1 " +param+="device_num=N4C32 " +param+="global_batch_size=128 " +param+="nnodes=4 " +param+="model_type=gpt3_13b " + +cd ./tests +bash ./test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/prepare.sh + +bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/run_benchmark.sh" \ No newline at end of file diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/auto_config_gpt3_13b/pretrain-gpt3_13b-config.json b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/auto_config_gpt3_13b/pretrain-gpt3_13b-config.json new file mode 100644 index 000000000000..67504a1f8518 --- /dev/null +++ b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/auto_config_gpt3_13b/pretrain-gpt3_13b-config.json @@ -0,0 +1,45 @@ +{ + "model_name_or_path": "gpt3-13B-en", + "tokenizer_name_or_path": "gpt3-13B-en", + "input_dir": "./data", + "output_dir": "./output/gpt3-13b_pretrain_ckpts", + "split": "949,50,1", + "max_seq_length": 4096, + "gradient_accumulation_steps": 32, + "tensor_parallel_degree": 2, + "pipeline_parallel_degree": 4, + "virtual_pp_degree": 5, + "sequence_parallel": 0, + "sharding": "stage1", + "pipeline_parallel_config": "enable_sharding_comm_overlap enable_release_grads ", + "tensor_parallel_config": "enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add", + "per_device_train_batch_size": 1, + "use_flash_attention": true, + "use_fused_rms_norm": true, + "fuse_attention_qkv": true, + "use_fused_rope": true, + "fuse_attention_ffn": true, + "enable_linear_fused_grad_add": true, + "bf16": true, + "fp16_opt_level": "O2", + "scale_loss": 1024, + "learning_rate": 1e-05, + "min_learning_rate": 5e-06, + "max_steps": 200, + "save_steps": 5000, + "weight_decay": 0.01, + "warmup_ratio": 0.01, + "max_grad_norm": 1.0, + "logging_steps": 2, + "dataloader_num_workers": 1, + "eval_steps": 1000, + "disable_tqdm": true, + "continue_training": 0, + "recompute": false, + "recompute_granularity": "full_attn", + "do_train": true, + "pp_recompute_interval": 1, + "device": "gpu", + "amp_master_grad": true, + "sharding_parallel_config": "split_param enable_stage1_overlap enable_stage1_allgather_overlap" + } \ No newline at end of file diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/prepare.sh new file mode 100644 index 000000000000..a33af3117851 --- /dev/null +++ b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/prepare.sh @@ -0,0 +1,37 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +python -m pip install -r ../requirements.txt +python -m pip install -r ../requirements-dev.txt + +# install fused_ln custom ops +cd ../slm/model_zoo/gpt-3/external_ops/ +python setup.py install +cd - + +python -m pip install tiktoken + +# install fast_dataindex +cd ../llm/ +rm -rf data +mkdir data +cd data +# download data +wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy +wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz +cd - + +# mv autoconfig +rm -rf auto_config_* +cp -r ../tests/test_tipc/dygraph/hybrid_parallelism/gpt3/auto_config_* ./ \ No newline at end of file diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/run_benchmark.sh new file mode 100644 index 000000000000..2b5a830a7c52 --- /dev/null +++ b/tests/test_tipc/dygraph/hybrid_parallelism/gpt3/benchmark_common/run_benchmark.sh @@ -0,0 +1,134 @@ +#!/usr/bin/env bash + +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Test training benchmark for a model. +# Usage:bash benchmark/run_benchmark.sh ${model_name_or_path} ${per_device_train_batch_size} ${tensor_parallel_degree} ${pipeline_parallel_degree} ${virtual_pp_degree} ${sequence_parallel} ${sharding_parallel_degree} ${sharding} ${recompute} ${run_mode} ${device_num} +function _set_params(){ + model_item=${model_item:-"gpt3-13b_pretrain"} + run_mode=${run_mode:-"MP2-PP1"} + device_num=${device_num:-"N1C8"} + global_batch_size=${global_batch_size:-64} + fp_item="bf16" + MODEL_TYPE=${model_type:-"gpt3_13b"} + + ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' ')) + master_ip=${ip_lists[0]} + nnodes=${nnodes:-1} + + base_batch_size=${global_batch_size} + profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 + model_repo="PaddleNLP" # (必选) 模型套件的名字 + speed_unit="tokens/s" # (必选)速度指标单位 + skip_steps=10 # (必选)解析日志,跳过模型前几个性能不稳定的step + keyword="interval_tokens_per_second_per_device:" # (必选)解析日志,筛选出性能数据所在行的关键字 + convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" + model_mode=5 # 获取ips数据及单位,仅跳过skip_steps后计算均值,单位保持token/s不变 + + # 以下为通用执行命令,无特殊可不用修改 + model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 + device=${CUDA_VISIBLE_DEVICES//,/ } + arr=(${device}) + num_gpu_devices=${#arr[*]} + run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 + profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 + speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} + train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log + mkdir -p $(dirname ${train_log_file}) + + profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling + mkdir -p $(dirname ${profiling_log_file}) + + speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed + mkdir 
-p $(dirname ${speed_log_file}) + + OUTPUT_PATH=${run_log_path}/output +} + +function _train(){ + batch_size=${per_device_train_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs + + if [ -d $OUTPUT_PATH ]; then + rm -rf $OUTPUT_PATH + fi + mkdir $OUTPUT_PATH + + echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" + + if [ ${profiling} == "true" ];then + add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" + log_file=${profiling_log_file} + else + add_options="" + log_file=${train_log_file} + fi + + if [ ${PADDLE_TRAINER_ID} ]; then + PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" + else + PADDLE_RANK_OPTION="" + fi + + distributed_args="--master $master_ip:36677 --nnodes $nnodes ${PADDLE_RANK_OPTION} --run_mode=collective" + + echo "==========System Env=============" + env + echo "=================================" + + # 以下为通用执行命令,无特殊可不用修改 + case ${device_num} in + N1C8) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" + train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ + --nnodes 1 --nproc_per_node 8 \ + --log_dir mylog run_pretrain.py \ + ./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-config.json" + ;; + N4C32) echo "Run with: device_num=${device_num} run_mode=${run_mode}" + train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ + ${distributed_args} --log_dir mylog run_pretrain.py \ + ./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-config.json" + ;; + *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" + train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ + ${distributed_args} --log_dir mylog run_pretrain.py \ + ./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-config.json" + ;; + esac + cd ../llm + rm -rf mylog && rm -rf checkpoints + + echo "train_cmd: ${train_cmd} log_file: ${log_file}" + timeout 40m ${train_cmd} > ${log_file} 2>&1 + + if [ $? 
-ne 0 ];then + echo -e "${model_name}, FAIL" + else + echo -e "${model_name}, SUCCESS" + fi + + #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + if [ ${device_num} != "N1C1" -a -d mylog ]; then + case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog + cp -r ${case_path}/mylog/workerlog.* ./mylog/ + fi +} + +export FLAGS_selected_gpus="0,1,2,3,4,5,6,7" +export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH + +source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 +_set_params $@ +#_train # 如果只产出训练log,不解析,可取消注释 +_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 \ No newline at end of file From 1745f17c682596e98f2c9d9f8a5ed152164f1baa Mon Sep 17 00:00:00 2001 From: blacksheep-Aristotle Date: Wed, 18 Dec 2024 19:43:31 +0800 Subject: [PATCH 7/8] revert qwen&baichuan dygraph ce --- .../transformers/qwen/modeling_3D_auto.py | 29 +++++++---- ...bf16_DP1_MP4_PP1_VPP1_Sharding4_Stage1.sh} | 2 +- .../pretrain-baichun2_13b-config.json | 6 +-- .../gpt/N4C32/gpt3_bs16_fp16_DP4-MP2-PP4.sh | 30 ------------ ...bs128_bf16_MP2-PP4-sharding4-mbs1-acc32.sh | 49 ------------------- 5 files changed, 23 insertions(+), 93 deletions(-) rename tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/{baichuan-inc-Baichun2-13b_pretrain_bs128_bf16_DP1_MP4_PP2_VPP1_Sharding4_Stage1.sh => baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding4_Stage1.sh} (96%) delete mode 100644 tests/test_tipc/dygraph/hybrid_parallelism/gpt/N4C32/gpt3_bs16_fp16_DP4-MP2-PP4.sh delete mode 100644 tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs128_bf16_MP2-PP4-sharding4-mbs1-acc32.sh diff --git a/paddlenlp/transformers/qwen/modeling_3D_auto.py b/paddlenlp/transformers/qwen/modeling_3D_auto.py index 46d48a9896df..6c083504ac08 100644 --- a/paddlenlp/transformers/qwen/modeling_3D_auto.py +++ b/paddlenlp/transformers/qwen/modeling_3D_auto.py @@ -95,6 +95,11 @@ def enable_fuse_ffn_qkv_pass(): return False +def get_use_casual_mask(): + """Get the value of the 'USE_CASUAL_MASK' environment variable.""" + return os.getenv("USE_CASUAL_MASK", "False") == "True" + + attention_cnt = 0 @@ -671,16 +676,20 @@ def forward( hidden_states = inputs_embeds - # bool 4D mask - attention_mask = self.get_masks( - input_shape[0], input_shape[1], past_length, dtype=hidden_states.dtype, padding_mask=attention_mask - ) - # TODO(GhostScreaming): how to fix paddle.finfo? - zero = paddle.zeros(attention_mask.shape, dtype=paddle.bfloat16) - neg_inf = paddle.full_like(attention_mask, paddle.finfo(paddle.bfloat16).min, dtype=paddle.bfloat16) - # dtype 4D mask - attention_mask = paddle.where(attention_mask, zero, neg_inf) - attention_mask = dist.shard_tensor(attention_mask, get_mesh(), [dist.Replicate(), dist.Replicate()]) + use_casual_mask = get_use_casual_mask() + if use_casual_mask: + attention_mask = None + else: + # bool 4D mask + attention_mask = self.get_masks( + input_shape[0], input_shape[1], past_length, dtype=hidden_states.dtype, padding_mask=attention_mask + ) + # TODO(GhostScreaming): how to fix paddle.finfo? 
+ zero = paddle.zeros(attention_mask.shape, dtype=paddle.bfloat16) + neg_inf = paddle.full_like(attention_mask, paddle.finfo(paddle.bfloat16).min, dtype=paddle.bfloat16) + # dtype 4D mask + attention_mask = paddle.where(attention_mask, zero, neg_inf) + attention_mask = dist.shard_tensor(attention_mask, get_mesh(), [dist.Replicate(), dist.Replicate()]) hidden_states = self.drop(hidden_states) hidden_states = dist.reshard(hidden_states, get_mesh(), [dist.Shard(0), dist.Replicate()]) output_shape = input_shape + [ diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs128_bf16_DP1_MP4_PP2_VPP1_Sharding4_Stage1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding4_Stage1.sh similarity index 96% rename from tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs128_bf16_DP1_MP4_PP2_VPP1_Sharding4_Stage1.sh rename to tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding4_Stage1.sh index 5bcdf0bad512..e4cb99e2eedc 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs128_bf16_DP1_MP4_PP2_VPP1_Sharding4_Stage1.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding4_Stage1.sh @@ -15,7 +15,7 @@ param="model_item=baichuan-inc-Baichun2-13b_pretrain " param+="run_mode=DP1_MP4_PP1_VPP1_Sharding8_Stage1 " param+="device_num=N4C32 " -param+="global_batch_size=128 " +param+="global_batch_size=32 " param+="nnodes=4 " param+="model_type=baichun2_13b " diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json index 71248c6cd44a..fdf27a753f12 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json +++ b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json @@ -5,11 +5,11 @@ "output_dir": "./output/baichun2-13b_pretrain_ckpts", "split": "949,50,1", "max_seq_length": 4096, - "gradient_accumulation_steps": 32, + "gradient_accumulation_steps": 2, "tensor_parallel_degree": 4, - "pipeline_parallel_degree": 2, + "pipeline_parallel_degree": 1, "virtual_pp_degree": 1, - "sequence_parallel": 0, + "sequence_parallel": 1, "sharding_parallel_degree": 8, "sharding": "stage1", "pipeline_parallel_config": "enable_sharding_comm_overlap enable_release_grads ", diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/gpt/N4C32/gpt3_bs16_fp16_DP4-MP2-PP4.sh b/tests/test_tipc/dygraph/hybrid_parallelism/gpt/N4C32/gpt3_bs16_fp16_DP4-MP2-PP4.sh deleted file mode 100644 index 9dcba4ba71dc..000000000000 --- a/tests/test_tipc/dygraph/hybrid_parallelism/gpt/N4C32/gpt3_bs16_fp16_DP4-MP2-PP4.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -model_item=gpt3 -dp_degree=4 -mp_degree=2 -pp_degree=4 -bs_item=16 -fp_item=bf16 -run_mode=DP4-MP2-PP4 -device_num=N4C32 - -model=gpt -micro_bs=4 -# get data -cd ./tests -bash ./test_tipc/dygraph/hybrid_parallelism/${model}/benchmark_common/prepare.sh -# run -bash ./test_tipc/dygraph/hybrid_parallelism/${model}/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs128_bf16_MP2-PP4-sharding4-mbs1-acc32.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs128_bf16_MP2-PP4-sharding4-mbs1-acc32.sh deleted file mode 100644 index 0bf37fd2ac62..000000000000 --- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs128_bf16_MP2-PP4-sharding4-mbs1-acc32.sh +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -param="model_name_or_path=qwen/qwen-14b " -param+="per_device_train_batch_size=1 " -param+="data_parallel_degree=1 " -param+="tensor_parallel_degree=2 " -param+="pipeline_parallel_degree=4 " -param+="virtual_pp_degree=5 " -param+="sequence_parallel=0 " -param+="sharding_parallel_degree=4 " -param+="sharding=stage1 " -param+="recompute=0 " -param+="recompute_granularity=full_attn " -param+="run_mode=MP2-PP4-sharding4-mbs1-acc32 " -param+="device_num=N4C32 " -param+="global_batch_size=128 " -param+="model_item=qwen-qwen-14b_seqlen4096_pretrain " -param+="max_steps=200 " -param+="logging_steps=5 " -param+="gradient_accumulation_steps=32 " -param+="pp_recompute_interval=1 " -param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, " -#多机新添加的参数 -param+="pipeline_parallel_config=enable_sharding_comm_overlap,enable_release_grads,enable_dp_comm_overlap,enable_overlap_p2p_comm " -param+="max_seq_length=4096 " -param+="min_learning_rate=0.000005 " -param+="save_steps=5000 " -param+="eval_steps=1000 " -param+="scale_loss=1024 " -param+="sharding_parallel_config=split_param,enable_stage1_overlap,enable_stage1_allgather_overlap " - - -cd ./tests -bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh - -bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh" From 899e57214f2bfb631740832f5fec7c52a3b92425 Mon Sep 17 00:00:00 2001 From: blacksheep-Aristotle Date: Wed, 18 Dec 2024 19:49:40 +0800 Subject: [PATCH 8/8] revert qwen&baichuan dygraph ce --- ...retrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh} | 0 .../gpt/benchmark_common/run_benchmark.sh | 7 ++----- 2 files changed, 2 insertions(+), 5 deletions(-) rename tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/{baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding4_Stage1.sh => baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh} (100%) diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding4_Stage1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh similarity index 100% rename from tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding4_Stage1.sh rename to tests/test_tipc/dygraph/hybrid_parallelism/baichun2/N4C32/baichuan-inc-Baichun2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/gpt/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/gpt/benchmark_common/run_benchmark.sh index a2af40e30e1c..bb8b442133c5 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/gpt/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/gpt/benchmark_common/run_benchmark.sh @@ -114,10 +114,7 @@ function _train(){ --use_recompute ${use_recompute}\ --sharding_stage ${sharding_stage}\ --sharding_offload ${sharding_offload}\ - --fuse_transformer True \ - --sharding_parallel_config 'enable_stage1_tensor_fusion enable_stage1_overlap' \ - --tensor_parallel_config 'enable_mp_async_allreduce' \ - --pipeline_parallel_config 'enable_sharding_comm_overlap enable_dp_comm_overlap enable_overlap_p2p_comm' " + --fuse_transformer True" if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" @@ 
-144,7 +141,7 @@ function _train(){ run_pretrain.py ${train_cmd}" workerlog_id=0 ;; - DP1-MP2-PP4|DP1-MP4-PP2|DP2-MP2-PP2|DP2-MP8-PP2|DP4-MP8-PP1|DP4-MP2-PP4|DP1-MP8-PP4) echo "run run_mode: ${run_mode}" + DP1-MP2-PP4|DP1-MP4-PP2|DP2-MP2-PP2|DP2-MP8-PP2|DP4-MP8-PP1|DP1-MP8-PP4) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ run_pretrain.py ${train_cmd}" workerlog_id=0
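
A note on the environment-flag gating used in this series: patch 1 guards the eagerly fused gate/up projection in QWenMLP behind FLAGS_enable_fused_ffn_qkv_pass (when the graph-level fusion pass is enabled, the separate w1/w2 layers are kept and the pass fuses them later), and patch 7 adds the analogous USE_CASUAL_MASK switch for skipping explicit mask construction. The following is a minimal, framework-free sketch of that selection logic; the helper names match the patch, but the Linear layers and mesh sharding are stubbed out, so treat it as an illustration rather than the PaddleNLP implementation.

    import os


    def enable_fuse_ffn_qkv_pass():
        # Mirrors the helper added in patch 1: the fused-FFN/QKV pass is
        # opted into via an environment variable, not a config field.
        return os.getenv("FLAGS_enable_fused_ffn_qkv_pass") in ["True", "true", "1"]


    def get_use_casual_mask():
        # Mirrors the helper added in patch 7 ("casual" follows the spelling
        # used in the source; the flag controls the causal attention mask).
        return os.getenv("USE_CASUAL_MASK", "False") == "True"


    class ToyMLP:
        """Stand-in for QWenMLP showing only the branch selection."""

        def __init__(self, fuse_attention_ffn: bool):
            self.fuse_attention_ffn = fuse_attention_ffn
            if self.fuse_attention_ffn and not enable_fuse_ffn_qkv_pass():
                # one Linear of width 2 * ff_dim_in, split by swiglu at runtime
                self.branch = "gate_up_fused_proj"
            else:
                # two separate Linear layers; the IR pass may fuse them later
                self.branch = "w1_and_w2"


    if __name__ == "__main__":
        os.environ["FLAGS_enable_fused_ffn_qkv_pass"] = "1"
        print(ToyMLP(fuse_attention_ffn=True).branch)  # w1_and_w2 (pass fuses later)
        del os.environ["FLAGS_enable_fused_ffn_qkv_pass"]
        print(ToyMLP(fuse_attention_ffn=True).branch)  # gate_up_fused_proj

The same condition appears in both __init__ and forward, so the constructed parameters and the executed path always agree for a given environment.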
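
The global_batch_size bump from 32 to 128 across patches 3 and 4 is consistent with the parallel layout in the same configs: per_device_train_batch_size=1, gradient_accumulation_steps=32, and a degree-4 sharding group on N4C32 (MP2 x PP4 x Sharding4 = 32 GPUs). A quick sanity check of that arithmetic, under the assumption that sharding stage 1 acts as the effective data-parallel dimension here:

    # Values taken from the qwen N4C32 config in patch 3; treating the
    # sharding degree as the data-parallel degree is an assumption.
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 32
    sharding_parallel_degree = 4

    global_batch_size = (
        per_device_train_batch_size
        * gradient_accumulation_steps
        * sharding_parallel_degree
    )
    assert global_batch_size == 128  # matches the value the patches set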