fix benchmark dir because of PR#8627 (#8649)
* remove tsinghua pypi

* modify gpt dataset addr for benchmark

* fix run_benchmark for llama2_70b in auto_parallel

* fix autotuner benchmark error and fix llama2 dy2st benchmark

* fix benchmark dir because of PR#8627

* Update run_benchmark.sh

* fix benchmark dir because of PR#8627

* add qwen N4C32

* fix etcd

* fix qwen N4C32 for per_device_type
fightfat authored Jun 26, 2024
1 parent 4e971f8 commit 80e7ef5
Showing 20 changed files with 242 additions and 61 deletions.
20 changes: 16 additions & 4 deletions paddlenlp/trainer/training_args.py
Original file line number Diff line number Diff line change
@@ -1039,7 +1039,10 @@ def __post_init__(self):
strategy = fleet.DistributedStrategy()
assert self.data_parallel_config == "", "data_parallle_config is not supported in hybrid parallel"
if self.pipeline_parallel_degree > 1:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
if " " in self.pipeline_parallel_config:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
else:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(","))
for x in pipeline_parallel_config:
if len(x) > 0:
if x not in [
@@ -1222,7 +1225,10 @@ def is_segment_parallel_supported():
strategy.hybrid_configs = hybrid_configs

if self.sharding_parallel_degree > 1:
sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
if " " in self.sharding_parallel_config:
sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
else:
sharding_parallel_config = set(self.sharding_parallel_config.split(","))
for x in sharding_parallel_config:
if len(x) > 0:
if x not in [
@@ -1378,7 +1384,10 @@ def is_segment_parallel_supported():

# navie-pp: pipeline_parallel_degree > 1 and gradient_accumulation_steps == 1
if self.pipeline_parallel_degree > 1 and self.gradient_accumulation_steps > 1:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
if " " in self.pipeline_parallel_config:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
else:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(","))
for x in pipeline_parallel_config:
if len(x) > 0:
if x not in [
@@ -1464,7 +1473,10 @@ def is_segment_parallel_supported():
elif ShardingOption.FULL_SHARD in self.sharding:
sharding.stage = 3

sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
if " " in self.sharding_parallel_config:
sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
else:
sharding_parallel_config = set(self.sharding_parallel_config.split(","))
for x in sharding_parallel_config:
if len(x) > 0:
if x not in [
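For reference, the split-on-space-otherwise-comma pattern that this change repeats four times in training_args.py can be sketched as a small standalone helper (hypothetical name, not part of the diff):

def split_parallel_config(raw: str) -> set:
    # Accept either the legacy space-separated form or the new
    # comma-separated form, mirroring the branches added above.
    if " " in raw:
        return set(raw.split(" "))
    return set(raw.split(","))

a = split_parallel_config("enable_delay_scale_loss enable_release_grads")
b = split_parallel_config("enable_delay_scale_loss,enable_release_grads")
assert a == b  # both spellings yield the same option set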
2 changes: 1 addition & 1 deletion scripts/distribute/ci_case_dy.sh
@@ -22,7 +22,7 @@ export root_path=/workspace/PaddleNLP
export gpt_case_path=$root_path/legacy/model_zoo/gpt-3
export gpt_data_path=/fleetx_data

export llm_gpt_case_path=$root_path/llm/gpt-3
export llm_gpt_case_path=$root_path/llm
export llm_gpt_data_path=/llm_gpt_data

unset CUDA_VISIBLE_DEVICES
@@ -33,15 +33,12 @@ else
master_ip=${ip_lists[0]}
rank=$PADDLE_TRAINER_ID
echo $master_ip $rank
if [ $rank == 0 ]; then
net=$(netstat -anp | grep :2379 | grep "LISTEN")
if [ ${#net} == 0 ]; then
apt-get install -y --allow-downgrades etcd
nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
ps -ef |grep etcd
fi
else
sleep 5
fi
# For multi-node jobs, start the etcd service on every machine to keep them in sync; otherwise multi-node runs will fail
net=$(netstat -anp | grep :2379 | grep "LISTEN")
if [ ${#net} == 0 ]; then
apt-get install -y --allow-downgrades etcd
nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
ps -ef |grep etcd
fi
sleep 5
fi
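The guard above now starts a local etcd on every node, but only when nothing is already listening on port 2379. A rough Python equivalent, for illustration only (the benchmark scripts themselves use the netstat/nohup shell logic shown in the diff):

import socket
import subprocess

def ensure_local_etcd(port: int = 2379) -> None:
    # Skip launching if something is already listening on the etcd client port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        if s.connect_ex(("127.0.0.1", port)) == 0:
            return
    # Otherwise start etcd with the same flags used in the script above.
    subprocess.Popen([
        "etcd", "-data-dir", "data.etcd",
        "-advertise-client-urls", f"http://0.0.0.0:{port}",
        "-listen-client-urls", f"http://0.0.0.0:{port}",
    ])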
@@ -18,9 +18,10 @@ python -m pip install -r ../requirements-dev.txt
# install fused_ln custom ops
cd ../legacy/model_zoo/gpt-3/external_ops/
python setup.py install
cd -

# install tool_helpers
cd ../../../../llm/llama
cd ../llm
python -m pip install tool_helpers

rm -rf data && mkdir data
@@ -32,7 +33,7 @@ mv llama_openwebtext_100k.idx ./data

# mv autoconfig
rm -rf autoconfig
cp -r ../../tests/test_tipc/auto_tuner/autoconfig ./
cp -r ../tests/test_tipc/auto_tuner/autoconfig ./

if [ -z "$1" ]; then
echo "单机任务"
@@ -43,15 +44,12 @@ else
master_ip=${ip_lists[0]}
rank=$PADDLE_TRAINER_ID
echo $master_ip $rank
if [ $rank == 0 ]; then
net=$(netstat -anp | grep :2379 | grep "LISTEN")
if [ ${#net} == 0 ]; then
apt-get install -y --allow-downgrades etcd
nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
ps -ef |grep etcd
fi
else
sleep 5
fi
# For multi-node jobs, start the etcd service on every machine to keep them in sync; otherwise multi-node runs will fail
net=$(netstat -anp | grep :2379 | grep "LISTEN")
if [ ${#net} == 0 ]; then
apt-get install -y --allow-downgrades etcd
nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
ps -ef |grep etcd
fi
sleep 5
fi
@@ -97,7 +97,7 @@ function _train(){
--auto_tuner_json ${autoconfig_json_file} run_pretrain.py ${modle_json_file}"
;;
esac
cd ../llm/llama
cd ../llm
if [[ ${model_item} =~ "resume" ]];then
cp $PWD/autoconfig/resume.csv $PWD/autoconfig/llama7b_pretrain_history.csv
fi
@@ -28,7 +28,7 @@
python -m pip install -r ../requirements.txt
python -m pip install -r ../requirements-dev.txt
# get data
cd ../llm/gpt-3
cd ../llm
rm -rf data
mkdir data
wget -O data/gpt2-en-mmap.bin https://paddlenlp.bj.bcebos.com/datasets/PDC_DATASETS/PRETRAIN/openwebtext2/gpt/mmap/gpt2-en-mmap.bin
@@ -178,7 +178,7 @@ function _train(){
workerlog_id=0
;;
esac
cd ../llm/gpt-3
cd ../llm
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
if [[ ${model_item} =~ "CE" ]];then # CE accuracy run: no execution-time limit
${train_cmd} > ${log_file} 2>&1
@@ -18,9 +18,10 @@ python -m pip install -r ../requirements-dev.txt
# install fused_ln custom ops
cd ../legacy/model_zoo/gpt-3/external_ops/
python setup.py install
cd -

# install tool_helpers
cd ../../../../llm/llama
cd ../llm/
python -m pip install tool_helpers

wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
@@ -175,7 +175,7 @@ function _train(){
workerlog_id=0
;;
esac
cd ../llm/llama
cd ../llm
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
python -c "import paddlenlp"
if [[ ${model_name_or_path} =~ "CE" ]];then # CE accuracy run: no execution-time limit
@@ -18,9 +18,10 @@ python -m pip install -r ../requirements-dev.txt
# install fused_ln custom ops
cd ../legacy/model_zoo/gpt-3/external_ops/
python setup.py install
cd -

# install tool_helpers
cd ../../../../llm/llama
cd ../llm/
python -m pip install tool_helpers

# download data
@@ -36,16 +37,13 @@ ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' '))
master_ip=${ip_lists[0]}
rank=$PADDLE_TRAINER_ID
echo $master_ip $rank
if [ $rank == 0 ]; then
net=$(netstat -anp | grep "2379 " | grep "LISTEN")
if [ ${#net} == 0 ]; then
nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
ps -ef |grep etcd
fi
else
sleep 5
fi
# For multi-node jobs, start the etcd service on every machine to keep them in sync; otherwise multi-node runs will fail
net=$(netstat -anp | grep ":2379" | grep "LISTEN")
if [ ${#net} == 0 ]; then
nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 &
ps -ef |grep etcd
fi

# mv autoconfig
rm -rf auto_config_*
cp -r ../../tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_* ./
cp -r ../tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_* ./
@@ -128,7 +128,7 @@ function _train(){
./auto_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}-auto_tuner.json"
;;
esac
cd ../llm/llama
cd ../llm
rm -rf ./auto_config_${MODEL_TYPE}/*GBS*
rm -rf ./auto_config_${MODEL_TYPE}/*auto_tuner.log
rm -rf ./auto_config_${MODEL_TYPE}/*csv
@@ -0,0 +1,48 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


param="model_name_or_path=qwen/qwen-14b "
param+="per_device_train_batch_size=1 "
param+="data_parallel_degree=1 "
param+="tensor_parallel_degree=1 "
param+="pipeline_parallel_degree=4 "
param+="virtual_pp_degree=1 "
param+="sequence_parallel=0 "
param+="sharding_parallel_degree=8 "
param+="sharding=stage1 "
param+="recompute=0 "
param+="recompute_granularity=full_attn "
param+="run_mode=MP1-PP4-sharding8-mbs1-acc4 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="model_item=qwen-qwen-14b_seqlen4096_pretrain "
param+="max_steps=100 "
param+="gradient_accumulation_steps=4 "
param+="pp_recompute_interval=1 "
param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, "
# Parameters newly added for multi-node runs
param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads, "
param+="max_seq_length=4096 "
param+="min_learning_rate=0.000005 "
param+="save_steps=5000 "
param+="eval_steps=1000 "
param+="scale_loss=1024 "
param+="sharding_parallel_config=split_param,enable_stage1_overlap, "


cd ./tests
bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh

bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
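
As a quick sanity check on the N4C32 settings above, a sketch assuming that N4C32 means 4 nodes x 8 GPUs = 32 devices and that the effective data-parallel replica count is dp x sharding degree:

# qwen-14b N4C32 values taken from the param block above
dp, tp, pp, sharding = 1, 1, 4, 8
micro_batch_size, acc_steps = 1, 4

# The parallel degrees must cover all 32 cards (N4C32 = 4 nodes x 8 GPUs).
assert dp * tp * pp * sharding == 32

# Global batch = micro batch x gradient accumulation x data-parallel-like replicas.
assert micro_batch_size * acc_steps * dp * sharding == 32  # matches global_batch_size=32

The qwen-72b (tp=8, pp=4, acc=32) and qwen-7b (tp=2, sharding=16, mbs=2) configurations below satisfy the same two identities.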
@@ -0,0 +1,48 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


param="model_name_or_path=qwen/qwen-72b "
param+="per_device_train_batch_size=1 "
param+="data_parallel_degree=1 "
param+="tensor_parallel_degree=8 "
param+="pipeline_parallel_degree=4 "
param+="virtual_pp_degree=4 "
param+="sequence_parallel=1 "
param+="sharding_parallel_degree=1 "
param+="sharding=stage1 "
param+="recompute=0 "
param+="recompute_granularity=full_attn "
param+="run_mode=MP8-PP4-sharding1-mbs1-acc32 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="model_item=qwen-qwen-72b_seqlen4096_pretrain "
param+="max_steps=100 "
param+="gradient_accumulation_steps=32 "
param+="pp_recompute_interval=1 "
param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, "
# Parameters newly added for multi-node runs
param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads, "
param+="max_seq_length=4096 "
param+="min_learning_rate=0.000005 "
param+="save_steps=5000 "
param+="eval_steps=1000 "
param+="scale_loss=1024 "
param+="sharding_parallel_config=split_param,enable_stage1_overlap, "


cd ./tests
bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh

bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
@@ -0,0 +1,48 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


param="model_name_or_path=qwen/qwen-7b "
param+="per_device_train_batch_size=2 "
param+="data_parallel_degree=1 "
param+="tensor_parallel_degree=2 "
param+="pipeline_parallel_degree=1 "
param+="virtual_pp_degree=1 "
param+="sequence_parallel=1 "
param+="sharding_parallel_degree=16 "
param+="sharding=stage1 "
param+="recompute=0 "
param+="recompute_granularity=full_attn "
param+="run_mode=MP2-PP1-sharding16-mbs2-acc1 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="model_item=qwen-qwen-7b_seqlen4096_pretrain "
param+="max_steps=100 "
param+="gradient_accumulation_steps=1 "
param+="pp_recompute_interval=1 "
param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
# Parameters newly added for multi-node runs
param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads "
param+="max_seq_length=4096 "
param+="min_learning_rate=0.000005 "
param+="save_steps=5000 "
param+="eval_steps=1000 "
param+="scale_loss=1024 "
param+="sharding_parallel_config=split_param,enable_stage1_overlap "


cd ./tests
bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh

bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
@@ -20,9 +20,10 @@ python -m pip install tiktoken
# install fused_ln custom ops
cd ../legacy/model_zoo/gpt-3/external_ops/
python setup.py install
cd -

# install tool_helpers
cd ../../../../llm/qwen
cd ../llm/
python -m pip install tool_helpers

wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy