diff --git a/tests/test_tipc/llm/llama2/N1C8/llama2-13b_dpo_bs16_bf16_tp4_sd2_acc4_dygraph.sh b/tests/test_tipc/llm/llama2/N1C8/llama2-13b_dpo_bs16_bf16_tp4_sd2_acc4_dygraph.sh
new file mode 100644
index 000000000000..c56ca59bbb9f
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/N1C8/llama2-13b_dpo_bs16_bf16_tp4_sd2_acc4_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=meta-llama/Llama-2-13b "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=4 "
+param+="sharding_parallel_degree=2 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=4 "
+param+="run_stage=dpo "
+param+="run_mode=tp4_sd2_acc4_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=llama2-13b_dpo "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/llama2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/llama2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/llama2/N1C8/llama2-13b_lora_bs16_bf16_tp1_sd8_acc1_dygraph.sh b/tests/test_tipc/llm/llama2/N1C8/llama2-13b_lora_bs16_bf16_tp1_sd8_acc1_dygraph.sh
new file mode 100644
index 000000000000..eea7407dc63e
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/N1C8/llama2-13b_lora_bs16_bf16_tp1_sd8_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
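+# Each key=value pair collected in ${param} below is passed as an environment
+# variable to run_benchmark.sh (see the bash -c invocation at the bottom),
+# overriding the ${var:-default} fallbacks in its _set_params function.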
+
+
+param="model_name_or_path=meta-llama/Llama-2-13b "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="sharding_parallel_degree=8 "
+param+="sharding=stage1 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=lora "
+param+="run_mode=tp1_sd8_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=llama2-13b_lora "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/llama2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/llama2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/llama2/N1C8/llama2-13b_sft_bs16_bf16_tp2_sd4_acc2_dygraph.sh b/tests/test_tipc/llm/llama2/N1C8/llama2-13b_sft_bs16_bf16_tp2_sd4_acc2_dygraph.sh
new file mode 100644
index 000000000000..78a0e1a1d7e9
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/N1C8/llama2-13b_sft_bs16_bf16_tp2_sd4_acc2_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=meta-llama/Llama-2-13b "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=2 "
+param+="sharding_parallel_degree=4 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=2 "
+param+="run_stage=sft "
+param+="run_mode=tp2_sd4_acc2_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=llama2-13b_sft "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/llama2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/llama2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/llama2/N1C8/llama2-7b_dpo_bs16_bf16_tp4_sd2_acc1_dygraph.sh b/tests/test_tipc/llm/llama2/N1C8/llama2-7b_dpo_bs16_bf16_tp4_sd2_acc1_dygraph.sh
new file mode 100644
index 000000000000..86906fc58af7
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/N1C8/llama2-7b_dpo_bs16_bf16_tp4_sd2_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=meta-llama/Llama-2-7b "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=4 "
+param+="sharding_parallel_degree=2 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=dpo "
+param+="run_mode=tp4_sd2_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=llama2-7b_dpo "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/llama2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/llama2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/llama2/N1C8/llama2-7b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh b/tests/test_tipc/llm/llama2/N1C8/llama2-7b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh
new file mode 100644
index 000000000000..0332b92f84e3
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/N1C8/llama2-7b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh
@@ -0,0 +1,33 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=meta-llama/Llama-2-7b "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="pipeline_parallel_degree=1 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=lora "
+param+="run_mode=tp1_pp1_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=llama2-7b_lora "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/llama2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/llama2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/llama2/N1C8/llama2-7b_sft_bs16_bf16_tp2_sd4_acc1_dygraph.sh b/tests/test_tipc/llm/llama2/N1C8/llama2-7b_sft_bs16_bf16_tp2_sd4_acc1_dygraph.sh
new file mode 100644
index 000000000000..3174fdc2797e
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/N1C8/llama2-7b_sft_bs16_bf16_tp2_sd4_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=meta-llama/Llama-2-7b "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=2 "
+param+="sharding_parallel_degree=4 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=sft "
+param+="run_mode=tp2_sd4_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=llama2-7b_sft "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/llama2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/llama2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/dpo.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/dpo.json
new file mode 100644
index 000000000000..d2e3e07ac54d
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/dpo.json
@@ -0,0 +1,36 @@
+{
+    "model_name_or_path": "meta-llama/Llama-2-13b",
+    "train_dataset_path": "./data/dpo_benchmark_train/train.json",
+    "dev_dataset_path": "./data/dpo_benchmark_train/dev.json",
+    "output_dir": "./checkpoints/dpo_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 4,
+    "per_device_eval_batch_size": 1,
+    "num_train_epochs": 1,
+    "learning_rate": 1e-06,
+    "warmup_steps": 10,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "max_seq_len": 8192,
+    "max_prompt_len": 4096,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "tensor_parallel_degree": 4,
+    "sharding_parallel_degree": 2,
+    "sharding": "stage2",
+    "use_flash_attention": true,
+    "flash_mask": true,
+    "recompute": true,
+    "recompute_granularity": "full",
+    "benchmark": true,
+    "unified_checkpoint": true,
+    "autotuner_benchmark": false,
+    "beta": 0.1,
+    "loss_type": "sigmoid",
+    "label_smoothing": 0.0
+}
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/lora.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/lora.json
new file mode 100644
index 000000000000..170c59ab42b4
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/lora.json
@@ -0,0 +1,39 @@
+{
+    "model_name_or_path": "meta-llama/Llama-2-13b",
+    "dataset_name_or_path": "./data/sft_benchmark_train/",
+    "output_dir": "./checkpoints/lora_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 8,
+    "eval_accumulation_steps": 16,
+    "num_train_epochs": 1,
+    "learning_rate": 3e-04,
+    "warmup_steps": 30,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "src_length": 4096,
+    "max_length": 8192,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "eval_with_do_generation": false,
+    "metric_for_best_model": "accuracy",
+    "recompute": true,
+    "recompute_granularity": "full",
+    "save_total_limit": 1,
+    "tensor_parallel_degree": 1,
+    "sharding_parallel_degree": 8,
+    "sharding": "stage1",
+    "pipeline_parallel_degree": 1,
+    "lora": true,
+    "unified_checkpoint": true,
+    "benchmark": true,
+    "zero_padding": true,
+    "flash_mask": true,
+    "use_flash_attention": true,
+    "pissa": false
+}
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/sft.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/sft.json
new file mode 100644
index 000000000000..fd538fff6c73
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-13b/sft.json
@@ -0,0 +1,37 @@
+{
+    "model_name_or_path": "meta-llama/Llama-2-13b",
+    "dataset_name_or_path": "./data/sft_benchmark_train/",
+    "output_dir": "./checkpoints/sft_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 2,
+    "per_device_eval_batch_size": 8,
+    "eval_accumulation_steps": 16,
+    "num_train_epochs": 1,
+    "learning_rate": 3e-05,
+    "warmup_steps": 30,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "src_length": 4096,
+    "max_length": 8192,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "eval_with_do_generation": false,
+    "metric_for_best_model": "accuracy",
+    "recompute": true,
+    "recompute_granularity": "full",
+    "save_total_limit": 1,
+    "benchmark": true,
+    "tensor_parallel_degree": 2,
+    "sharding_parallel_degree": 4,
+    "pipeline_parallel_degree": 1,
+    "sharding": "stage2",
+    "zero_padding": true,
+    "flash_mask": true,
+    "unified_checkpoint": true,
+    "use_flash_attention": true
+}
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/dpo.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/dpo.json
new file mode 100644
index 000000000000..cb8d0afdd74a
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/dpo.json
@@ -0,0 +1,36 @@
+{
+    "model_name_or_path": "meta-llama/Llama-2-7b",
+    "train_dataset_path": "./data/dpo_benchmark_train/train.json",
+    "dev_dataset_path": "./data/dpo_benchmark_train/dev.json",
+    "output_dir": "./checkpoints/dpo_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 1,
+    "num_train_epochs": 1,
+    "learning_rate": 1e-06,
+    "warmup_steps": 10,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "max_seq_len": 8192,
+    "max_prompt_len": 4096,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "tensor_parallel_degree": 4,
+    "sharding_parallel_degree": 2,
+    "sharding": "stage2",
+    "use_flash_attention": true,
+    "flash_mask": true,
+    "recompute": true,
+    "recompute_granularity": "full",
+    "benchmark": true,
+    "unified_checkpoint": true,
+    "autotuner_benchmark": false,
+    "beta": 0.1,
+    "loss_type": "sigmoid",
+    "label_smoothing": 0.0
+}
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/lora.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/lora.json
new file mode 100644
index 000000000000..17fa0b8a8474
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/lora.json
@@ -0,0 +1,37 @@
+{
+    "model_name_or_path": "meta-llama/Llama-2-7b",
+    "dataset_name_or_path": "./data/sft_benchmark_train/",
+    "output_dir": "./checkpoints/lora_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 8,
+    "eval_accumulation_steps": 16,
+    "num_train_epochs": 1,
+    "learning_rate": 3e-04,
+    "warmup_steps": 30,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "src_length": 4096,
+    "max_length": 8192,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+ "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "recompute_granularity": "full", + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "lora": true, + "unified_checkpoint": true, + "benchmark": true, + "zero_padding": true, + "flash_mask": true, + "use_flash_attention": true, + "pissa": false + } diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/sft.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/sft.json new file mode 100644 index 000000000000..cd496681ecb6 --- /dev/null +++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-7b/sft.json @@ -0,0 +1,37 @@ +{ + "model_name_or_path": "meta-llama/Llama-2-7b", + "dataset_name_or_path": "./data/sft_benchmark_train/", + "output_dir": "./checkpoints/sft_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 1, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 1, + "learning_rate": 3e-05, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "no", + "save_strategy": "no", + "src_length": 4096, + "max_length": 8192, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "recompute_granularity": "full", + "save_total_limit": 1, + "benchmark": true, + "tensor_parallel_degree": 2, + "sharding_parallel_degree": 4, + "pipeline_parallel_degree": 1, + "sharding": "stage2", + "zero_padding": true, + "flash_mask": true, + "unified_checkpoint": true, + "use_flash_attention": true + } diff --git a/tests/test_tipc/llm/llama2/benchmark_common/prepare.sh b/tests/test_tipc/llm/llama2/benchmark_common/prepare.sh new file mode 100644 index 000000000000..a1b917731589 --- /dev/null +++ b/tests/test_tipc/llm/llama2/benchmark_common/prepare.sh @@ -0,0 +1,35 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+python -m pip install -r ../requirements.txt
+python -m pip install -r ../requirements-dev.txt
+export PYTHONPATH=../:${PYTHONPATH}
+echo ${PYTHONPATH}
+
+
+# install fused_ln custom ops
+cd ../slm/model_zoo/gpt-3/external_ops/
+python setup.py install
+cd -
+
+# install paddlenlp_ops
+cd ../csrc/
+python setup_cuda.py install
+cd -
+
+cd ../llm
+cp -r ../tests/test_tipc/llm/llama2/benchmark_common/benchmark_json ./
+
+wget https://paddlenlp.bj.bcebos.com/llm_benchmark_data/paddle_data.tar.gz
+tar zxvf paddle_data.tar.gz && rm -rf paddle_data.tar.gz
\ No newline at end of file
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/llm/llama2/benchmark_common/run_benchmark.sh
new file mode 100644
index 000000000000..6f6b27849ea9
--- /dev/null
+++ b/tests/test_tipc/llm/llama2/benchmark_common/run_benchmark.sh
@@ -0,0 +1,143 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Test training benchmark for a model.
+# Usage:bash benchmark/run_benchmark.sh ${model_name_or_path} ${per_device_train_batch_size} ${tensor_parallel_degree} ${pipeline_parallel_degree} ${virtual_pp_degree} ${sequence_parallel} ${sharding_parallel_degree} ${sharding} ${recompute} ${run_mode} ${device_num}
+function _set_params(){
+    model_name_or_path=${model_name_or_path:-"Qwen/Qwen2.5-7B"}
+    per_device_train_batch_size=${per_device_train_batch_size:-1}
+    tensor_parallel_degree=${tensor_parallel_degree:-1}
+    pipeline_parallel_degree=${pipeline_parallel_degree:-1}
+    sharding_parallel_degree=${sharding_parallel_degree:-1}
+    sharding=${sharding:-"stage1"}
+    recompute=${recompute:-1}
+    recompute_granularity=${recompute_granularity:-"full"}
+    gradient_accumulation_steps=${gradient_accumulation_steps:-1}
+    run_stage=${run_stage:-"sft"}
+    run_mode=${run_mode:-"DP1-MP1-PP4-mbs1-acc8-recompute"}
+    device_num=${device_num:-"N1C8"}
+    global_batch_size=${global_batch_size:-16}
+    model_item=${model_item:-"qwen-qwen-7b_seqlen2048_pretrain"}
+    max_steps=${max_steps:-150}
+    logging_steps=${logging_steps:-1}
+    base_batch_size=${global_batch_size}
+
+    profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in as a global variable
+    model_repo="PaddleNLP" # (required) name of the model suite
+    speed_unit="tokens/s" # (required) unit of the speed metric
+    skip_steps=0 # (required) number of leading, performance-unstable steps to skip when parsing logs
+    keyword="Effective_Tokens_per_second_per_gpu:" # (required) keyword that marks the performance line when parsing logs
+    is_large_model=True # (optional) defaults to False for ordinary models; set True for large models that report a single ips value
+    convergence_key="loss:" # (optional) keyword that marks the convergence line when parsing logs, e.g. convergence_key="loss:"
+
+    fp_item="bf16"
+    # The lines below are the common execution commands; no changes needed unless the model is special
+    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (required) do not change this format; it is aligned with competitor naming
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[*]}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
+    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
+    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
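+    # Assumed convention: log files are named <model_repo>_<model_name>_<device_num>
+    # so the benchmark framework's analysis scripts can locate them.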
+    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
+    mkdir -p $(dirname ${train_log_file})
+
+    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
+    mkdir -p $(dirname ${profiling_log_file})
+
+    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
+    mkdir -p $(dirname ${speed_log_file})
+
+    OUTPUT_PATH=${run_log_path}/output
+
+}
+
+function _train(){
+    batch_size=${per_device_train_batch_size} # for multi-GPU single-process runs, compute the multi-GPU batch size here in _train
+
+    if [ -d $OUTPUT_PATH ]; then
+        rm -rf $OUTPUT_PATH
+    fi
+    mkdir $OUTPUT_PATH
+
+    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
+
+    if [ ${profiling} = "true" ];then
+        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
+        log_file=${profiling_log_file}
+    else
+        add_options=""
+        log_file=${train_log_file}
+    fi
+
+    case ${run_stage} in
+    sft) train_cmd="run_finetune.py benchmark_json/${model_item%_*}/sft.json" ;;
+    lora) train_cmd="run_finetune.py benchmark_json/${model_item%_*}/lora.json" ;;
+    dpo) train_cmd="alignment/dpo/run_dpo.py benchmark_json/${model_item%_*}/dpo.json" ;;
+    *) echo "choose run_stage(sft | lora | dpo)"; exit 1;
+    esac
+
+    if [ ${PADDLE_TRAINER_ID} ]
+    then
+        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
+    else
+        PADDLE_RANK_OPTION=""
+    fi
+    # The lines below are the common execution commands; no changes needed unless the model is special
+    case ${device_num} in
+    N1C1) echo "Run with: device_num=${device_num}, run_mode=${run_mode}, run_stage=${run_stage}"
+        train_cmd="python -u -m paddle.distributed.launch --log_dir=./mylog --gpus=0 ${PADDLE_RANK_OPTION}\
+            ${train_cmd}"
+        workerlog_id=0
+        ;;
+    *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}, run_stage=${run_stage}"
+        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
+            ${train_cmd}"
+        workerlog_id=0
+        ;;
+    esac
+    cd ../llm/
+    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
+    python -c "import paddlenlp"
+    if [[ ${model_name_or_path} =~ "CE" ]];then # CE accuracy run: no execution time limit
+        ${train_cmd} > ${log_file} 2>&1
+        train_exit_code=$? # keep the training command's exit status
+    else
+        timeout 30m ${train_cmd} > ${log_file} 2>&1
+        train_exit_code=$? # keep the exit status before the log post-processing below overwrites $?
+        # echo ${train_cmd}
+        Effective_Tokens_per_second=`cat ${log_file} | grep -E 'Effective_Tokens_per_second|Effective tokens per second:' \
+            |awk -F': ' '{print $2}' |awk -F' ' '{print $1}'`
+        num_gpu=$(echo "$device_num" | sed 's/^.*C//')
+        ips=$(awk -v a="$Effective_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}')
+        echo "Effective_Tokens_per_second_per_gpu: ${ips}" >> ${log_file}
+    fi
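+    # The Effective_Tokens_per_second value above is read back from the training
+    # log and divided by the GPU count parsed from device_num (e.g. N1C8 -> 8),
+    # producing the per-GPU keyword line that _set_params declares.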
+    if [ ${train_exit_code} -ne 0 ];then
+        echo -e "${model_name}, FAIL"
+    else
+        echo -e "${model_name}, SUCCESS"
+    fi
+    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${device_num} != "N1C1" -a -d mylog ]; then
+        case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog
+        cp -r ${case_path}/mylog/workerlog.* ./mylog/
+    fi
+}
+
+export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
+
+source ${BENCHMARK_ROOT}/scripts/run_model.sh # run_model.sh parses performance data from benchmark-conformant logs with analysis.py; comment this out to only produce training logs without parsing, but re-enable it before submitting
+_set_params $@
+#_train # uncomment to only produce training logs without parsing
+_run # _run is defined in run_model.sh and calls _train; comment this out to only produce training logs without parsing, but re-enable it before submitting
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_dpo_bs16_bf16_tp4_sd2_acc4_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_dpo_bs16_bf16_tp4_sd2_acc4_dygraph.sh
new file mode 100644
index 000000000000..4eb207e828fd
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_dpo_bs16_bf16_tp4_sd2_acc4_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-14B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=4 "
+param+="sharding_parallel_degree=2 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=4 "
+param+="run_stage=dpo "
+param+="run_mode=tp4_sd2_acc4_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-14b_dpo "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_lora_bs16_bf16_tp1_sd8_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_lora_bs16_bf16_tp1_sd8_acc1_dygraph.sh
new file mode 100644
index 000000000000..d040cecacef0
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_lora_bs16_bf16_tp1_sd8_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
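+# run_mode encodes the parallel layout set below: tp1 = tensor_parallel_degree 1,
+# sd8 = sharding_parallel_degree 8, acc1 = gradient_accumulation_steps 1.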
+
+
+param="model_name_or_path=Qwen/Qwen2.5-14B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="sharding_parallel_degree=8 "
+param+="sharding=stage1 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=lora "
+param+="run_mode=tp1_sd8_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-14b_lora "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_sft_bs16_bf16_tp2_sd4_acc2_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_sft_bs16_bf16_tp2_sd4_acc2_dygraph.sh
new file mode 100644
index 000000000000..ce078b50f1be
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-14b_sft_bs16_bf16_tp2_sd4_acc2_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-14B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=2 "
+param+="sharding_parallel_degree=4 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=2 "
+param+="run_stage=sft "
+param+="run_mode=tp2_sd4_acc2_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-14b_sft "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_dpo_bs16_bf16_tp1_sd1_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_dpo_bs16_bf16_tp1_sd1_acc1_dygraph.sh
new file mode 100644
index 000000000000..191020daebf6
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_dpo_bs16_bf16_tp1_sd1_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
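+# recompute is disabled below, presumably because the 1.5B model's activations
+# fit in device memory without recomputation (the 7B and 14B configs enable it).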
+
+
+param="model_name_or_path=Qwen/Qwen2.5-1.5B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="sharding_parallel_degree=1 "
+param+="sharding=stage2 "
+param+="recompute=false "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=dpo "
+param+="run_mode=tp1_sd1_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-1_5b_dpo "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh
new file mode 100644
index 000000000000..830ec3c4ca09
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh
@@ -0,0 +1,33 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-1.5B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="pipeline_parallel_degree=1 "
+param+="recompute=false "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=lora "
+param+="run_mode=tp1_pp1_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-1_5b_lora "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_sft_bs16_bf16_tp1_pp1_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_sft_bs16_bf16_tp1_pp1_acc1_dygraph.sh
new file mode 100644
index 000000000000..9bb0a9a1bfd0
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-1_5b_sft_bs16_bf16_tp1_pp1_acc1_dygraph.sh
@@ -0,0 +1,33 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-1.5B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="pipeline_parallel_degree=1 "
+param+="recompute=false "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=sft "
+param+="run_mode=tp1_pp1_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-1_5b_sft "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_dpo_bs16_bf16_tp4_sd2_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_dpo_bs16_bf16_tp4_sd2_acc1_dygraph.sh
new file mode 100644
index 000000000000..ac37b6e86fbb
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_dpo_bs16_bf16_tp4_sd2_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-7B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=4 "
+param+="sharding_parallel_degree=2 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=dpo "
+param+="run_mode=tp4_sd2_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-7b_dpo "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh
new file mode 100644
index 000000000000..bc0a6ae685ae
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_lora_bs16_bf16_tp1_pp1_acc1_dygraph.sh
@@ -0,0 +1,33 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-7B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=1 "
+param+="pipeline_parallel_degree=1 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=lora "
+param+="run_mode=tp1_pp1_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-7b_lora "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_sft_bs16_bf16_tp2_sd4_acc1_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_sft_bs16_bf16_tp2_sd4_acc1_dygraph.sh
new file mode 100644
index 000000000000..76cb9fa7e2e5
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/N1C8/qwen-qwen2_5-7b_sft_bs16_bf16_tp2_sd4_acc1_dygraph.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+param="model_name_or_path=Qwen/Qwen2.5-7B "
+param+="per_device_train_batch_size=1 "
+param+="tensor_parallel_degree=2 "
+param+="sharding_parallel_degree=4 "
+param+="sharding=stage2 "
+param+="recompute=true "
+param+="recompute_granularity=full "
+param+="gradient_accumulation_steps=1 "
+param+="run_stage=sft "
+param+="run_mode=tp2_sd4_acc1_dygraph "
+param+="device_num=N1C8 "
+param+="global_batch_size=16 "
+param+="model_item=qwen-qwen2_5-7b_sft "
+param+="max_steps=150 "
+
+cd ./tests
+bash ./test_tipc/llm/qwen2_5/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/dpo.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/dpo.json
new file mode 100644
index 000000000000..0580e1e47ff8
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/dpo.json
@@ -0,0 +1,36 @@
+{
+    "model_name_or_path": "Qwen/Qwen2.5-14B",
+    "train_dataset_path": "./data/dpo_benchmark_train/train.json",
+    "dev_dataset_path": "./data/dpo_benchmark_train/dev.json",
+    "output_dir": "./checkpoints/dpo_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 4,
+    "per_device_eval_batch_size": 1,
+    "num_train_epochs": 1,
+    "learning_rate": 1e-06,
+    "warmup_steps": 10,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "max_seq_len": 8192,
+    "max_prompt_len": 4096,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "tensor_parallel_degree": 4,
+    "sharding_parallel_degree": 2,
+    "sharding": "stage2",
+    "use_flash_attention": true,
+    "flash_mask": true,
+    "recompute": true,
+    "recompute_granularity": "full",
+    "benchmark": true,
+    "unified_checkpoint": true,
"autotuner_benchmark":false, + "beta": 0.1, + "loss_type": "sigmoid", + "label_smoothing": 0.0 + } diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/lora.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/lora.json new file mode 100644 index 000000000000..a3106cfcce59 --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/lora.json @@ -0,0 +1,39 @@ +{ + "model_name_or_path": "Qwen/Qwen2.5-14B", + "dataset_name_or_path": "./data/sft_benchmark_train/", + "output_dir": "./checkpoints/lora_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 1, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 1, + "learning_rate": 3e-04, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "no", + "save_strategy": "no", + "src_length": 4096, + "max_length": 8192, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "recompute_granularity": "full", + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "sharding_parallel_degree": 8, + "sharding": "stage1", + "pipeline_parallel_degree": 1, + "lora": true, + "unified_checkpoint": true, + "benchmark": true, + "zero_padding": true, + "flash_mask": true, + "use_flash_attention": true, + "pissa": false + } diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/sft.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/sft.json new file mode 100644 index 000000000000..9ff6c10456c0 --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-14b/sft.json @@ -0,0 +1,37 @@ +{ + "model_name_or_path": "Qwen/Qwen2.5-14B", + "dataset_name_or_path": "./data/sft_benchmark_train/", + "output_dir": "./checkpoints/sft_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 2, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 1, + "learning_rate": 3e-05, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "no", + "save_strategy": "no", + "src_length": 4096, + "max_length": 8192, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "recompute_granularity": "full", + "save_total_limit": 1, + "benchmark": true, + "tensor_parallel_degree": 2, + "sharding_parallel_degree": 4, + "pipeline_parallel_degree": 1, + "sharding": "stage2", + "zero_padding": true, + "flash_mask": true, + "unified_checkpoint": true, + "use_flash_attention": true + } diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/dpo.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/dpo.json new file mode 100644 index 000000000000..9fac0129d1d9 --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/dpo.json @@ -0,0 +1,36 @@ +{ + "model_name_or_path": "Qwen/Qwen2.5-1.5B", + "train_dataset_path": "./data/dpo_benchmark_train/train.json", + "dev_dataset_path": "./data/dpo_benchmark_train/dev.json", + "output_dir": "./checkpoints/dpo_ckpts", + "per_device_train_batch_size": 1, + 
"gradient_accumulation_steps": 1, + "per_device_eval_batch_size": 1, + "num_train_epochs": 1, + "learning_rate": 1e-06, + "warmup_steps": 10, + "logging_steps": 1, + "evaluation_strategy": "no", + "save_strategy": "no", + "max_seq_len": 8192, + "max_prompt_len": 4096, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "tensor_parallel_degree": 1, + "sharding_parallel_degree": 1, + "sharding": "stage2", + "use_flash_attention": true, + "flash_mask": true, + "recompute": false, + "recompute_granularity": "full", + "benchmark": true, + "unified_checkpoint": true, + "autotuner_benchmark":false, + "beta": 0.1, + "loss_type": "sigmoid", + "label_smoothing": 0.0 + } diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/lora.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/lora.json new file mode 100644 index 000000000000..a1a2acf64a0e --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/lora.json @@ -0,0 +1,37 @@ +{ + "model_name_or_path": "Qwen/Qwen2.5-1.5B", + "dataset_name_or_path": "./data/sft_benchmark_train/", + "output_dir": "./checkpoints/lora_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 1, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 1, + "learning_rate": 3e-04, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "no", + "save_strategy": "no", + "src_length": 4096, + "max_length": 8192, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": false, + "recompute_granularity": "full", + "save_total_limit": 1, + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "lora": true, + "unified_checkpoint": true, + "benchmark": true, + "zero_padding": true, + "flash_mask": true, + "use_flash_attention": true, + "pissa": false + } diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/sft.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/sft.json new file mode 100644 index 000000000000..4ff7267dcc5c --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-1_5b/sft.json @@ -0,0 +1,37 @@ +{ + "model_name_or_path": "Qwen/Qwen2.5-1.5B", + "dataset_name_or_path": "./data/sft_benchmark_train/", + "output_dir": "./checkpoints/sft_ckpts", + "per_device_train_batch_size": 1, + "gradient_accumulation_steps": 1, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 1, + "learning_rate": 3e-05, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "no", + "save_strategy": "no", + "src_length": 4096, + "max_length": 8192, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": false, + "recompute_granularity": "full", + "save_total_limit": 1, + "benchmark": true, + "tensor_parallel_degree": 1, + "sharding_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "sharding": "stage2", + "zero_padding": true, + "flash_mask": true, + "unified_checkpoint": true, + "use_flash_attention": true + } diff --git 
new file mode 100644
index 000000000000..f47f63ef8c11
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/dpo.json
@@ -0,0 +1,36 @@
+{
+    "model_name_or_path": "Qwen/Qwen2.5-7B",
+    "train_dataset_path": "./data/dpo_benchmark_train/train.json",
+    "dev_dataset_path": "./data/dpo_benchmark_train/dev.json",
+    "output_dir": "./checkpoints/dpo_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 1,
+    "num_train_epochs": 1,
+    "learning_rate": 1e-06,
+    "warmup_steps": 10,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "max_seq_len": 8192,
+    "max_prompt_len": 4096,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "tensor_parallel_degree": 4,
+    "sharding_parallel_degree": 2,
+    "sharding": "stage2",
+    "use_flash_attention": true,
+    "flash_mask": true,
+    "recompute": true,
+    "recompute_granularity": "full",
+    "benchmark": true,
+    "unified_checkpoint": true,
+    "autotuner_benchmark": false,
+    "beta": 0.1,
+    "loss_type": "sigmoid",
+    "label_smoothing": 0.0
+}
diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/lora.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/lora.json
new file mode 100644
index 000000000000..de1169384551
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/lora.json
@@ -0,0 +1,37 @@
+{
+    "model_name_or_path": "Qwen/Qwen2.5-7B",
+    "dataset_name_or_path": "./data/sft_benchmark_train/",
+    "output_dir": "./checkpoints/lora_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 8,
+    "eval_accumulation_steps": 16,
+    "num_train_epochs": 1,
+    "learning_rate": 3e-04,
+    "warmup_steps": 30,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "src_length": 4096,
+    "max_length": 8192,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "do_train": true,
+    "do_eval": false,
+    "disable_tqdm": true,
+    "load_best_model_at_end": true,
+    "eval_with_do_generation": false,
+    "metric_for_best_model": "accuracy",
+    "recompute": true,
+    "recompute_granularity": "full",
+    "save_total_limit": 1,
+    "tensor_parallel_degree": 1,
+    "pipeline_parallel_degree": 1,
+    "lora": true,
+    "unified_checkpoint": true,
+    "benchmark": true,
+    "zero_padding": true,
+    "flash_mask": true,
+    "use_flash_attention": true,
+    "pissa": false
+}
diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/sft.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/sft.json
new file mode 100644
index 000000000000..af7987f4b9b1
--- /dev/null
+++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-7b/sft.json
@@ -0,0 +1,37 @@
+{
+    "model_name_or_path": "Qwen/Qwen2.5-7B",
+    "dataset_name_or_path": "./data/sft_benchmark_train/",
+    "output_dir": "./checkpoints/sft_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 8,
+    "eval_accumulation_steps": 16,
+    "num_train_epochs": 1,
+    "learning_rate": 3e-05,
+    "warmup_steps": 30,
+    "logging_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "no",
+    "src_length": 4096,
"max_length": 8192, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": false, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "recompute_granularity": "full", + "save_total_limit": 1, + "benchmark": true, + "tensor_parallel_degree": 2, + "sharding_parallel_degree": 4, + "pipeline_parallel_degree": 1, + "sharding": "stage2", + "zero_padding": true, + "flash_mask": true, + "unified_checkpoint": true, + "use_flash_attention": true + } diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/prepare.sh b/tests/test_tipc/llm/qwen2_5/benchmark_common/prepare.sh new file mode 100644 index 000000000000..416ded186efb --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/prepare.sh @@ -0,0 +1,35 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +python -m pip install -r ../requirements.txt +python -m pip install -r ../requirements-dev.txt +export PYTHONPATH=../:${PYTHONPATH} +echo ${PYTHONPATH} + + +# install fused_ln custom ops +cd ../slm/model_zoo/gpt-3/external_ops/ +python setup.py install +cd - + +# install paddlenlp_ops +cd ../csrc/ +python setup_cuda.py install +cd - + +cd ../llm +cp -r ../tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json ./ + +wget https://paddlenlp.bj.bcebos.com/llm_benchmark_data/paddle_data.tar.gz +tar zxvf paddle_data.tar.gz && rm -rf paddle_data.tar.gz \ No newline at end of file diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh b/tests/test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh new file mode 100644 index 000000000000..6f6b27849ea9 --- /dev/null +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Test training benchmark for a model. 
+function _set_params(){
+    model_name_or_path=${model_name_or_path:-"Qwen/Qwen2.5-7B"}
+    per_device_train_batch_size=${per_device_train_batch_size:-1}
+    tensor_parallel_degree=${tensor_parallel_degree:-1}
+    pipeline_parallel_degree=${pipeline_parallel_degree:-1}
+    sharding_parallel_degree=${sharding_parallel_degree:-1}
+    sharding=${sharding:-"stage1"}
+    recompute=${recompute:-1}
+    recompute_granularity=${recompute_granularity:-"full"}
+    gradient_accumulation_steps=${gradient_accumulation_steps:-1}
+    run_stage=${run_stage:-"sft"}
+    run_mode=${run_mode:-"DP1-MP1-PP4-mbs1-acc8-recompute"}
+    device_num=${device_num:-"N1C8"}
+    global_batch_size=${global_batch_size:-16}
+    model_item=${model_item:-"qwen-qwen-7b_seqlen2048_pretrain"}
+    max_steps=${max_steps:-150}
+    logging_steps=${logging_steps:-1}
+    base_batch_size=${global_batch_size}
+
+    profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in as a global variable
+    model_repo="PaddleNLP" # (required) name of the model suite
+    speed_unit="tokens/s" # (required) unit of the speed metric
+    skip_steps=0 # (required) number of leading, performance-unstable steps to skip when parsing logs
+    keyword="Effective_Tokens_per_second_per_gpu:" # (required) keyword that marks the performance line when parsing logs
+    is_large_model=True # (optional) defaults to False for ordinary models; set True for large models that report a single ips value
+    convergence_key="loss:" # (optional) keyword that marks the convergence line when parsing logs, e.g. convergence_key="loss:"
+
+    fp_item="bf16"
+    # The lines below are the common execution commands; no changes needed unless the model is special
+    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (required) do not change this format; it is aligned with competitor naming
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[*]}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
+    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
+    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
+    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
+    mkdir -p $(dirname ${train_log_file})
+
+    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
+    mkdir -p $(dirname ${profiling_log_file})
+
+    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
+    mkdir -p $(dirname ${speed_log_file})
+
+    OUTPUT_PATH=${run_log_path}/output
+
+}
+
+function _train(){
+    batch_size=${per_device_train_batch_size} # for multi-GPU single-process runs, compute the multi-GPU batch size here in _train
+
+    if [ -d $OUTPUT_PATH ]; then
+        rm -rf $OUTPUT_PATH
+    fi
+    mkdir $OUTPUT_PATH
+
+    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
+
+    if [ ${profiling} = "true" ];then
+        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
+        log_file=${profiling_log_file}
+    else
+        add_options=""
+        log_file=${train_log_file}
+    fi
+
+    case ${run_stage} in
+    sft) train_cmd="run_finetune.py benchmark_json/${model_item%_*}/sft.json" ;;
+    lora) train_cmd="run_finetune.py benchmark_json/${model_item%_*}/lora.json" ;;
+    dpo) train_cmd="alignment/dpo/run_dpo.py benchmark_json/${model_item%_*}/dpo.json" ;;
+    *) echo "choose run_stage(sft | lora | dpo)"; exit 1;
+    esac
+
+    if [ ${PADDLE_TRAINER_ID} ]
+    then
+        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
+    else
+        PADDLE_RANK_OPTION=""
+    fi
+    # The lines below are the common execution commands; no changes needed unless the model is special
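+    # N1C1 launches on a single GPU; any other device_num value falls through
+    # to the 8-GPU launch branch below.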
run_stage=${run_stage}" + train_cmd="python -u -m paddle.distributed.launch --log_dir=./mylog --gpus=0 ${PADDLE_RANK_OPTION}\ + ${train_cmd}" + workerlog_id=0 + ;; + *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}, run_stage=${run_stage}" + train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ + ${train_cmd}" + workerlog_id=0 + ;; + esac + cd ../llm/ + echo "train_cmd: ${train_cmd} log_file: ${log_file}" + python -c "import paddlenlp" + if [[ ${model_name_or_path} =~ "CE" ]];then # CE精度-不限制执行时间 + ${train_cmd} > ${log_file} 2>&1 + else + timeout 30m ${train_cmd} > ${log_file} 2>&1 + # echo ${train_cmd} + Effective_Tokens_per_second=`cat ${log_file} | grep -E 'Effective_Tokens_per_second|Effective tokens per second:' \ + |awk -F': ' '{print $2}' |awk -F' ' '{print $1}'` + num_gpu=$(echo "$device_num" | sed 's/^.*C//') + ips=$(awk -v a="$Effective_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}') + echo "Effective_Tokens_per_second_per_gpu: ${ips}" >> ${log_file} + fi + if [ $? -ne 0 ];then + echo -e "${model_name}, FAIL" + else + echo -e "${model_name}, SUCCESS" + fi + #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + if [ ${device_num} != "N1C1" -a -d mylog ]; then + case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog + cp -r ${case_path}/mylog/workerlog.* ./mylog/ + fi +} + +export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH + +source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 +_set_params $@ +#_train # 如果只产出训练log,不解析,可取消注释 +_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开