From 3add21e2de2dceb37f417c2211892086a0f75baf Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Wed, 8 May 2024 19:45:59 +0800
Subject: [PATCH 1/2] fit for llama3

---
 paddlenlp/transformers/llama/modeling_auto.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddlenlp/transformers/llama/modeling_auto.py b/paddlenlp/transformers/llama/modeling_auto.py
index fce60999c214..697eb1bbf078
--- a/paddlenlp/transformers/llama/modeling_auto.py
+++ b/paddlenlp/transformers/llama/modeling_auto.py
@@ -367,24 +367,28 @@ def _init_rope(self):
             self.rotary_emb = LlamaRotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
+                base=self.config.rope_theta,
             )
         elif self.config.rope_scaling_type == "linear":
             self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 scaling_factor=self.config.rope_scaling_factor,
+                base=self.config.rope_theta,
             )
         elif self.config.rope_scaling_type == "ntk":
             self.rotary_emb = LlamaNTKScalingRotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 scaling_factor=self.config.rope_scaling_factor,
+                base=self.config.rope_theta,
             )
         elif self.config.rope_scaling_type == "dynamic_ntk":
             self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 scaling_factor=self.config.rope_scaling_factor,
+                base=self.config.rope_theta,
             )
         else:
             raise ValueError(f"Unknown RoPE scaling type {self.config.rope_scaling_type}")
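
Note: the change above threads the configured rope_theta into every rotary-embedding variant as the base argument. This matters for Llama 3, which uses a much larger RoPE base (500000.0) than the 10000.0 default. A minimal sketch of how that base feeds the RoPE inverse frequencies, written in plain NumPy rather than being the PaddleNLP implementation, is:

    # Sketch only: the real computation lives inside LlamaRotaryEmbedding and friends.
    import numpy as np

    def rope_inv_freq(head_dim: int, base: float) -> np.ndarray:
        # inv_freq[i] = 1 / base**(2i / head_dim), one entry per channel pair
        return 1.0 / (base ** (np.arange(0, head_dim, 2, dtype=np.float64) / head_dim))

    print(rope_inv_freq(128, 10000.0)[:3])   # default base (Llama 2 style)
    print(rope_inv_freq(128, 500000.0)[:3])  # Llama 3 base -> lower frequencies

A larger base slows the rotation of every channel pair, which is exactly the value this patch forwards from the model config instead of leaving the embeddings at their hard-coded default.
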
From db8d15d2e0b310ab70ade6d2c64c6d6cb5733ae2 Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Thu, 9 May 2024 15:26:26 +0800
Subject: [PATCH 2/2] add run scripts

---
 llm/llama/auto_parallel/run_llama3.sh | 101 ++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 llm/llama/auto_parallel/run_llama3.sh

diff --git a/llm/llama/auto_parallel/run_llama3.sh b/llm/llama/auto_parallel/run_llama3.sh
new file mode 100644
index 000000000000..d95d48b8d295
--- /dev/null
+++ b/llm/llama/auto_parallel/run_llama3.sh
@@ -0,0 +1,101 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# just for debug
+
+set -x
+unset CUDA_VISIBLE_DEVICES
+
+task_name="llama3_dp2pp4sd2"
+rm -rf output/$task_name/
+rm -rf "output/$task_name""_log"
+
+export SOT_LOG_LEVEL=4
+export PYTHONPATH=../../../:$PYTHONPATH
+
+#ulimit -c unlimited
+#export GLOG_v=10
+
+# export FLAGS_call_stack_level=3
+# export FLAGS_use_cuda_managed_memory=true
+
+# export FLAGS_embedding_deterministic=1
+# export FLAGS_cudnn_deterministic=1
+# export NVIDIA_TF32_OVERRIDE=0
+
+python -u -m paddle.distributed.launch \
+    --gpus "0,1,2,3,4,5,6,7" \
+    --log_dir "output/$task_name""_log" \
+    ./run_pretrain_auto.py \
+    --model_name_or_path "meta-llama/Meta-Llama-3-8B-Instruct" \
+    --tokenizer_name_or_path "meta-llama/Meta-Llama-3-8B-Instruct" \
+    --input_dir "./data" \
+    --output_dir "./output" \
+    --split 949,50,1 \
+    --weight_decay 0.01 \
+    --warmup_ratio 0.01 \
+    --max_grad_norm 1.0 \
+    --learning_rate 3e-05 \
+    --min_learning_rate 3e-06 \
+    --max_steps 30 \
+    --logging_steps 10 \
+    --eval_steps 1000 \
+    --save_steps 50000 \
+    --continue_training 0 \
+    --do_train true \
+    --do_eval false \
+    --do_predict false \
+    --disable_tqdm true \
+    --skip_profile_timer true \
+    --save_total_limit 2 \
+    --device gpu \
+    --disable_tqdm true \
+    --dataloader_num_workers 1 \
+    --distributed_dataloader 0 \
+    --enable_auto_parallel 1 \
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 4 \
+    --per_device_eval_batch_size 1 \
+    --recompute false \
+    --recompute_use_reentrant true \
+    --recompute_granularity full \
+    --pp_recompute_interval 0 \
+    --bf16 true \
+    --fp16_opt_level "O2" \
+    --amp_master_grad true \
+    --fuse_attention_ffn false \
+    --fuse_attention_qkv false \
+    --fused_linear_param_grad_add 1 \
+    --fuse_sequence_parallel_allreduce false \
+    --use_flash_attention true \
+    --use_fused_rope true \
+    --use_fused_rms_norm true \
+    --max_seq_length 4096 \
+    --sep_parallel_degree 1 \
+    --sequence_parallel false \
+    --pipeline_parallel_degree 2 \
+    --sharding_parallel_degree 2 \
+    --tensor_parallel_degree 1 \
+    --virtual_pp_degree 4 \
+    --pipeline_schedule_mode "VPP" \
+    --sharding "stage2" \
+    --pipeline_parallel_config "enable_send_recv_overlap" \
+    --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
+    --sharding_parallel_config "enable_stage2_overlap" \
+    --tensor_parallel_config "enable_mp_async_allreduce" \
+    --to_static 1 \
+    --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+    --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+    --skip_memory_metrics 0
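
Note: on the 8 GPUs passed to --gpus, the degrees above tile as pipeline 2 x sharding 2 x tensor 1, leaving a data-parallel degree of 2 (matching the dp2/sd2 in the task name), with virtual_pp_degree 4 applied under the VPP schedule. A small stand-alone sanity check for that tiling, a hypothetical helper rather than part of PaddleNLP, could look like:

    # Hypothetical helper: checks that the parallel degrees divide the GPU count
    # and derives the implied data-parallel degree.
    def derive_dp_degree(num_gpus: int, pp: int, mp: int, sharding: int, sep: int = 1) -> int:
        per_replica = pp * mp * sharding * sep
        if num_gpus % per_replica != 0:
            raise ValueError(f"cannot tile {num_gpus} GPUs with pp*mp*sharding*sep={per_replica}")
        return num_gpus // per_replica

    # Values from run_llama3.sh: 8 GPUs, pp=2, mp=1, sharding=2, sep=1 -> dp=2
    print(derive_dp_degree(8, pp=2, mp=1, sharding=2, sep=1))

The hybrid mesh built from these flags multiplies the degrees out, so keeping that product a divisor of the world size is the main constraint to respect when editing the script.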