Skip to content

Commit

Permalink
ci: Add a functional test for SFT (#373)
Browse files Browse the repository at this point in the history
Signed-off-by: Jiaqi Zeng <jiaqiz@nvidia.com>
Signed-off-by: Terry Kong <terryk@nvidia.com>
Co-authored-by: Terry Kong <terryk@nvidia.com>
  • Loading branch information
HeyyyyyyG and terrykong authored Nov 5, 2024
1 parent 8c4e61a commit e2c9695
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ jobs:
test_case:
- ppo-llama3-pp2-reshard
- dpo-llama3
- sft-llama3
- rm-llama3
with:
RUNNER: self-hosted-azure
Expand Down
86 changes: 86 additions & 0 deletions tests/functional/sft.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash

# Functional SFT test driver (llama3): prepares environment variables, data
# paths, and result directories, then (below) launches a short 2-GPU
# supervised fine-tuning run.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"
set -eoux pipefail

export NCCL_ALGO=Tree
export NVTE_APPLY_QK_LAYER_SCALING=1

# Required input: path to the pretrained .nemo checkpoint. Fail fast with a
# clear message rather than letting a later step die on a missing/empty path.
: "${PRETRAINED_CHECKPOINT_NEMO_FILE:?PRETRAINED_CHECKPOINT_NEMO_FILE must be set to a .nemo checkpoint path}"
GLOBAL_BATCH_SIZE=${GLOBAL_BATCH_SIZE:-4}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-1}
MAX_SEQ_LENGTH=${MAX_SEQ_LENGTH:-4096}

# Train and validation reuse the same dummy dataset; this is a smoke test,
# not a quality benchmark.
TRAIN_DATA_PATH="$SCRIPT_DIR/test_data/dummy-sft.jsonl"
VALID_DATA_PATH="$SCRIPT_DIR/test_data/dummy-sft.jsonl"

NAME="llama3_sft_test"

# PARAMETERS
RESULTS_DIR="/tmp/${NAME}"

GPFS=$(git rev-parse --show-toplevel)

# W&B Logging
PROJECT=llama3_sft_test

CONF_DIR="${GPFS}/examples/nlp/gpt/conf/"
CONF_NAME="gpt_sft"

CHECKPOINT_DIR="${RESULTS_DIR}/checkpoints"
TENSORBOARD_DIR="${RESULTS_DIR}/tensorboard"  # fixed typo: was TENSOBOARD_DIR

# Single mkdir for all output dirs (original created RESULTS_DIR twice).
mkdir -p "$RESULTS_DIR" "$TENSORBOARD_DIR" "$CHECKPOINT_DIR"

# Launch the SFT training run: 2 processes on 2 GPUs via mpirun.
# Globals read: GPFS, CONF_DIR, CONF_NAME, PRETRAINED_CHECKPOINT_NEMO_FILE,
#   MAX_SEQ_LENGTH, MICRO_BATCH_SIZE, GLOBAL_BATCH_SIZE, TRAIN_DATA_PATH,
#   VALID_DATA_PATH, SCRIPT_DIR, RESULTS_DIR.
# Notes on the overrides below:
#   - chat-format SFT (model.data.chat=True) with TP=1 / PP=1 on 2 devices;
#   - the escaped quoting (\'\<extra_id_N\>\') keeps the '<extra_id_N>'
#     literals intact through both the shell and Hydra's parser;
#   - trainer.sft.max_steps=5 with checkpointing disabled keeps the run a
#     fast CI smoke test;
#   - no comments may be placed inside the mpirun call: the command is a
#     single backslash-continued line.
sft() {
export CUDA_VISIBLE_DEVICES=0,1
export PYTHONPATH="${GPFS}:${PYTHONPATH:-}"
export HYDRA_FULL_ERROR=1
mpirun -np 2 --allow-run-as-root python -u ${GPFS}/examples/nlp/gpt/train_gpt_sft.py \
--config-path=${CONF_DIR} \
--config-name=${CONF_NAME} \
trainer.num_nodes=1 \
trainer.devices=2 \
++model.mcore_gpt=True \
++model.megatron_amp_O2=True \
model.restore_from_path=${PRETRAINED_CHECKPOINT_NEMO_FILE} \
exp_manager.create_checkpoint_callback=False \
model.data.num_workers=0 \
model.data.chat=True \
model.data.chat_prompt_tokens.system_turn_start=\'\<extra_id_0\>\' \
model.data.chat_prompt_tokens.turn_start=\'\<extra_id_1\>\' \
model.data.chat_prompt_tokens.label_start=\'\<extra_id_2\>\' \
model.data.train_ds.max_seq_length=${MAX_SEQ_LENGTH} \
model.data.train_ds.micro_batch_size=${MICRO_BATCH_SIZE} \
model.data.train_ds.global_batch_size=${GLOBAL_BATCH_SIZE} \
model.data.train_ds.file_path=${TRAIN_DATA_PATH} \
model.data.train_ds.index_mapping_dir=${SCRIPT_DIR}/test_data \
model.data.train_ds.add_eos=False \
model.data.train_ds.hf_dataset=True \
model.data.validation_ds.max_seq_length=${MAX_SEQ_LENGTH} \
model.data.validation_ds.file_path=${VALID_DATA_PATH} \
model.data.validation_ds.micro_batch_size=${MICRO_BATCH_SIZE} \
model.data.validation_ds.global_batch_size=${GLOBAL_BATCH_SIZE} \
model.data.validation_ds.index_mapping_dir=${SCRIPT_DIR}/test_data \
model.data.validation_ds.add_eos=False \
model.data.validation_ds.hf_dataset=True \
model.answer_only_loss=True \
++model.tensor_model_parallel_size=1 \
++model.pipeline_model_parallel_size=1 \
trainer.sft.max_steps=5 \
trainer.sft.val_check_interval=1 \
trainer.sft.limit_val_batches=8 \
trainer.sft.save_interval=0 \
exp_manager.explicit_log_dir=${RESULTS_DIR} \
++model.activations_checkpoint_granularity=full \
++model.activations_checkpoint_method=uniform \
++model.activations_checkpoint_num_layers=1 \
++model.dist_ckpt_load_strictness=log_all
}

# Capture the run log for later inspection. Because 'set -o pipefail' is
# active (set above), a failing sft run still fails the script even though
# its output is piped through tee.
log_file=$(mktemp /tmp/sft-log-XXXXXX)
sft | tee "$log_file"
8 changes: 8 additions & 0 deletions tests/functional/test_cases/sft-llama3
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# CI test case: run the functional SFT smoke test against a tiny llama3
# checkpoint stored under the CI assets directory.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"

set -eoux pipefail

# Fail fast with a clear message if the CI assets root is not configured,
# instead of an opaque unbound-variable error mid-expansion.
: "${ALIGNER_CI_DIR:?ALIGNER_CI_DIR must be set to the CI assets root}"

PRETRAINED_CHECKPOINT_NEMO_FILE=${ALIGNER_CI_DIR}/checkpoints/tiny-llama3-results-nlayers2-hidden128-ffn448-nhead4-qgroup2-megatron_gpt.nemo \
bash ../sft.sh
Loading

0 comments on commit e2c9695

Please sign in to comment.