add enable_sp_async_reduce_scatter (#8803)
DesmonDay authored Jul 25, 2024
1 parent 77f6e98 commit 5fd6dd2
Showing 1 changed file with 6 additions and 1 deletion.
paddlenlp/trainer/training_args.py: 6 additions & 1 deletion
@@ -245,6 +245,7 @@ class TrainingArguments:
enable_mp_async_allreduce, when set to True, overlaps all_reduce(dx) with matmul(dw) in the ColumnParallelLinear backward pass, which can accelerate model parallel performance.
enable_mp_skip_c_identity, skips c_identity in ColumnParallelLinear and RowParallelLinear. It only works when enable_mp_async_allreduce is set. It can accelerate model parallel further.
enable_mp_fused_linear_param_grad_add, uses fused_linear_param_grad_add in ColumnParallelLinear (CUDA >= 11.6). It only works when enable_mp_async_allreduce is set. It can accelerate model parallel further.
enable_sp_async_reduce_scatter, performs the reduce_scatter in ColumnSequenceParallelLinear asynchronously. It only works when sequence parallel is enabled. It can accelerate sequence parallel further.
enable_delay_scale_loss, accumulates gradients until the optimizer step and divides all gradients by the accumulation steps, instead of dividing the loss by the accumulation steps directly.
sync_param, in the optimizer step, use broadcast to sync parameters whose attr 'is_distributed' is False.
sync_grad, in the optimizer step, use broadcast to sync gradients whose attr 'is_distributed' is False.
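(Editor's aside, not part of the diff: the options above are passed to the trainer as a single space-separated string. The sketch below shows how the new flag might be enabled alongside enable_mp_async_allreduce; the field names tensor_parallel_degree and tensor_parallel_config are assumed from the surrounding TrainingArguments definition, and the values are placeholders.)

# Hedged usage sketch (not part of this commit). Field names other than
# output_dir are assumptions based on the surrounding TrainingArguments file.
from paddlenlp.trainer import TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints",  # placeholder path
    tensor_parallel_degree=4,    # assumed field: model parallel degree
    tensor_parallel_config="enable_mp_async_allreduce enable_sp_async_reduce_scatter",
)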
@@ -629,6 +630,7 @@ class TrainingArguments:
"enable_mp_async_allreduce, it supports all_reduce(dx) overlap with matmul(dw) in ColumnParallelLinear backward when it set True, which can accelerate model parallel performance. \n"
"enable_mp_skip_c_identity, it supports skip c_identity in ColumnParallelLinear and RowParallelLinear. It only works when set mp_async_allreduce is True. It can accelerate model parallel further.\n"
"enable_mp_fused_linear_param_grad_add, it supports fused_linear_param_grad_add in ColumnParallelLinear (cuda >= 11.6). It only works when mp_async_allreduce is true. It can accelerate model parallel further.\n"
"enable_sp_async_reduce_scatter, it supports async reduce_scatter in ColumnSequenceParallelLinear. It only works when set sp_async_reduce_scatter is True. It can accelerate sequence parallel further.\n"
"enable_delay_scale_loss, accumulate gradients until optimizer step, all gradients div by accumute step. instead of div accumute step on loss directly.\n"
"sync_param, in optimizer step, use broadcast to sync parameters those attr 'is_distributed' is False.\n"
"sync_grad, in optimizer step, use broadcast to sync gradients those attr 'is_distributed' is False.\n"
@@ -1128,14 +1130,15 @@ def split_parallel_config(parallel_config):
"enable_mp_async_allreduce",
"enable_mp_skip_c_identity",
"enable_mp_fused_linear_param_grad_add",
"enable_sp_async_reduce_scatter",
"enable_delay_scale_loss",
"sync_param",
"sync_grad",
"sync_moment",
]:
raise ValueError(
f"Found unknown tensor parallell config {x}, "
f"accept config is enable_mp_async_allreduce, enable_mp_skip_c_identity, enable_mp_fused_linear_param_grad_add, sync_param, sync_grad and sync_moment."
f"accept config is enable_mp_async_allreduce, enable_mp_skip_c_identity, enable_mp_fused_linear_param_grad_add, enable_sp_async_reduce_scatter, enable_delay_scale_loss, sync_param, sync_grad and sync_moment."
)
try:
if "enable_mp_async_allreduce" in mp_config:
@@ -1153,6 +1156,8 @@ def split_parallel_config(parallel_config):
warnings.warn(
"enable_mp_fused_linear_param_grad_add only works with enable_mp_async_allreduce. It will not work."
)
if "enable_sp_async_reduce_scatter" in mp_config:
strategy.hybrid_configs["mp_configs"].sp_async_reduce_scatter = True

sync_param = "sync_param" in mp_config
sync_grad = "sync_grad" in mp_config