Skip to content

Commit

Permalink
[NewComm] Set new communication library as default. (PaddlePaddle#57768)
Browse files Browse the repository at this point in the history
* [NewComm] Set FLAGS_dynamic_static_unified_comm to `True` by default.
The new communication library will be used by default.

* Polish code.

* Fix problems of distributed testcases using new comm lib.

* Fix problems of testcases using the new comm lib by default.

* Fix failed testcase.

* Fix failed testcases.
  • Loading branch information
GhostScreaming authored Oct 11, 2023
1 parent 88a8d61 commit f93f0ea
Show file tree
Hide file tree
Showing 13 changed files with 13 additions and 8 deletions.
4 changes: 2 additions & 2 deletions paddle/phi/core/flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1338,13 +1338,13 @@ PHI_DEFINE_EXPORTED_int32(
* Communication library related FLAG
* Name: FLAGS_dynamic_static_unified_comm
* Since Version: 2.5
* Value Range: bool, default=false
* Value Range: bool, default=true
* Example:
* Note: Whether to use new communication library in auto parallel and static
* mode. If true, it will use unified CommContextManager for communication.
*/
PHI_DEFINE_EXPORTED_bool(dynamic_static_unified_comm,
false,
true,
"Whether to use new communication library in auto "
"parallel and static mode.");
#endif // FLAGS_dynamic_static_unified_comm
5 changes: 0 additions & 5 deletions test/collective/fleet/c_comm_init_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,6 @@

import paddle
from paddle import base
from paddle.distributed.fleet.base.private_helper_function import (
wait_server_ready,
)

paddle.enable_static()

Expand All @@ -35,8 +32,6 @@ def setUp(self):
self.exe = base.Executor(self.place)
self.endpoints.remove(self.current_endpoint)
self.other_endpoints = self.endpoints
if self.rank == 0:
wait_server_ready(self.other_endpoints)

def test_specifying_devices(self):
program = base.Program()
Expand Down
1 change: 1 addition & 0 deletions test/collective/fleet/test_fused_attention_pass_with_mp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@
set -e
# use default values
# FIXME: random fails on Unknown command lines -c (or -m).
export FLAGS_dynamic_static_unified_comm=0
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch fused_attention_pass_with_mp.py
1 change: 1 addition & 0 deletions test/distributed_passes/auto_parallel_pass_test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class AutoPallelPassTestBase(DistPassTestBase):
def setUp(self):
paddle.enable_static()
seed = int(os.environ.get('SEED', -1))
os.environ["FLAGS_dynamic_static_unified_comm"] = "0"
if seed <= 0:
seed = np.random.randint(low=1, high=1000000, size=[1])[0]
os.environ['SEED'] = str(seed)
Expand Down
1 change: 1 addition & 0 deletions test/distributed_passes/dist_pass_test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def setUp(self):
if paddle.is_compiled_with_cuda():
paddle.set_flags({'FLAGS_cudnn_deterministic': 1})

os.environ["FLAGS_dynamic_static_unified_comm"] = "0"
seed = int(os.environ.get('SEED', -1))
if seed <= 0:
seed = np.random.randint(low=1, high=1000000, size=[1])[0]
Expand Down
1 change: 1 addition & 0 deletions test/legacy_test/test_collective_api_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ def check_with_place(
"PATH_ID": path_id,
"DTYPE": dtype,
"REDUCE_TYPE": str(reduce_type),
"FLAGS_dynamic_static_unified_comm": "0",
}
required_envs.update(additional_envs)
required_envs.update(need_envs)
Expand Down
2 changes: 1 addition & 1 deletion test/legacy_test/test_collective_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ def check_with_place(
"LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
"GLOG_v": "3",
"NCCL_P2P_DISABLE": "1",
"Flags_dynamic_static_unified_comm": "False",
"FLAGS_dynamic_static_unified_comm": "0",
"DTYPE": "float32",
}
required_envs.update(need_envs)
Expand Down
1 change: 1 addition & 0 deletions test/legacy_test/test_dist_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1692,6 +1692,7 @@ def _get_required_envs(self, check_error_log=False, need_envs={}):
"NCCL_P2P_DISABLE": "1",
"NCCL_SHM_DISABLE": "1",
"FLAGS_new_executor_static_build": "1",
"FLAGS_dynamic_static_unified_comm": "0",
}

if check_error_log:
Expand Down
1 change: 1 addition & 0 deletions test/legacy_test/test_dist_hapi_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def start_local_trainers(
"PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
"PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
"FLAGS_dynamic_static_unified_comm": "0",
}

current_env.update(proc_env)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def run_test(
os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm)
os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps)
os.environ['USE_MASTER_ACC_GRAD'] = str(1 if use_master_acc_grad else 0)
os.environ["FLAGS_dynamic_static_unified_comm"] = "0"
os.environ.update(need_env)

touch_file_env = 'SUCCESS_TOUCH_FILE'
Expand Down
1 change: 1 addition & 0 deletions test/legacy_test/test_parallel_dygraph_dataparallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def start_local_trainers(
"PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
"PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
"FLAGS_dynamic_static_unified_comm": "0",
}

proc_env["FLAGS_allocator_strategy"] = allocator_strategy
Expand Down
1 change: 1 addition & 0 deletions test/legacy_test/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,4 +207,5 @@ def test_ps_4(self):


if __name__ == '__main__':
os.environ["FLAGS_dynamic_static_unified_comm"] = "0"
unittest.main()
1 change: 1 addition & 0 deletions tools/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@

def main():
sys.path.append(os.getcwd())
os.environ["FLAGS_dynamic_static_unified_comm"] = "false"
if core.is_compiled_with_cuda() or core.is_compiled_with_rocm():
if os.getenv('FLAGS_enable_gpu_memory_usage_log') is None:
os.environ['FLAGS_enable_gpu_memory_usage_log'] = 'true'
Expand Down

0 comments on commit f93f0ea

Please sign in to comment.