PaddlePaddle · JZ-LIANG · May 30, 2024 · May 10, 2024 · May 10, 2024 · May 16, 2024
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/phi/core/distributed/nccl_comm_context.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/process_group.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
@@ -39,6 +40,17 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
 
     int root = ctx.Attr<int>("root");
 
+    auto map = distributed::ProcessGroupMapFromGid::getInstance();
+    if (map->has(rid)) {
+      distributed::ProcessGroup* pg = map->get(rid);
+      auto b_opts = distributed::BroadcastOptions();
+      b_opts.source_rank = rid;
+      b_opts.source_root = root;
+      auto task = pg->Broadcast(out, *x, b_opts, false);
+      task->Wait();
+      return;
+    }
+
     gpuStream_t stream = ctx.cuda_device_context().stream();
     const auto& comm_context_manager =
         phi::distributed::CommContextManager::GetInstance();

diff --git a/python/paddle/distributed/communication/stream/broadcast.py b/python/paddle/distributed/communication/stream/broadcast.py
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle import framework
+from paddle import _C_ops, framework
 from paddle.base import data_feeder
 from paddle.distributed.communication.group import (
     _get_global_group,
     _get_or_throw_group_rank,
     _warn_cur_rank_not_in_group,
 )
+from paddle.distributed.communication.reduce import _to_inplace_op
+from paddle.framework import in_pir_mode
 
 
 def _broadcast_in_dygraph(
@@ -59,6 +61,11 @@ def _broadcast_in_static_mode(
     helper = framework.LayerHelper(op_type, **locals())
     ring_id = 0 if group is None else group.id
 
+    if in_pir_mode():
+        op_type = _to_inplace_op(op_type)
+        getattr(_C_ops, op_type)(tensor, ring_id, src_rank_in_group, sync_op)
+        return
+
     helper.append_op(
         type=op_type,
         inputs={'X': [tensor]},

diff --git a/test/collective/process_group_nccl_pir.py b/test/collective/process_group_nccl_pir.py
@@ -321,6 +321,58 @@ def test_allreduce_prod_with_0d_input(self):
                         np.multiply(x_np, y_np), y_out
                     )
 
+    def test_broadcast(self):
+        # to_tensor does not support float16 input, so skip the float16 case
+        if self.dtype == "float16":
+            return
+        pg = self.pg
+        # array turned into the input tensor on rank 0
+        x_np = np.random.random(self.shape).astype(self.dtype)
+        # array used on every other rank; the final assertion compares to it
+        y_np = np.random.random(self.shape).astype(self.dtype)
+        with paddle.pir_utils.IrGuard():
+            main_program = paddle.static.Program()
+            startup_program = paddle.static.Program()
+            with paddle.static.program_guard(main_program, startup_program):
+                if pg.rank() == 0:
+                    data = paddle.to_tensor(x_np)
+                else:
+                    data = paddle.to_tensor(y_np)
+                dist.broadcast(data, 1)  # presumably src rank 1 - confirm
+                exe = paddle.static.Executor()
+                (data,) = exe.run(
+                    main_program,
+                    feed={},
+                    fetch_list=[data],
+                )
+                np.testing.assert_array_equal(y_np, data)
+
+    def test_broadcast_with_0d_input(self):
+        # to_tensor does not support float16 input, so skip the float16 case
+        if self.dtype == "float16":
+            return
+        pg = self.pg
+        # array built with shape [] and turned into the input tensor on rank 0
+        x_np = np.random.random([]).astype(self.dtype)
+        # shape [] array used on every other rank; the assertion compares to it
+        y_np = np.random.random([]).astype(self.dtype)
+        with paddle.pir_utils.IrGuard():
+            main_program = paddle.static.Program()
+            startup_program = paddle.static.Program()
+            with paddle.static.program_guard(main_program, startup_program):
+                if pg.rank() == 0:
+                    data = paddle.to_tensor(x_np)
+                else:
+                    data = paddle.to_tensor(y_np)
+                dist.broadcast(data, 1)  # presumably src rank 1 - confirm
+                exe = paddle.static.Executor()
+                (data,) = exe.run(
+                    main_program,
+                    feed={},
+                    fetch_list=[data],
+                )
+                np.testing.assert_array_equal(y_np, data)
+
 
 class TestProcessGroupFp16(TestProcessGroupFp32):
     def setUp(self):