PaddlePaddle · LiYuRio · Dec 22, 2023 · Dec 9, 2023 · Dec 9, 2023 · Dec 12, 2023
diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc
@@ -51,6 +51,7 @@
 #include "paddle/phi/core/distributed/auto_parallel/reshard/s_to_r_reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard/s_to_s_reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard/same_status_reshard_function.h"
+#include "paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.h"
 #include "paddle/phi/core/enforce.h"
 
 #ifdef PADDLE_WITH_DISTRIBUTE
@@ -233,6 +234,10 @@ void BindAutoParallel(py::module *m) {
       *m, "PToSReshardFunction", ReshardFunction)
       .def(py::init<>());
 
+  py::class_<phi::distributed::XToRShrinkReshardFunction>(
+      *m, "XToRShrinkReshardFunction", ReshardFunction)
+      .def(py::init<>());
+
   py::class_<phi::distributed::SameNdMeshReshardFunction>(
       *m, "SameNdMeshReshardFunction", ReshardFunction)
       .def(py::init<>());

diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/CMakeLists.txt b/paddle/phi/core/distributed/auto_parallel/reshard/CMakeLists.txt
@@ -10,6 +10,7 @@ collect_srcs(
   s_to_s_reshard_function.cc
   p_to_s_reshard_function.cc
   s_to_p_reshard_function.cc
+  x_to_r_reshard_function.cc
   nd_mesh_reshard_function.cc
   same_status_reshard_function.cc
   reshard_function_registry.cc)
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/p_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/p_to_r_reshard_function.cc
@@ -105,6 +105,8 @@ bool PToRReshardFunctionCrossMesh::IsSuitable(
 
   RESHARD_SHORTCUT_IF_FALSE(in_process_mesh.ndim() == 1);
   RESHARD_SHORTCUT_IF_FALSE(out_process_mesh.ndim() == 1);
+  RESHARD_SHORTCUT_IF_FALSE(in_process_mesh.shape() ==
+                            out_process_mesh.shape());
   RESHARD_SHORTCUT_IF_FALSE(in_process_mesh != out_process_mesh);
 
   return true;

diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function_registry.cc b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function_registry.cc
@@ -24,6 +24,7 @@
 #include "paddle/phi/core/distributed/auto_parallel/reshard/s_to_r_reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard/s_to_s_reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard/same_status_reshard_function.h"
+#include "paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.h"
 
 namespace phi {
 namespace distributed {
@@ -65,6 +66,7 @@ REGISTER_RESHARD_FUNC(PToSReshardFunction);
 REGISTER_RESHARD_FUNC(PToSReshardFunctionCrossMesh);
 REGISTER_RESHARD_FUNC(SToSReshardFunction);
 REGISTER_RESHARD_FUNC(SToSReshardFunctionCrossMesh);
+REGISTER_RESHARD_FUNC(XToRShrinkReshardFunction);
 REGISTER_RESHARD_FUNC(SameStatusReshardFunction);
 REGISTER_RESHARD_FUNC(SameNdMeshReshardFunction);
 REGISTER_RESHARD_FUNC(CrossNdMeshReshardFunction);

diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc
@@ -0,0 +1,151 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.h"
+
+#include "glog/logging.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+#include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h"
+#include "paddle/phi/core/distributed/store/store_utils.h"
+#include "paddle/phi/kernels/add_n_kernel.h"
+#include "paddle/phi/kernels/concat_kernel.h"
+#include "paddle/phi/kernels/elementwise_add_kernel.h"
+#include "paddle/phi/kernels/p_recv_kernel.h"
+#include "paddle/phi/kernels/p_send_kernel.h"
+
+namespace phi {
+namespace distributed {
+
+namespace {
+
+// Sorted union of both id lists (copies are sorted: set_union needs order).
+std::vector<int64_t> GetUnionProcessIds(std::vector<int64_t> in_process_ids,
+                                        std::vector<int64_t> out_process_ids) {
+  auto& lhs = in_process_ids;
+  auto& rhs = out_process_ids;
+  std::sort(lhs.begin(), lhs.end());
+  std::sort(rhs.begin(), rhs.end());
+  std::vector<int64_t> merged;
+  auto sink = std::back_inserter(merged);
+  std::set_union(lhs.begin(), lhs.end(), rhs.begin(), rhs.end(), sink);
+  return merged;
+}
+
+}  // namespace
+
+// Checks, via the RESHARD_SHORTCUT_IF_FALSE guards: replicated target, both
+// meshes 1-D, source process count != 1, target process count == 1.
+bool XToRShrinkReshardFunction::IsSuitable(
+    const DistTensor& in, const TensorDistAttr& out_dist_attr) {
+  RESHARD_SHORTCUT_IF_FALSE(out_dist_attr.is_replicated());
+
+  const auto& src_attr = in.dist_attr();
+  const auto& src_mesh = src_attr.process_mesh();
+  const auto& dst_mesh = out_dist_attr.process_mesh();
+
+  RESHARD_SHORTCUT_IF_FALSE(src_mesh.ndim() == 1);
+  RESHARD_SHORTCUT_IF_FALSE(dst_mesh.ndim() == 1);
+  RESHARD_SHORTCUT_IF_FALSE(src_mesh.process_ids().size() != 1);
+  RESHARD_SHORTCUT_IF_FALSE(dst_mesh.process_ids().size() == 1);
+  return true;
+}
+// Builds `out` on rank out_process_ids[0] from the per-rank values of `in`.
+void XToRShrinkReshardFunction::Eval(phi::DeviceContext* dev_ctx,
+                                     const DistTensor& in,
+                                     const TensorDistAttr& out_dist_attr,
+                                     DistTensor* out) {
+  VLOG(3) << "Call XToRShrinkReshardFunction Eval";
+  const auto& in_dist_attr = in.dist_attr();
+  const auto& in_dims_mapping = in_dist_attr.dims_mapping();
+  const auto& in_mesh = in_dist_attr.process_mesh();
+  const auto& out_mesh = out_dist_attr.process_mesh();
+  const auto& in_process_ids = in_mesh.process_ids();
+  const auto& out_process_ids = out_mesh.process_ids();
+  int64_t cur_global_rank = GetCurGlobalRank();
+  int64_t root_rank = out_process_ids[0];  // target of every send below
+  auto dtype = in.dtype();
+  const auto& in_partial_status = in_dist_attr.partial_status();
+  auto all_process_ids = GetUnionProcessIds(in_process_ids, out_process_ids);
+  std::unordered_map<int64_t, DenseTensor> rank_to_result;  // recv buffers
+  bool dynamic_shape = true;  // forwarded to PSendKernel and PRecv
+  // Step 1: unless the input is replicated, every non-root rank sends its
+  // local value to the root, which receives once per other id in the union.
+  if (!in_dist_attr.is_replicated()) {
+    if (cur_global_rank != root_rank) {
+      // Non-root side: send the local dense value to root_rank.
+      RESHARD_FUNCTOR_WITH_COMM(dev_ctx,
+                                PSendKernel,
+                                dtype,
+                                all_process_ids,
+                                in.value(),
+                                root_rank,
+                                dynamic_shape);
+    } else {  // root: one PRecv per other id in all_process_ids
+      for (size_t i = 0; i < all_process_ids.size(); ++i) {
+        if (all_process_ids[i] != root_rank) {
+          rank_to_result.emplace(all_process_ids[i], DenseTensor());
+          RESHARD_FUNCTOR_WITH_COMM(dev_ctx,
+                                    PRecv,
+                                    dtype,
+                                    all_process_ids,
+                                    all_process_ids[i],
+                                    dynamic_shape,
+                                    &rank_to_result[all_process_ids[i]]);
+        }
+      }
+    }
+  }
+  // Step 2 (root only): order the local and received tensors by in-mesh
+  // position, then concat (shard), sum (partial), or copy the local value.
+  if (cur_global_rank == root_rank) {  // other ranks never touch `out`
+    std::vector<const DenseTensor*> input_vec;
+    for (size_t i = 0; i < in_process_ids.size(); ++i) {  // in-mesh order
+      if (in_process_ids[i] == cur_global_rank) {
+        input_vec.emplace_back(&(in.value()));
+      } else {  // operator[] default-inserts when the id is absent
+        input_vec.emplace_back(&(rank_to_result[in_process_ids[i]]));
+      }
+    }
+    if (in_dist_attr.is_shard()) {  // concat along the split axis
+      int split_axis =
+          GetSplitAxisWithDimsMapping(in_dims_mapping).begin()->first;
+      RESHARD_FUNCTOR(
+          dev_ctx, Concat, dtype, input_vec, split_axis, GetMutableTensor(out));
+    } else if (in_dist_attr.is_partial()) {  // only kRedSum is handled
+      auto in_reduce_type = in_partial_status.at(0);
+      if (in_reduce_type == ReduceType::kRedSum) {
+        DenseTensor result_add_out = *input_vec[0];
+        for (size_t i = 1; i < input_vec.size(); ++i) {
+          RESHARD_FUNCTOR(dev_ctx,
+                          Add,
+                          dtype,
+                          *input_vec[i],
+                          result_add_out,
+                          &result_add_out);  // accumulates in place
+        }
+        SetValue(out, result_add_out);
+      } else {
+        PADDLE_THROW(phi::errors::Unavailable(
+            "The reduce type is not supported, will be supported soon."));
+      }
+    } else {  // neither shard nor partial: pass the local value through
+      SetValue(out, in.value());
+    }
+    SetDistProps(out, in.dims(), out_dist_attr);
+  }
+}
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.h b/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_function.h"
+
+namespace phi {
+namespace distributed {
+
+// ReshardFunction subclass whose Name() reports "XToRShrinkReshard".
+class XToRShrinkReshardFunction final : public ReshardFunction {
+ public:
+  std::string Name() override { return "XToRShrinkReshard"; }
+  // Overrides defined out of line in the matching .cc file.
+  bool IsSuitable(const DistTensor& in,
+                  const TensorDistAttr& out_dist_attr) override;
+  void Eval(DeviceContext* dev_ctx,
+            const DistTensor& in,
+            const TensorDistAttr& out_dist_attr,
+            DistTensor* out) override;
+};
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc
@@ -92,6 +92,9 @@ PD_REGISTER_KERNEL(add_grad,
                    double,
                    int16_t,
                    int,
+                   bool,
+                   uint8_t,
+                   int8_t,
                    int64_t,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}

diff --git a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc
@@ -78,6 +78,9 @@ PD_REGISTER_KERNEL(add,
                    double,
                    int16_t,
                    int,
+                   bool,
+                   uint8_t,
+                   int8_t,
                    int64_t,
                    complex64,
                    complex128) {}
@@ -90,6 +93,9 @@ PD_REGISTER_KERNEL(grad_add,
                    double,
                    int16_t,
                    int,
+                   bool,
+                   uint8_t,
+                   int8_t,
                    int64_t,
                    complex64,
                    complex128) {}
diff --git a/paddle/phi/kernels/elementwise_add_kernel.h b/paddle/phi/kernels/elementwise_add_kernel.h
@@ -35,4 +35,14 @@ DenseTensor Add(const Context& dev_ctx,
   return dense_out;
 }
 
+// Overload writing into a caller-provided tensor: runs ElementwiseInferMeta
+// on a MetaTensor wrapping `dense_out`, then AddKernel on the same tensor.
+template <typename T, typename Context>
+void Add(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y,
+         DenseTensor* dense_out) {
+  MetaTensor out_meta(dense_out);
+  ElementwiseInferMeta(x, y, &out_meta);
+  AddKernel<T, Context>(dev_ctx, x, y, dense_out);
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu
@@ -281,6 +281,9 @@ PD_REGISTER_KERNEL(add,
                    double,
                    int16_t,
                    int,
+                   bool,
+                   uint8_t,
+                   int8_t,
                    int64_t,
                    phi::dtype::float16,
                    phi::dtype::bfloat16,
@@ -295,6 +298,9 @@ PD_REGISTER_KERNEL(grad_add,
                    double,
                    int16_t,
                    int,
+                   bool,
+                   uint8_t,
+                   int8_t,
                    int64_t,
                    phi::dtype::float16,
                    phi::dtype::bfloat16,

diff --git a/paddle/phi/kernels/legacy/cpu/elementwise_add_kernel.cc b/paddle/phi/kernels/legacy/cpu/elementwise_add_kernel.cc
@@ -41,6 +41,9 @@ PD_REGISTER_KERNEL(add_raw,
                    double,
                    int16_t,
                    int,
+                   bool,
+                   uint8_t,
+                   int8_t,
                    int64_t,
                    complex64,
                    complex128) {}
diff --git a/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu b/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu
@@ -65,6 +65,9 @@ PD_REGISTER_KERNEL(add_raw,
                    double,
                    int16_t,
                    int,
+                   bool,
+                   uint8_t,
+                   int8_t,
                    int64_t,
                    float16,
                    bfloat16,

diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt
@@ -120,6 +120,9 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
   py_test_modules(test_reshard_p_to_r MODULES test_reshard_p_to_r)
   set_tests_properties(test_reshard_p_to_r
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
+  py_test_modules(test_reshard_x_to_r MODULES test_reshard_x_to_r)
+  set_tests_properties(test_reshard_x_to_r
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
   py_test_modules(test_reshard_nd_mesh MODULES test_reshard_nd_mesh)
   set_tests_properties(test_reshard_nd_mesh
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)