PaddlePaddle · jeff41404 · Dec 30, 2021 · Dec 24, 2021 · Dec 24, 2021 · Dec 27, 2021
diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc
@@ -0,0 +1,155 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/mode_op.h"
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Forward `mode` operator: validates inputs/attributes and infers the shapes
+// of the `Out` and `Indices` outputs.
+class ModeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Out/Indices get X's shape with dimension `axis` dropped (or kept with
+  // extent 1 when `keepdim` is true); a negative `axis` counts from the end.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode");
+    OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode");
+    const auto input_dims = ctx->GetInputDim("X");
+    const int dim_size = input_dims.size();
+    // Rank check goes first: a 0-d input makes the axis range below empty.
+    PADDLE_ENFORCE_GE(input_dims.size(), 1,
+                      paddle::platform::errors::InvalidArgument(
+                          "input of ModeOp must have >= 1d shape"));
+    int axis = ctx->Attrs().Get<int>("axis");
+    PADDLE_ENFORCE_EQ(
+        (axis < dim_size) && (axis >= (-1 * dim_size)), true,
+        paddle::platform::errors::InvalidArgument(
+            "the axis of ModeOp must be [-%d, %d), but you set axis is %d",
+            dim_size, dim_size, axis));
+    if (axis < 0) axis += dim_size;
+    const bool keepdim = ctx->Attrs().Get<bool>("keepdim");
+    std::vector<int64_t> dimvec;
+    dimvec.reserve(dim_size);
+    for (int i = 0; i < dim_size; ++i) {
+      if (i != axis) {
+        dimvec.emplace_back(input_dims[i]);
+      } else if (keepdim) {
+        dimvec.emplace_back(static_cast<int64_t>(1));
+      }
+    }
+    const framework::DDim dims = framework::make_ddim(dimvec);
+    ctx->SetOutputDim("Out", dims);
+    ctx->SetOutputDim("Indices", dims);
+    ctx->ShareLoD("X", "Out");
+    ctx->ShareLoD("X", "Indices");
+  }
+
+ protected:
+  // Kernel key: X's data type, the current device, any layout, plain library.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context(),
+        framework::DataLayout::kAnyLayout, framework::LibraryType::kPlain);
+  }
+};
+
+// Declares the input, outputs, attributes and comment of the `mode` operator.
+class ModeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) The input of Mode op");
+    AddOutput("Out", "(Tensor) The mode values of the input along axis");
+    AddOutput("Indices", "(Tensor) The indices of the mode values in input");
+    AddAttr<int>("axis", "the axis to calculate mode values. "
+                 "if not set, will calculate on last axis.")
+        .SetDefault(-1);
+    AddAttr<bool>("keepdim", "Keep the dim that to reduce.").SetDefault(false);
+    AddComment(R"DOC(
+This operator finds the mode of input Tensor. And outputs their values and indices as vectors. 
+)DOC");
+  }
+};
+
+// Backward op of `mode`: checks its inputs and shapes X's gradient like X.
+class ModeOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(X) should be not null"));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Indices"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Indices) should be not null"));
+    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
+                      platform::errors::InvalidArgument(
+                          "Grad Input(Out) should be not null"));
+    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
+                      platform::errors::InvalidArgument(
+                          "Grad Output(X) should be not null"));
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  // The kernel's data type is taken from the gradient of Out.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto grad_dtype = OperatorWithKernel::IndicateVarDataType(
+        ctx, framework::GradVarName("Out"));
+    return framework::OpKernelType(grad_dtype, ctx.device_context());
+  }
+};
+
+// Configures `mode_grad`: inputs X, Indices, Out's grad; output X's grad.
+template <typename T>
+class ModeGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+ protected:
+  void Apply(GradOpPtr<T> grad_op) const override {
+    grad_op->SetType("mode_grad");
+    grad_op->SetAttrMap(this->Attrs());
+    grad_op->SetInput("X", this->Input("X"));
+    grad_op->SetInput("Indices", this->Output("Indices"));
+    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Register the `mode` / `mode_grad` operators and their CPU kernels.
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker,
+                  ops::ModeGradOpMaker<paddle::framework::OpDesc>,
+                  ops::ModeGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(mode, ops::ModeCPUKernel<plat::CPUPlace, float>,
+                       ops::ModeCPUKernel<plat::CPUPlace, double>,
+                       ops::ModeCPUKernel<plat::CPUPlace, int32_t>,
+                       ops::ModeCPUKernel<plat::CPUPlace, int64_t>);
+REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    mode_grad, ops::ModeGradCPUKernel<plat::CPUPlace, float>,
+    ops::ModeGradCPUKernel<plat::CPUPlace, double>,
+    ops::ModeGradCPUKernel<plat::CPUPlace, int32_t>,
+    ops::ModeGradCPUKernel<plat::CPUPlace, int64_t>);
diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu
@@ -0,0 +1,233 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/functional.h>
+#include <thrust/inner_product.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/mode_op.h"
+#include "paddle/fluid/operators/top_k_function_cuda.h"
+#include "paddle/fluid/operators/top_k_v2_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Returns the CUDA block size used for `col` elements: the smallest value in
+// {64, 128, 256, 512} that is >= col, or 1024 once col exceeds 512.
+// A non-positive `col` falls into the 64 bucket as well.
+int ComputeBlockSize(int col) {
+  // Thresholds are tested in ascending order, so the first hit is the
+  // tightest bucket.
+  if (col <= 64) return 64;
+  if (col <= 128) return 128;
+  if (col <= 256) return 256;
+  if (col <= 512) return 512;
+  return 1024;
+}
+
+// For every row i of the [num_rows, num_cols] input, stores the most frequent
+// value in out_tensor[i] and one of its column indices in indices_tensor[i].
+template <typename T>
+void getModebySort(const platform::CUDADeviceContext& ctx,
+                   const framework::Tensor* input_tensor,
+                   const int64_t num_cols, const int64_t num_rows,
+                   T* out_tensor, int64_t* indices_tensor) {
+  if (num_rows <= 0 || num_cols <= 0) return;  // no row has a mode
+  framework::Tensor input_tmp;  // rows are sorted in place, so use a copy
+  framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp);
+  T* input_tmp_data = input_tmp.mutable_data<T>(ctx.GetPlace());
+  thrust::device_ptr<T> out_tensor_ptr(out_tensor);
+  thrust::device_ptr<int64_t> indices_tensor_ptr(indices_tensor);
+  thrust::device_vector<int64_t> indices_data(num_cols);  // per-row scratch,
+  thrust::device_vector<T> keys_data(num_cols);           // allocated once
+  thrust::device_vector<int64_t> cnts_data(num_cols);     // (<= num_cols keys)
+  for (int64_t i = 0; i < num_rows; ++i) {
+    T* begin = input_tmp_data + num_cols * i;
+    T* end = begin + num_cols;
+    thrust::sequence(thrust::device, indices_data.begin(), indices_data.end());
+    thrust::sort_by_key(thrust::device, begin, end, indices_data.begin());
+    int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1,
+                                           begin + 1, 0, thrust::plus<int>(),
+                                           thrust::not_equal_to<T>());
+    thrust::reduce_by_key(thrust::device, begin, end,
+                          thrust::constant_iterator<int>(1), keys_data.begin(),
+                          cnts_data.begin());
+    auto it = thrust::max_element(thrust::device, cnts_data.begin(),
+                                  cnts_data.begin() + unique);
+    const T mode = keys_data[it - cnts_data.begin()];
+    const int64_t counts = cnts_data[it - cnts_data.begin()];
+    auto pos = thrust::find(thrust::device, begin, end, mode);  // run start
+    const int64_t index = indices_data[pos - begin + counts - 1];
+    out_tensor_ptr[i] = mode;
+    indices_tensor_ptr[i] = index;
+  }
+}
+
+// CUDA forward kernel of `mode`. If `axis` is the last dimension, X is
+// handed to getModebySort directly. Otherwise `axis` is swapped with the
+// last dimension, the per-row results are computed on the transposed copy,
+// and they are transposed back into Out / Indices.
+template <typename DeviceContext, typename T>
+class ModeOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::InvalidArgument(
+            "It must use CUDAPlace, you must check your device set."));
+    const auto* x = ctx.Input<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* ind = ctx.Output<framework::Tensor>("Indices");
+    const bool keepdim = ctx.Attr<bool>("keepdim");
+
+    const auto& in_dims = x->dims();
+    const int rank = in_dims.size();
+    const int last = rank - 1;
+    // A negative axis counts from the last dimension.
+    int axis = ctx.Attr<int>("axis");
+    if (axis < 0) axis += rank;
+    // Shape of Out on entry; put back after the temporary reshape below.
+    const auto final_dims = out->dims();
+    // The returned pointer is not used here; the call itself is kept as is.
+    static_cast<void>(x->data<T>());
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+    int64_t* ind_data = ind->mutable_data<int64_t>(ctx.GetPlace());
+    const auto& dev_ctx = ctx.cuda_device_context();
+
+    if (axis == last) {
+      // View X as [product of leading dims, last dim] and reduce each row.
+      const int64_t height =
+          framework::product(framework::slice_ddim(in_dims, 0, last));
+      const int64_t width = in_dims[last];
+      getModebySort<T>(dev_ctx, x, width, height, out_data, ind_data);
+      return;
+    }
+
+    // Permutation exchanging `axis` with the last dimension. A swap is its
+    // own inverse, so the same permutation also transposes the results back.
+    std::vector<int> perm;
+    for (int d = 0; d < rank; ++d) perm.emplace_back(d);
+    perm[axis] = last;
+    perm[last] = axis;
+
+    if (!keepdim) {
+      // Temporarily give the outputs X's rank (extent 1 at `axis`) so that the
+      // transposed-back results can be written into them.
+      std::vector<int> kept_shape;
+      for (int d = 0; d < rank; ++d) {
+        kept_shape.emplace_back(d == axis ? 1 : in_dims[d]);
+      }
+      const framework::DDim kept_dims = framework::make_ddim(kept_shape);
+      out->Resize(kept_dims);
+      ind->Resize(kept_dims);
+    }
+
+    // trans_dims: X's shape after the swap. trans_out_dims: the same shape
+    // with the reduced dimension (now the last one) shrunk to 1.
+    framework::DDim trans_dims(in_dims);
+    for (int d = 0; d < rank; ++d) trans_dims[d] = in_dims[perm[d]];
+    framework::DDim trans_out_dims(trans_dims);
+    trans_out_dims[last] = 1;
+
+    // Step 1: transpose the input so the reduced axis becomes the last one.
+    framework::Tensor trans_input;
+    trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
+    TransCompute<platform::CUDADeviceContext, T>(rank, dev_ctx, *x,
+                                                 &trans_input, perm);
+
+    // Step 2: compute the per-row mode and index into transposed buffers.
+    framework::Tensor trans_ind;
+    int64_t* trans_ind_data =
+        trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
+    framework::Tensor trans_out;
+    T* trans_out_data =
+        trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
+    const int64_t height =
+        framework::product(framework::slice_ddim(trans_dims, 0, last));
+    const int64_t width = trans_dims[last];
+    getModebySort<T>(dev_ctx, &trans_input, width, height, trans_out_data,
+                     trans_ind_data);
+
+    // Step 3: transpose the indices and values back into the outputs.
+    TransCompute<platform::CUDADeviceContext, int64_t>(rank, dev_ctx,
+                                                       trans_ind, ind, perm);
+    TransCompute<platform::CUDADeviceContext, T>(rank, dev_ctx, trans_out, out,
+                                                 perm);
+    if (!keepdim) {
+      out->Resize(final_dims);
+      ind->Resize(final_dims);
+    }
+  }
+};
+
+// CUDA backward kernel of `mode`: launches AssignGradWithAxis to fill the
+// gradient of X from the gradient of Out and the Indices input along `axis`.
+template <typename DeviceContext, typename T>
+class ModeOpGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(context.GetPlace()), true,
+        platform::errors::InvalidArgument(
+            "It must use CUDAPlace, you must check your device set."));
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* indices = context.Input<framework::Tensor>("Indices");
+    auto* x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    int axis = context.Attr<int>("axis");
+    const auto& in_dims = x->dims();
+    if (axis < 0) axis += in_dims.size();
+    // Allocate the CUDA memory for x_grad.
+    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
+    const T* out_grad_data = out_grad->data<T>();
+    const int64_t* indices_data = indices->data<int64_t>();
+
+    int pre, n, post;
+    GetDims(in_dims, axis, &pre, &n, &post);
+    // Calculate the block and grid sizes.
+    auto& dev_ctx = context.cuda_device_context();
+    const int block_size = ComputeBlockSize(post);
+    const int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+    const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
+    const int grid_size = std::min(max_blocks, pre);
+    // pre == 0 would give an empty grid, which is an invalid launch config.
+    if (grid_size <= 0) return;
+    AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+        out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Register the CUDA kernels of `mode` and `mode_grad` for the element types
+// float, double, int and int64_t.
+namespace ops = paddle::operators;
+using CUDACtx = paddle::platform::CUDADeviceContext;
+REGISTER_OP_CUDA_KERNEL(mode, ops::ModeOpCUDAKernel<CUDACtx, float>,
+                        ops::ModeOpCUDAKernel<CUDACtx, double>,
+                        ops::ModeOpCUDAKernel<CUDACtx, int>,
+                        ops::ModeOpCUDAKernel<CUDACtx, int64_t>);
+REGISTER_OP_CUDA_KERNEL(mode_grad, ops::ModeOpGradCUDAKernel<CUDACtx, float>,
+                        ops::ModeOpGradCUDAKernel<CUDACtx, double>,
+                        ops::ModeOpGradCUDAKernel<CUDACtx, int>,
+                        ops::ModeOpGradCUDAKernel<CUDACtx, int64_t>);