diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc
index 3c2fe8b9e5d9f..ddfbdbace054d 100644
--- a/paddle/fluid/operators/range_op.cc
+++ b/paddle/fluid/operators/range_op.cc
@@ -14,6 +14,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/range_op.h"
 #include
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"

 namespace paddle {
 namespace operators {

@@ -22,51 +26,6 @@ class RangeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext *ctx) const override {
-    if (ctx->HasInput("Start")) {
-      auto s_dims = ctx->GetInputDim("Start");
-      PADDLE_ENFORCE_EQ(
-          s_dims.size(), 1,
-          platform::errors::InvalidArgument(
-              "The dim of the shape of Input(Start) should be 1, but got %d",
-              s_dims.size()));
-
-      PADDLE_ENFORCE_EQ(s_dims[0], 1,
-                        platform::errors::InvalidArgument(
-                            "The first dim of the shape of Input(Start) should "
-                            "be 1, but got %d",
-                            s_dims[0]));
-    }
-    if (ctx->HasInput("End")) {
-      auto e_dims = ctx->GetInputDim("End");
-      PADDLE_ENFORCE_EQ(
-          e_dims.size(), 1,
-          platform::errors::InvalidArgument(
-              "The dim of the shape of Input(End) should be 1, but got %d",
-              e_dims.size()));
-
-      PADDLE_ENFORCE_EQ(e_dims[0], 1, platform::errors::InvalidArgument(
-                                          "The first dim of the shape of "
-                                          "Input(End) should be 1, but got %d",
-                                          e_dims[0]));
-    }
-    if (ctx->HasInput("Step")) {
-      auto step_dims = ctx->GetInputDim("Step");
-      PADDLE_ENFORCE_EQ(
-          step_dims.size(), 1,
-          platform::errors::InvalidArgument(
-              "The dim of the shape of Input(Step) should be 1, but got %d",
-              step_dims.size()));
-
-      PADDLE_ENFORCE_EQ(step_dims[0], 1,
-                        platform::errors::InvalidArgument(
-                            "The first dim of the shape of Input(Step) should "
-                            "be 1, but got %d",
-                            step_dims[0]));
-    }
-    ctx->SetOutputDim("Out", {-1});
-  }
-
  protected:
   framework::OpKernelType GetKernelTypeForVar(
       const std::string &var_name, const framework::Tensor &tensor,
@@ -101,7 +60,7 @@ class RangeOpMaker : public framework::OpProtoAndCheckerMaker {
 } // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker);
-REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel,
-                       ops::CPURangeKernel, ops::CPURangeKernel,
-                       ops::CPURangeKernel);
+DECLARE_INFER_SHAPE_FUNCTOR(range, RangeInferMetaFunctor,
+                            PD_INFER_META(phi::RangeInferMeta));
+REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker,
+                             RangeInferMetaFunctor);
diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu
deleted file mode 100644
index 1b1d41ae4c5c7..0000000000000
--- a/paddle/fluid/operators/range_op.cu
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/range_op.h"
-#include "paddle/fluid/operators/utils.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-template
-__global__ void RangeKernel(T start, T step, int64_t size, T* out) {
-  CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
-}
-
-template
-class CUDARangeKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* start_t = context.Input("Start");
-    auto* end_t = context.Input("End");
-    auto* step_t = context.Input("Step");
-    auto* out = context.Output("Out");
-
-    T start = GetValue(start_t);
-    T end = GetValue(end_t);
-    T step = GetValue(step_t);
-
-    int64_t size = 0;
-    GetSize(start, end, step, &size);
-    out->Resize(phi::make_ddim({size}));
-    T* out_data = out->mutable_data(context.GetPlace());
-
-    auto stream = context.cuda_device_context().stream();
-    int block = std::min(size, static_cast(256));
-    int grid = (size + block - 1) / block;
-    RangeKernel<<>>(start, step, size, out_data);
-  }
-};
-
-} // namespace operators
-} // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel,
-                        ops::CUDARangeKernel,
-                        ops::CUDARangeKernel,
-                        ops::CUDARangeKernel);
diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc
index c7e91ba35dee1..ac32170d93957 100644
--- a/paddle/fluid/operators/range_op_npu_test.cc
+++ b/paddle/fluid/operators/range_op_npu_test.cc
@@ -30,7 +30,7 @@ limitations under the License. */
 namespace f = paddle::framework;
 namespace p = paddle::platform;

-USE_OP(range);
+USE_OP_ITSELF(range);
 USE_OP_DEVICE_KERNEL(range, NPU);

 template
diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc
index af03ed668e8c8..a9fa78c4e4943 100644
--- a/paddle/fluid/operators/stack_op.cc
+++ b/paddle/fluid/operators/stack_op.cc
@@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/operators/stack_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace plat = paddle::platform; namespace ops = paddle::operators; @@ -26,52 +29,6 @@ class StackOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_GT(ctx->Inputs("X").size(), 0, - platform::errors::InvalidArgument( - "Number of Inputs(X) must be larger than 0, but" - " received value is:%d.", - ctx->Inputs("X").size())); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true, - platform::errors::InvalidArgument( - "Output(Y) of stack_op should not be null.")); - - auto input_dims = ctx->GetInputsDim("X"); - for (size_t i = 1; i < input_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0], - platform::errors::InvalidArgument( - "Dims of all Inputs(X) must be the same, but" - " received input %d dim is:%d not equal to input 0" - " dim:%d.", - i, input_dims[i], input_dims[0])); - } - - // Only lod of X[0] would be shared with Y - ctx->ShareLoD("X", /*->*/ "Y"); - - int axis = ctx->Attrs().Get("axis"); - int rank = input_dims[0].size(); - PADDLE_ENFORCE_GE( - axis, -(rank + 1), - platform::errors::InvalidArgument( - "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, " - "but received axis is:%d.", - rank, axis)); - - PADDLE_ENFORCE_LT( - axis, rank + 1, - platform::errors::InvalidArgument( - "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, " - "but received axis is:%d", - rank, axis)); - - if (axis < 0) axis += (rank + 1); - - auto vec = phi::vectorize(input_dims[0]); - vec.insert(vec.begin() + axis, input_dims.size()); - ctx->SetOutputDim("Y", phi::make_ddim(vec)); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = @@ -168,21 +125,10 @@ class StackGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(stack, StackInferMetaFunctor, + PD_INFER_META(phi::StackInferMeta)); REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, ops::StackGradOpMaker, - ops::StackGradOpMaker); + ops::StackGradOpMaker, + StackInferMetaFunctor); REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); - -REGISTER_OP_CPU_KERNEL( - stack, ops::StackKernel, - ops::StackKernel, - ops::StackKernel, - ops::StackKernel, - ops::StackKernel); - -REGISTER_OP_CPU_KERNEL( - stack_grad, ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel); diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu deleted file mode 100644 index a56dd6aef4f66..0000000000000 --- a/paddle/fluid/operators/stack_op.cu +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "paddle/fluid/operators/stack_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -namespace plat = paddle::platform; -namespace ops = paddle::operators; - -namespace paddle { -namespace operators { - -template -__global__ void StackCUDAKernel(T** input_ptrs, int split_size, int rows, - int cols, T* __restrict__ output) { - IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x; - - for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) { - IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y; - - IntType split = grid_x / split_size; - const T* input_ptr = input_ptrs[split]; - IntType col_offset = grid_x % split_size; -#pragma unroll - for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) { - output[grid_y * cols + grid_x] = - input_ptr[grid_y * split_size + col_offset]; - } - } -} - -template -class StackGPUKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.MultiInput("X"); - auto* y = ctx.Output("Y"); - - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - - int n = static_cast(x.size()); - auto* y_data = y->mutable_data(ctx.GetPlace()); - std::vector x_datas(n); - for (int i = 0; i < n; i++) { - x_datas[i] = x[i]->data(); - } - - auto& dev_ctx = ctx.template device_context(); - auto tmp_x_data = memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*)); - memory::Copy(dev_ctx.GetPlace(), tmp_x_data->ptr(), platform::CPUPlace(), - reinterpret_cast(x_datas.data()), - x_datas.size() * sizeof(T*), dev_ctx.stream()); - - // Split x dim from axis to matrix - int x_row = 1, x_col = 1; - for (int i = 0; i < axis; ++i) { - x_row *= x[0]->dims()[i]; - } - x_col = x[0]->numel() / x_row; - int out_col = x_col * n; - - auto config = GetGpuLaunchConfig2D(dev_ctx, out_col, x_row); - - if (y->numel() < std::numeric_limits::max()) { - StackCUDAKernel<<>>( - reinterpret_cast(tmp_x_data->ptr()), x_col, x_row, out_col, - y_data); - } else { - StackCUDAKernel<<>>( - reinterpret_cast(tmp_x_data->ptr()), x_col, x_row, out_col, - y_data); - } - } -}; - -template -__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input, - int pre_dim_size, int split_dim_size, - int suf_dim_size, int num_split, - T** output_ptrs) { - assert(blockDim.y == 1); - assert(blockDim.z == 1); - // In this case they are equal - assert(split_dim_size % num_split == 0); - - IntType size = pre_dim_size * split_dim_size * suf_dim_size; - IntType each_dim_size = split_dim_size / num_split; - - for (IntType offset = blockIdx.x * blockDim.x + threadIdx.x; offset < size; - offset += blockDim.x * gridDim.x) { - IntType i = offset / (split_dim_size * suf_dim_size); - IntType j = (offset % (split_dim_size * suf_dim_size)) / suf_dim_size; - IntType k = offset % suf_dim_size; - - T* output = output_ptrs[j / each_dim_size]; - if (output == nullptr) { - return; - } - IntType output_ind = i * each_dim_size * suf_dim_size + - (j % each_dim_size) * suf_dim_size + k; - *(output + output_ind) = input[offset]; - } -} - -template -class StackGradGPUKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dy = ctx.Input(framework::GradVarName("Y")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - int axis = 
ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - - int n = dy->dims()[axis]; - PADDLE_ENFORCE_EQ(n, dx.size(), - platform::errors::InvalidArgument( - "Output dx size should be equal to n, but" - " received n is:%d dx size is:%d.", - n, dx.size())); - - // dx is output, so save each data address, then copy each dy into dx_data - std::vector outputs(n); - auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); - for (size_t j = 0; j < dx.size(); ++j) { - if (dx[j] == nullptr) { - outputs[j] = nullptr; - } - if (out_var_names[j] != framework::kEmptyVarName && - dx[j]->numel() != 0UL) { - T* ptr = dx[j]->mutable_data(ctx.GetPlace()); - outputs[j] = ptr; - } else { - outputs[j] = nullptr; - } - } - auto dy_data = dy->data(); - // each dx should have same shape - int dy_pre = 1, dy_suf = 1; - auto dy_dims = dy->dims(); - int split_dim = n; - for (int i = 0; i < axis; ++i) { - dy_pre *= dy_dims[i]; - } - dy_suf = dy->numel() / (split_dim * dy_pre); - - auto& dev_ctx = ctx.template device_context(); - auto tmp_out_data = memory::Alloc(dev_ctx, outputs.size() * sizeof(T*)); - memory::Copy(dev_ctx.GetPlace(), tmp_out_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs.data()), - outputs.size() * sizeof(T*), dev_ctx.stream()); - - auto config = GetGpuLaunchConfig1D(dev_ctx, dy_pre * split_dim * dy_suf); - - if (dy->numel() < std::numeric_limits::max()) { - UnStackHelperCUDAKernel< - T, int32_t><<>>( - dy_data, dy_pre, split_dim, dy_suf, split_dim, - reinterpret_cast(tmp_out_data->ptr())); - } else { - UnStackHelperCUDAKernel< - T, int64_t><<>>( - dy_data, dy_pre, split_dim, dy_suf, split_dim, - reinterpret_cast(tmp_out_data->ptr())); - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(stack, ops::StackGPUKernel, - ops::StackGPUKernel, ops::StackGPUKernel, - ops::StackGPUKernel, - ops::StackGPUKernel, - ops::StackGPUKernel); - -REGISTER_OP_CUDA_KERNEL(stack_grad, ops::StackGradGPUKernel, - ops::StackGradGPUKernel, - ops::StackGradGPUKernel, - ops::StackGradGPUKernel, - ops::StackGradGPUKernel, - ops::StackGradGPUKernel); diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h deleted file mode 100644 index 03d5324528930..0000000000000 --- a/paddle/fluid/operators/stack_op.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct StackGradFunctor { - HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) - : dx_(dx), dy_(dy), n_(n), post_(post) {} - - HOSTDEVICE void operator()(int idx) { - int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; - if (dx_[which_x] != nullptr) dx_[which_x][x_index] = dy_[idx]; - } - - private: - VecDxType dx_; - const T *dy_; - int n_; - int post_; -}; - -template -static inline void StackGradFunctorForRange(const DeviceContext &ctx, - const VecDxType &dx, const T *dy, - int total_num, int n, int post) { - platform::ForRange for_range(ctx, total_num); - for_range(StackGradFunctor(dx, dy, n, post)); -} - -template -class StackKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto x = ctx.MultiInput("X"); - auto *y = ctx.Output("Y"); - - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - - int n = static_cast(x.size()); - auto *y_data = y->mutable_data(ctx.GetPlace()); - std::vector x_datas(n); - for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); - - int pre = 1, post = 1; - auto &dim = x[0]->dims(); - for (auto i = 0; i < axis; ++i) pre *= dim[i]; - for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; - - auto x_data_arr = x_datas.data(); - - size_t x_offset = 0; - size_t y_offset = 0; - for (int i = 0; i < pre; i++) { - for (int j = 0; j < n; j++) { - std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset, - post * sizeof(T)); - y_offset += post; - } - x_offset += post; - } - } -}; - -template -class StackGradKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *dy = ctx.Input(framework::GradVarName("Y")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - int n = dy->dims()[axis]; - std::vector dx_datas(n); // NOLINT - - for (int i = 0; i < n; i++) { - if (dx[i] == nullptr) { - dx_datas[i] = nullptr; - } else { - dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); - } - } - auto dy_data = dy->data(); - int pre = 1; - for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; - int total_num = dy->numel(); - int post = total_num / (n * pre); - auto &dev_ctx = ctx.template device_context(); - auto dx_data_arr = dx_datas.data(); - StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc index 3a6e5b2aca4b8..9d4ef0ffa20e2 100644 --- a/paddle/fluid/operators/stack_op_npu.cc +++ b/paddle/fluid/operators/stack_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/stack_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc index c5a20ed4d1c89..baaa2b4884ce3 100644 --- a/paddle/fluid/operators/stack_op_xpu.cc +++ b/paddle/fluid/operators/stack_op_xpu.cc @@ -13,9 +13,9 @@ // limitations under the License. #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/stack_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc index 9044a881b738d..5c103e088b559 100644 --- a/paddle/fluid/operators/unique_op.cc +++ b/paddle/fluid/operators/unique_op.cc @@ -13,7 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unique_op.h" -#include "paddle/fluid/framework/op_version_registry.h" +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,62 +29,54 @@ class UniqueOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique"); - auto in_dims = ctx->GetInputDim("X"); - if (!ctx->Attrs().Get("is_sorted")) { - OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); - PADDLE_ENFORCE_EQ(in_dims.size(), 1, - platform::errors::InvalidArgument( - "The Input(X) should be 1-D Tensor, " - "But now the dims of Input(X) is %d.", - in_dims.size())); - - ctx->SetOutputDim("Out", {-1}); - ctx->SetOutputDim("Index", in_dims); - return; - } bool return_index = ctx->Attrs().Get("return_index"); bool return_inverse = ctx->Attrs().Get("return_inverse"); bool return_counts = ctx->Attrs().Get("return_counts"); auto axis_vec = ctx->Attrs().Get>("axis"); + auto data_type = + static_cast(static_cast( + ctx->Attrs().Get("dtype"))); + + // Construct MetaTensor for InferMeta Func + using CompatMetaTensor = framework::CompatMetaTensor; + CompatMetaTensor x(ctx->GetInputVarPtrs("X")[0], ctx->IsRuntime()); + CompatMetaTensor out(ctx->GetOutputVarPtrs("Out")[0], ctx->IsRuntime()); + std::unique_ptr indices(nullptr); + std::unique_ptr index(nullptr); + std::unique_ptr counts(nullptr); if (return_index) { OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique"); + indices = + std::move(std::unique_ptr(new CompatMetaTensor( + ctx->GetOutputVarPtrs("Indices")[0], ctx->IsRuntime()))); } if (return_inverse) { OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); + index = std::move(std::unique_ptr(new CompatMetaTensor( + ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime()))); } if (return_counts) { OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique"); + counts = std::move(std::unique_ptr(new CompatMetaTensor( + ctx->GetOutputVarPtrs("Counts")[0], ctx->IsRuntime()))); } - - if (axis_vec.empty()) { - ctx->SetOutputDim("Out", {-1}); - if (return_inverse) { - ctx->SetOutputDim("Index", {phi::product(in_dims)}); - } + bool is_sorted = ctx->Attrs().Get("is_sorted"); + if (is_sorted) { 
+ phi::UniqueInferMeta(x, return_index, return_inverse, return_counts, + axis_vec, data_type, &out, indices.get(), + index.get(), counts.get()); } else { - int axis = axis_vec[0]; - if (axis < 0) { - axis += in_dims.size(); - } - PADDLE_ENFORCE_LT( - axis, in_dims.size(), - platform::errors::InvalidArgument("The axis(%d) should be less than " - "the dimension size(%d) of x.", - axis, in_dims.size())); - auto out_dims = in_dims; - out_dims[axis] = -1; - ctx->SetOutputDim("Out", out_dims); - if (return_inverse) { - ctx->SetOutputDim("Index", {in_dims[axis]}); + OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); + if (index == nullptr) { + index = + std::move(std::unique_ptr(new CompatMetaTensor( + ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime()))); } - } - if (return_index) { - ctx->SetOutputDim("Indices", {-1}); - } - if (return_counts) { - ctx->SetOutputDim("Counts", {-1}); + phi::UniqueRawInferMeta(x, return_index, return_inverse, return_counts, + axis_vec, data_type, is_sorted, &out, + indices.get(), index.get(), counts.get()); } } @@ -152,40 +148,5 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker); -REGISTER_OP_CPU_KERNEL( - unique, ops::UniqueKernel, - ops::UniqueKernel, - ops::UniqueKernel, - ops::UniqueKernel); -REGISTER_OP_VERSION(unique) - .AddCheckpoint( - R"ROC( - Upgrade unique, add 2 outputs [Indices, Counts] and 5 attribute - [return_index, return_inverse, return_counts, axis, is_sorted]. - )ROC", - paddle::framework::compatible::OpVersionDesc() - .NewOutput("Indices", - "The indices of the input tensor that result in the " - "unique tensor.") - .NewOutput("Counts", "The counts for each unique element.") - .NewAttr("return_index", - "If True, also return the indices of the input" - " tensor that result in the unique Tensor.", - false) - .NewAttr("return_inverse", - "If True, also return the indices for where elements" - " in the original input ended up in the returned unique " - "tensor.", - false) - .NewAttr("return_counts", - "If True, also return the counts for each unique element.", - false) - .NewAttr("axis", - "The axis to apply unique. If None, the input will be " - "flattened.", - std::vector{}) - .NewAttr("is_sorted", - "If True, the unique elements of X are in ascending order." - "Otherwise, the unique elements are not sorted.", - false)); diff --git a/paddle/fluid/operators/unique_op.cu b/paddle/fluid/operators/unique_op.cu deleted file mode 100644 index 871274c999c8b..0000000000000 --- a/paddle/fluid/operators/unique_op.cu +++ /dev/null @@ -1,474 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() -#include "paddle/fluid/operators/unique_op.h" // TransComute() - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// Binary function 'less than' -template -struct LessThan { - int col; - const InT* in_trans_data; - - LessThan(int64_t _col, const InT* _in_trans_data) - : col(_col), in_trans_data(_in_trans_data) {} - - __device__ bool operator()(int64_t a, int64_t b) const { - for (int i = 0; i < col; ++i) { - InT lhs = in_trans_data[i + a * col]; - InT rhs = in_trans_data[i + b * col]; - if (lhs < rhs) { - return true; - } else if (lhs > rhs) { - return false; - } - } - return false; - } -}; - -// Binary function 'equal_to' -template -struct BinaryEqual { - int64_t col; - const InT* in_trans_data; - - BinaryEqual(int64_t _col, const InT* _in_trans_data) - : col(_col), in_trans_data(_in_trans_data) {} - - __device__ bool operator()(int64_t a, int64_t b) const { - for (int64_t i = 0; i < col; ++i) { - InT lhs = in_trans_data[i + a * col]; - InT rhs = in_trans_data[i + b * col]; - if (lhs != rhs) { - return false; - } - } - return true; - } -}; - -// Binary function 'not_equal_to' -template -struct BinaryNotEqual { - int64_t col; - const InT* in_trans_data; - - BinaryNotEqual(int64_t _col, const InT* _in_trans_data) - : col(_col), in_trans_data(_in_trans_data) {} - - __device__ bool operator()(int64_t a, int64_t b) const { - for (int64_t i = 0; i < col; ++i) { - InT lhs = in_trans_data[i + a * col]; - InT rhs = in_trans_data[i + b * col]; - if (lhs != rhs) { - return true; - } - } - return false; - } -}; - -// index_select() function for Tensor -template -void IndexSelect(const framework::ExecutionContext& context, - const Tensor& input, const Tensor& index, Tensor* output, - int dim) { - auto input_dim = input.dims(); - auto input_dim_size = input_dim.size(); - auto output_dim = output->dims(); - - auto slice_size = 1; - for (auto i = dim + 1; i < input_dim_size; i++) { - slice_size *= input_dim[i]; - } - - auto input_width = slice_size * input_dim[dim]; - auto output_width = slice_size * output_dim[dim]; - - auto outer_nums = 1; - for (auto i = 0; i < dim; i++) { - outer_nums *= input_dim[i]; - } - - auto index_size = index.dims()[0]; - - std::vector input_vec; - std::vector index_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &input_vec); - paddle::framework::TensorToVector(index, context.device_context(), - &index_vec); - std::vector out_vec(output->numel()); - - for (int i = 0; i < index_size; i++) { - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_select) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - input_dim[dim], index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], input_dim[dim], - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_select) " - "expected >= 0 and < %ld, but got %ld. 
Please check input " - "value.", - input_dim[dim], index_vec[i])); - } - - for (auto i = 0; i < outer_nums; i++) { - auto input_start_offset = i * input_width; - auto output_start_offset = i * output_width; - - for (auto j = 0; j < index_size; j++) { - IndexT index_value = index_vec[j]; - for (auto k = 0; k < slice_size; k++) { - out_vec[output_start_offset + j * slice_size + k] = - input_vec[input_start_offset + index_value * slice_size + k]; - } - } - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(output_dim); -} - -// The core logic of computing Unique for a flattend Tensor -template -static void UniqueFlattendCUDATensor(const framework::ExecutionContext& context, - const Tensor& in, Tensor* out, - bool return_index, bool return_inverse, - bool return_counts, equal_T equal, - not_equal_T not_equal, int64_t num_input) { - // 0. Prepration - Tensor in_hat; - framework::TensorCopy(in, context.GetPlace(), &in_hat); - auto in_data_hat = in_hat.mutable_data(context.GetPlace()); - - Tensor* sorted_indices = context.Output("Indices"); - sorted_indices->Resize(phi::make_ddim({num_input})); - auto sorted_indices_data = - sorted_indices->mutable_data(context.GetPlace()); - thrust::sequence(thrust::device, sorted_indices_data, - sorted_indices_data + num_input); - thrust::sort_by_key(thrust::device, in_data_hat, in_data_hat + num_input, - sorted_indices_data); - - // 1. Calculate op result: 'out' - Tensor range; - range.Resize(phi::make_ddim({num_input + 1})); - auto range_data_ptr = range.mutable_data(context.GetPlace()); - thrust::sequence(thrust::device, range_data_ptr, - range_data_ptr + num_input + 1); - framework::TensorCopy(in_hat, context.GetPlace(), out); - int num_out; - auto out_data = out->mutable_data(context.GetPlace()); - num_out = thrust::unique_by_key(thrust::device, out_data, - out_data + num_input, range_data_ptr, equal) - .first - - out_data; - out->Resize(phi::make_ddim({num_out})); - - // 3. Calculate inverse index: 'inverse' - if (return_inverse) { - Tensor* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({num_input})); - auto inverse_data = inverse->mutable_data(context.GetPlace()); - Tensor inv_loc; - inv_loc.Resize(phi::make_ddim({num_input})); - auto inv_loc_data_ptr = inv_loc.mutable_data(context.GetPlace()); - thrust::adjacent_difference(thrust::device, in_data_hat, - in_data_hat + num_input, inv_loc_data_ptr, - not_equal); - thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); - inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault - thrust::inclusive_scan(thrust::device, inv_loc_data_ptr, - inv_loc_data_ptr + num_input, inv_loc_data_ptr); - thrust::scatter(thrust::device, inv_loc_data_ptr, - inv_loc_data_ptr + num_input, sorted_indices_data, - inverse_data); - } - - // 2. Calculate sorted index: 'sorted_indices' - if (return_index) { - Tensor indices; - indices.Resize(phi::make_ddim({num_input})); - auto indices_data_ptr = indices.mutable_data(context.GetPlace()); - thrust::copy(thrust::device, in_data_hat, in_data_hat + num_input, - indices_data_ptr); - thrust::unique_by_key(thrust::device, indices_data_ptr, - indices_data_ptr + num_input, sorted_indices_data, - equal); - sorted_indices->Resize(phi::make_ddim({num_out})); - } - - // 4. 
Calculate 'counts' - if (return_counts) { - Tensor* counts = context.Output("Counts"); - counts->Resize(phi::make_ddim({num_out})); - auto count_data = counts->mutable_data(context.GetPlace()); - // init 'count_data' as 0 - thrust::fill(thrust::device, count_data, count_data + num_out, 0); - thrust::device_ptr range_data_ptr_dev(range_data_ptr); - range_data_ptr_dev[num_out] = num_input; - thrust::adjacent_difference(thrust::device, range_data_ptr + 1, - range_data_ptr + num_out + 1, count_data); - } -} - -// The logic of compute unique with axis required, it's a little different -// from above function -template -static void ComputeUniqueDims(const framework::ExecutionContext& context, - Tensor* sorted_indices, - IndexT* sorted_indices_data, Tensor* out, - bool return_index, bool return_inverse, - bool return_counts, equal_T equal, - not_equal_T not_equal, int64_t row) { - // 1. inverse indices: 'inverse' - Tensor* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({row})); - auto inverse_data = inverse->mutable_data(context.GetPlace()); - Tensor inv_loc; - inv_loc.Resize(phi::make_ddim({row})); - auto inv_loc_data_ptr = inv_loc.mutable_data(context.GetPlace()); - thrust::adjacent_difference(thrust::device, sorted_indices_data, - sorted_indices_data + row, inv_loc_data_ptr, - not_equal); - thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); - inv_loc_data_dev[0] = 0; - thrust::inclusive_scan(thrust::device, inv_loc_data_ptr, - inv_loc_data_ptr + row, inv_loc_data_ptr); - thrust::scatter(thrust::device, inv_loc_data_ptr, inv_loc_data_ptr + row, - sorted_indices_data, inverse_data); - - // 2. sorted indices - Tensor range; - range.Resize(phi::make_ddim({row + 1})); - auto range_data_ptr = range.mutable_data(context.GetPlace()); - thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); - int num_out; - num_out = - thrust::unique_by_key(thrust::device, sorted_indices_data, - sorted_indices_data + row, range_data_ptr, equal) - .first - - sorted_indices_data; - thrust::device_ptr range_data_ptr_dev(range_data_ptr); - range_data_ptr_dev[num_out] = row; - sorted_indices->Resize(phi::make_ddim({num_out})); - - // 3. counts: 'counts' - Tensor* counts = context.Output("Counts"); - counts->Resize(phi::make_ddim({num_out})); - auto count_data = counts->mutable_data(context.GetPlace()); - thrust::fill(thrust::device, count_data, count_data + row, 0); - thrust::adjacent_difference(thrust::device, range_data_ptr + 1, - range_data_ptr + row + 1, count_data); -} - -// Calculate unique when 'axis' is set -template -static void UniqueDimsCUDATensor(const framework::ExecutionContext& context, - const Tensor& in, Tensor* out, - bool return_index, bool return_inverse, - bool return_counts, int axis) { - // 1. Transpose & reshape - // Transpose tensor: eg. 
axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] - std::vector permute(in.dims().size()); - std::iota(permute.begin(), permute.end(), 0); - permute[axis] = 0; - permute[0] = axis; - std::vector in_trans_dims_vec(phi::vectorize(in.dims())); - in_trans_dims_vec[axis] = in.dims()[0]; - in_trans_dims_vec[0] = in.dims()[axis]; - framework::Tensor in_trans; - framework::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); - in_trans.Resize(in_trans_dims); - in_trans.mutable_data(context.GetPlace()); - auto& dev_ctx = context.cuda_device_context(); - TransCompute(in.dims().size(), // num of dims - dev_ctx, // device - in, // original Tensor - &in_trans, // Tensor after reshape - permute); // index of axis - - // Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] - framework::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); - in_trans.Resize(in_trans_flat_dims); - - // now 'in_trans' is 2D - int64_t col = in_trans.dims()[1]; - int64_t row = in_trans.dims()[0]; - const InT* in_trans_data = in_trans.data(); - - Tensor* sorted_indices = context.Output("Indices"); - sorted_indices->Resize(phi::make_ddim({row})); - auto sorted_indices_data = - sorted_indices->mutable_data(context.GetPlace()); - - // 2. Calculate 'sorted_indices', 'inverse', 'counts' - // Init index and sort - thrust::sequence(thrust::device, sorted_indices_data, - sorted_indices_data + row); - thrust::sort(thrust::device, sorted_indices_data, sorted_indices_data + row, - LessThan(col, in_trans_data)); - ComputeUniqueDims( - context, sorted_indices, sorted_indices_data, out, return_index, - return_inverse, return_counts, BinaryEqual(col, in_trans_data), - BinaryNotEqual(col, in_trans_data), row); - - // 3. Select indices and reshape back to get 'out' - Tensor out_trans; - std::vector out_trans_dims_vec = in_trans_dims_vec; - out_trans_dims_vec[0] = sorted_indices->numel(); - out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); - out_trans.mutable_data(context.GetPlace()); - - IndexSelect(context, in_trans, *sorted_indices, &out_trans, 0); - - std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); - out->Resize(phi::make_ddim(out_trans_dims_vec)); - out->mutable_data(context.GetPlace()); - std::vector out_trans_unbind = Unbind(out_trans); - math::ConcatFunctor concat_functor; - concat_functor(dev_ctx, out_trans_unbind, 0, &out_trans); - TransCompute(out_trans.dims().size(), dev_ctx, out_trans, - out, permute); -} - -// functor for processing a flattend Tensor -template -struct UniqueFlattendCUDAFunctor { - const framework::ExecutionContext& ctx_; - const Tensor& in_; - Tensor* out_; - const bool return_index_; - const bool return_inverse_; - const bool return_counts_; - - UniqueFlattendCUDAFunctor(const framework::ExecutionContext& context, - const Tensor& in, Tensor* out, bool return_index, - bool return_inverse, bool return_counts) - : ctx_(context), - in_(in), - out_(out), - return_index_(return_index), - return_inverse_(return_inverse), - return_counts_(return_counts) {} - - template - void apply() const { - UniqueFlattendCUDATensor( - ctx_, in_, out_, return_index_, return_inverse_, return_counts_, - thrust::equal_to(), thrust::not_equal_to(), in_.numel()); - } -}; - -// functor for processing a multi-dimentional Tensor -template -struct UniqueDimsCUDAFunctor { - const framework::ExecutionContext& ctx_; - const Tensor& in_; - Tensor* out_; - const int axis_; - const bool return_index_; - const bool return_inverse_; - const bool return_counts_; - - UniqueDimsCUDAFunctor(const 
framework::ExecutionContext& context, - const Tensor& in, Tensor* out, const int axis, - bool return_index, bool return_inverse, - bool return_counts) - : ctx_(context), - in_(in), - out_(out), - axis_(axis), - return_index_(return_index), - return_inverse_(return_inverse), - return_counts_(return_counts) {} - - template - void apply() const { - UniqueDimsCUDATensor( - ctx_, in_, out_, return_index_, return_inverse_, return_counts_, axis_); - } -}; - -// Unique_op CUDA implementation. -template -class UniqueKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto data_type = static_cast( - context.Attr("dtype")); - if (data_type == framework::proto::VarType::INT32) { - PADDLE_ENFORCE_LE( - x->numel() + 1, INT_MAX, - platform::errors::InvalidArgument( - "The number of elements in Input(X) should be less than or " - "equal to INT_MAX, but received num is %d. Please set `dtype` to " - "int64.", - x->numel())); - } - - std::vector axis_vec = context.Attr>("axis"); - bool return_index = context.Attr("return_index"); - bool return_inverse = context.Attr("return_inverse"); - bool return_counts = context.Attr("return_counts"); - - // if 'axis' is not required, flatten the Tensor. - if (axis_vec.empty()) { - framework::VisitDataTypeTiny( - data_type, - UniqueFlattendCUDAFunctor( - context, *x, out, return_index, return_inverse, return_counts)); - } else { - // 'axis' is required. - int axis = axis_vec[0]; - framework::VisitDataTypeTiny( - data_type, UniqueDimsCUDAFunctor( - context, *x, out, axis, return_index, return_inverse, - return_counts)); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - unique, ops::UniqueKernel, - ops::UniqueKernel, - ops::UniqueKernel, - ops::UniqueKernel); diff --git a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc index 96320202b73fb..8c8684bf4b035 100644 --- a/paddle/fluid/operators/unstack_op.cc +++ b/paddle/fluid/operators/unstack_op.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/unstack_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,43 +27,6 @@ namespace operators { class UnStackOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "UnStack"); - int axis = ctx->Attrs().Get("axis"); - int num = ctx->Attrs().Get("num"); - auto x_dim = ctx->GetInputDim("X"); - int rank = x_dim.size(); - PADDLE_ENFORCE_GE(axis, -rank, - platform::errors::InvalidArgument( - "The attribute axis is out of range, it must be " - "inside [-rank, rank), where rank = %d", - rank)); - PADDLE_ENFORCE_LT(axis, rank, - platform::errors::InvalidArgument( - "The attribute axis is out of range, it must be " - "inside [-rank, rank), where rank = %d", - rank)); - if (axis < 0) axis += rank; - - PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast(num), - platform::errors::InvalidArgument( - "Number of Outputs(Y) is wrong. Got %d , but it must " - "equal to attribute num which is %d.", - ctx->Outputs("Y").size(), static_cast(num))); - if (x_dim[axis] > 0) { - PADDLE_ENFORCE_EQ( - num, x_dim[axis], - platform::errors::InvalidArgument( - "The number of attribute num is not equal to the length of the " - "%d axis of Input(X). Expect %d but got %d.", - axis, x_dim[axis], num)); - } - auto vec = phi::vectorize(x_dim); - vec.erase(vec.begin() + axis); - ctx->SetOutputsDim("Y", std::vector( // NOLINT - x_dim[axis], phi::make_ddim(vec))); - } }; class UnStackOpMaker : public framework::OpProtoAndCheckerMaker { @@ -141,20 +106,12 @@ class UnStackGradOp : public framework::OperatorWithKernel { namespace plat = paddle::platform; namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(unstack, UnStackInferMetaFunctor, + PD_INFER_META(phi::UnStackInferMeta)); + REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker, ops::UnStackGradOpMaker, - ops::UnStackGradOpMaker); + ops::UnStackGradOpMaker, + UnStackInferMetaFunctor); REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp); - -REGISTER_OP_CPU_KERNEL(unstack, - ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel); - -REGISTER_OP_CPU_KERNEL(unstack_grad, - ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel); diff --git a/paddle/fluid/operators/unstack_op.cu b/paddle/fluid/operators/unstack_op.cu deleted file mode 100644 index b591898a4d7aa..0000000000000 --- a/paddle/fluid/operators/unstack_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/unstack_op.h" - -namespace plat = paddle::platform; -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - unstack, ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel); - -REGISTER_OP_CUDA_KERNEL( - unstack_grad, ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel); diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h deleted file mode 100644 index 413470e3db5d4..0000000000000 --- a/paddle/fluid/operators/unstack_op.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" - -#if defined(__NVCC__) || defined(__HIPCC__) -#include -#endif - -namespace paddle { -namespace operators { - -template -struct StackFunctor { - HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) - : x_(x), y_(y), n_(n), post_(post) {} - - HOSTDEVICE void operator()(int idx) { - int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; - y_[idx] = x_[which_x][x_index]; - } - - private: - VecXType x_; - T *y_; - int n_; - int post_; -}; - -template -struct StackGradFunctor { - HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) - : dx_(dx), dy_(dy), n_(n), post_(post) {} - - HOSTDEVICE void operator()(int idx) { - int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; - dx_[which_x][x_index] = dy_[idx]; - } - - private: - VecDxType dx_; - const T *dy_; - int n_; - int post_; -}; - -template -static inline void StackFunctorForRange(const DeviceContext &ctx, - const VecXType &x, T *y, int total_num, - int n, int post) { - platform::ForRange for_range(ctx, total_num); - for_range(StackFunctor(x, y, n, post)); -} - -template -static inline void StackGradFunctorForRange(const DeviceContext &ctx, - const VecDxType &dx, const T *dy, - int total_num, int n, int post) { - platform::ForRange for_range(ctx, total_num); - for_range(StackGradFunctor(dx, dy, n, post)); -} - -template -class UnStackGradKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto x = ctx.MultiInput(framework::GradVarName("Y")); - auto *y = ctx.Output(framework::GradVarName("X")); - - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - - int n = static_cast(x.size()); - auto *y_data = y->mutable_data(ctx.GetPlace()); - std::vector x_datas(n); - for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); - - int pre = 1; - int post = 1; - auto &dim = x[0]->dims(); - for (auto i = 0; i < axis; ++i) pre *= dim[i]; - for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; - -#if 
defined(__NVCC__) || defined(__HIPCC__) - int total_num = pre * n * post; - auto &dev_ctx = ctx.template device_context(); - - thrust::device_vector device_x_vec(x_datas); - auto x_data_arr = device_x_vec.data().get(); - - StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); - - // Wait() must be called because device_x_vec may be destructed before - // kernel ends - dev_ctx.Wait(); -#else - auto x_data_arr = x_datas.data(); - - size_t x_offset = 0; - size_t y_offset = 0; - for (int i = 0; i < pre; i++) { - for (int j = 0; j < n; j++) { - std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset, - post * sizeof(T)); - y_offset += post; - } - x_offset += post; - } -#endif - } -}; - -template -class UnStackKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *dy = ctx.Input("X"); - auto dx = ctx.MultiOutput("Y"); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - - int n = dy->dims()[axis]; - std::vector dx_datas(n); // NOLINT - for (int i = 0; i < n; i++) { - dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); - } - auto dy_data = dy->data(); - if (dy->numel() == 0) return; - int pre = 1; - for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; - int total_num = dy->numel(); - int post = total_num / (n * pre); - - auto &dev_ctx = ctx.template device_context(); -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector device_dx_vec(dx_datas); - auto dx_data_arr = device_dx_vec.data().get(); -#else - auto dx_data_arr = dx_datas.data(); -#endif - StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post); -#if defined(__NVCC__) || defined(__HIPCC__) - // Wait() must be called because device_dx_vec may be destructed before - // kernel ends - dev_ctx.Wait(); -#endif - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unstack_op_npu.cc b/paddle/fluid/operators/unstack_op_npu.cc index fb88566e3426c..c55ec1fcf9044 100644 --- a/paddle/fluid/operators/unstack_op_npu.cc +++ b/paddle/fluid/operators/unstack_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/unstack_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index a190b222f86ac..9ef8e8a356c7a 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -44,6 +44,10 @@ namespace phi { _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::complex, DataType::COMPLEX128); +#define _PhiForEachDataTypeTiny_(callback) \ + _PhiForEachDataTypeHelper_(callback, int, DataType::INT32); \ + _PhiForEachDataTypeHelper_(callback, int64_t, DataType::INT64); + template inline void VisitDataType(phi::DataType type, Visitor visitor) { #define PhiVisitDataTypeCallback(cpp_type, data_type) \ @@ -59,4 +63,21 @@ inline void VisitDataType(phi::DataType type, Visitor visitor) { PADDLE_THROW(phi::errors::Unimplemented( "Not supported phi::DataType(%d) as data type.", static_cast(type))); } + +template +inline void VisitDataTypeTiny(phi::DataType type, Visitor visitor) { +#define PhiVisitDataTypeCallbackTiny(cpp_type, data_type) \ + do { \ + if (type == data_type) { \ + visitor.template apply(); \ + return; \ + } \ + } while (0) + + _PhiForEachDataTypeTiny_(PhiVisitDataTypeCallbackTiny); +#undef PhiVisitDataTypeCallbackTiny + PADDLE_THROW(phi::errors::Unimplemented( + "Not supported phi::DataType(%d) as data type.", static_cast(type))); +} + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 1e261abbcc28d..7f25572830508 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1167,6 +1167,52 @@ void RnnInferMeta(const MetaTensor& x, } } +void StackInferMeta(const std::vector& x, + int axis, + MetaTensor* out) { + PADDLE_ENFORCE_GT(x.size(), + 0UL, + phi::errors::InvalidArgument( + "Number of Inputs(x) must be larger than 0, but" + " received value is:%d.", + x.size())); + const auto& input_dims = GetMetaTensorsDim(x); + for (size_t i = 1; i < input_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(input_dims[i], + input_dims[0], + phi::errors::InvalidArgument( + "Dims of all Inputs(X) must be the same, but" + " received input %d dim is:%d not equal to input 0" + " dim:%d.", + i, + input_dims[i], + input_dims[0])); + } + int rank = input_dims[0].size(); + PADDLE_ENFORCE_GE( + axis, + -(rank + 1), + phi::errors::InvalidArgument( + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, " + "but received axis is:%d.", + rank, + axis)); + PADDLE_ENFORCE_LT( + axis, + rank + 1, + phi::errors::InvalidArgument( + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, " + "but received axis is:%d", + rank, + axis)); + if (axis < 0) axis += (rank + 1); + auto vec = phi::vectorize(input_dims[0]); + vec.insert(vec.begin() + axis, input_dims.size()); + out->set_dims(phi::make_ddim(vec)); + out->set_dtype(x.at(0)->dtype()); + out->share_lod(*x.at(0)); +} + void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, const paddle::optional logits_length, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 6261d521e0e5b..e750b3c558abf 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -231,6 +231,10 @@ void RnnInferMeta(const MetaTensor& x, std::vector state, MetaTensor* reserve); +void StackInferMeta(const std::vector& x, + int axis, + MetaTensor* out); + void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, 
const paddle::optional logits_length, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 0376d4e79e00d..582dcb0137894 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -345,6 +345,56 @@ void PutAlongAxisInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void RangeInferMeta(const MetaTensor& start, + const MetaTensor& end, + const MetaTensor& step, + MetaTensor* out) { + auto start_dims = start.dims(); + auto end_dims = end.dims(); + auto step_dims = step.dims(); + PADDLE_ENFORCE_EQ( + start_dims.size(), + 1, + phi::errors::InvalidArgument( + "The dim of the shape of Input(Start) should be 1, but got %d", + start_dims.size())); + + PADDLE_ENFORCE_EQ(start_dims[0], + 1, + phi::errors::InvalidArgument( + "The first dim of the shape of Input(Start) should " + "be 1, but got %d", + start_dims[0])); + PADDLE_ENFORCE_EQ( + end_dims.size(), + 1, + phi::errors::InvalidArgument( + "The dim of the shape of Input(End) should be 1, but got %d", + end_dims.size())); + + PADDLE_ENFORCE_EQ( + end_dims[0], + 1, + phi::errors::InvalidArgument("The first dim of the shape of " + "Input(End) should be 1, but got %d", + end_dims[0])); + PADDLE_ENFORCE_EQ( + step_dims.size(), + 1, + phi::errors::InvalidArgument( + "The dim of the shape of Input(Step) should be 1, but got %d", + step_dims.size())); + + PADDLE_ENFORCE_EQ(step_dims[0], + 1, + phi::errors::InvalidArgument( + "The first dim of the shape of Input(Step) should " + "be 1, but got %d", + step_dims[0])); + out->set_dims({-1}); + out->set_dtype(start.dtype()); +} + void RoiAlignInferMeta(const MetaTensor& x, const MetaTensor& boxes, paddle::optional boxes_num, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 30fdb4e612c8b..c18dde42f1ed2 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -81,6 +81,11 @@ void PutAlongAxisInferMeta(const MetaTensor& x, const std::string& reduce, MetaTensor* out); +void RangeInferMeta(const MetaTensor& start, + const MetaTensor& end, + const MetaTensor& step, + MetaTensor* out); + void RoiAlignInferMeta(const MetaTensor& x, const MetaTensor& boxes, paddle::optional boxes_num, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index bbeb14363e84e..cc267febc3850 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -2552,6 +2552,85 @@ void UnfoldInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim(out_dims)); } +void UniqueInferMeta(const MetaTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + MetaTensor* out, + MetaTensor* indices, + MetaTensor* index, + MetaTensor* counts) { + bool is_sorted = true; + UniqueRawInferMeta(x, + return_index, + return_inverse, + return_counts, + axis, + dtype, + is_sorted, + out, + indices, + index, + counts); +} + +void UniqueRawInferMeta(const MetaTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + MetaTensor* out, + MetaTensor* indices, + MetaTensor* index, + MetaTensor* counts) { + if (!is_sorted) { + PADDLE_ENFORCE_EQ( + x.dims().size(), + 1, + phi::errors::InvalidArgument("The Input(X) should be 1-D Tensor, " + "But now the dims of Input(X) is %d.", + x.dims().size())); + out->set_dims(phi::make_ddim({-1})); + index->set_dims(x.dims()); + return; + } + + if (axis.empty()) { + out->set_dims(phi::make_ddim({-1})); + if 
(return_inverse) { + index->set_dims(phi::make_ddim({phi::product(x.dims())})); + } + } else { + int axis_value = axis[0]; + if (axis_value < 0) { + axis_value += x.dims().size(); + } + PADDLE_ENFORCE_LT( + axis_value, + x.dims().size(), + phi::errors::InvalidArgument("The axis(%d) should be less than " + "the dimension size(%d) of x.", + axis_value, + x.dims().size())); + auto out_dims = x.dims(); + out_dims[axis_value] = -1; + out->set_dims(out_dims); + if (return_inverse) { + index->set_dims(phi::make_ddim({x.dims()[axis_value]})); + } + } + if (return_index) { + indices->set_dims(phi::make_ddim({-1})); + } + if (return_counts) { + counts->set_dims(phi::make_ddim({-1})); + } +} + void UnsqueezeInferMeta(const MetaTensor& x, const ScalarArray& axes, MetaTensor* xshape, @@ -2595,6 +2674,53 @@ void UnsqueezeInferMeta(const MetaTensor& x, xshape->set_dtype(x.dtype()); } +void UnStackInferMeta(const MetaTensor& x, + int axis, + int num, + std::vector outs) { + auto x_dim = x.dims(); + int rank = x_dim.size(); + PADDLE_ENFORCE_GE(axis, + -rank, + phi::errors::InvalidArgument( + "The attribute axis is out of range, it must be " + "inside [-rank, rank), where rank = %d", + rank)); + PADDLE_ENFORCE_LT(axis, + rank, + phi::errors::InvalidArgument( + "The attribute axis is out of range, it must be " + "inside [-rank, rank), where rank = %d", + rank)); + if (axis < 0) axis += rank; + + size_t output_count = outs.size(); + PADDLE_ENFORCE_EQ(output_count, + static_cast(num), + phi::errors::InvalidArgument( + "Number of Outputs(Y) is wrong. Got %d , but it must " + "equal to attribute num which is %d.", + output_count, + static_cast(num))); + if (x_dim[axis] > 0) { + PADDLE_ENFORCE_EQ( + num, + x_dim[axis], + phi::errors::InvalidArgument( + "The number of attribute num is not equal to the length of the " + "%d axis of Input(X). 
Expect %d but got %d.", + axis, + x_dim[axis], + num)); + } + auto vec = phi::vectorize(x_dim); + vec.erase(vec.begin() + axis); + for (size_t i = 0; i < output_count; i++) { + outs[i]->set_dims(phi::make_ddim(vec)); + outs[i]->set_dtype(x.dtype()); + } +} + void OneHotRawInferMeta(const MetaTensor& x, int32_t depth, DataType dtype, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index ea902e0d98eca..c6efe05c65067 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -360,12 +360,40 @@ void UnfoldInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void UniqueInferMeta(const MetaTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + MetaTensor* out, + MetaTensor* indices, + MetaTensor* index, + MetaTensor* counts); + +void UniqueRawInferMeta(const MetaTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + MetaTensor* out, + MetaTensor* indices, + MetaTensor* index, + MetaTensor* counts); + void UnsqueezeInferMeta(const MetaTensor& x, const ScalarArray& axes, MetaTensor* xshape, MetaTensor* out, MetaConfig config = MetaConfig()); +void UnStackInferMeta(const MetaTensor& x, + int axis, + int num, + std::vector outs); + void OneHotRawInferMeta(const MetaTensor& x, int32_t depth, DataType dtype, diff --git a/paddle/phi/kernels/cpu/range_kernel.cc b/paddle/phi/kernels/cpu/range_kernel.cc new file mode 100644 index 0000000000000..8731696f61760 --- /dev/null +++ b/paddle/phi/kernels/cpu/range_kernel.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/range_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/range_function.h" + +namespace phi { + +template +void RangeKernel(const Context& dev_ctx, + const DenseTensor& start, + const DenseTensor& end, + const DenseTensor& step, + DenseTensor* out) { + T start_value = start.data()[0]; + T end_value = end.data()[0]; + T step_value = step.data()[0]; + int64_t size = 0; + phi::funcs::GetSize(start_value, end_value, step_value, &size); + out->Resize(phi::make_ddim({size})); + T* out_data = dev_ctx.template Alloc(out); + T value = start_value; + for (int64_t i = 0; i < size; ++i) { + out_data[i] = value; + value += step_value; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + range, CPU, ALL_LAYOUT, phi::RangeKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/stack_grad_kernel.cc b/paddle/phi/kernels/cpu/stack_grad_kernel.cc new file mode 100644 index 0000000000000..018705333e962 --- /dev/null +++ b/paddle/phi/kernels/cpu/stack_grad_kernel.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/stack_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/stack_functor.h" + +namespace phi { + +template +void StackGradKernel(const Context& dev_ctx, + const DenseTensor& out, + int axis, + std::vector x_grad) { + if (axis < 0) axis += out.dims().size(); + int n = out.dims()[axis]; + std::vector dx_datas(n); // NOLINT + + for (int i = 0; i < n; i++) { + if (x_grad[i] == nullptr) { + dx_datas[i] = nullptr; + } else { + dx_datas[i] = dev_ctx.template Alloc(x_grad[i]); + } + } + auto dy_data = out.data(); + int pre = 1; + for (int i = 0; i < axis; ++i) pre *= out.dims()[i]; + int total_num = out.numel(); + int post = total_num / (n * pre); + auto dx_data_arr = dx_datas.data(); + phi::funcs::StackGradFunctorForRange( + dev_ctx, dx_data_arr, dy_data, total_num, n, post); +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack_grad, + CPU, + ALL_LAYOUT, + phi::StackGradKernel, + float, + double, + int64_t, + int, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/stack_kernel.cc b/paddle/phi/kernels/cpu/stack_kernel.cc new file mode 100644 index 0000000000000..5eb1cf061be2b --- /dev/null +++ b/paddle/phi/kernels/cpu/stack_kernel.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/stack_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void StackKernel(const Context& dev_ctx, + const std::vector& x, + int axis, + DenseTensor* out) { + if (axis < 0) axis += (x[0]->dims().size() + 1); + int n = static_cast(x.size()); + T* y_data = dev_ctx.template Alloc(out); + std::vector x_datas(n); + for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); + + int pre = 1, post = 1; + auto& dim = x[0]->dims(); + for (auto i = 0; i < axis; ++i) pre *= dim[i]; + for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; + + auto x_data_arr = x_datas.data(); + + size_t x_offset = 0; + size_t y_offset = 0; + for (int i = 0; i < pre; i++) { + for (int j = 0; j < n; j++) { + std::memcpy( + y_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T)); + y_offset += post; + } + x_offset += post; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack, + CPU, + ALL_LAYOUT, + phi::StackKernel, + float, + double, + int, + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/unique_kernel.cc b/paddle/phi/kernels/cpu/unique_kernel.cc new file mode 100644 index 0000000000000..853b401315d22 --- /dev/null +++ b/paddle/phi/kernels/cpu/unique_kernel.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unique_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/unique_functor.h" + +namespace phi { + +template +void UniqueKernel(const Context& context, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts) { + bool is_sorted = true; + UniqueRawKernel(context, + x, + return_index, + return_inverse, + return_counts, + axis, + dtype, + is_sorted, + out, + indices, + index, + counts); +} + +template +void UniqueRawKernel(const Context& context, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts) { + if (dtype == phi::DataType::INT32) { + PADDLE_ENFORCE_LE( + x.numel(), + INT_MAX, + phi::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. 
Please set `dtype` to " + "int64.", + x.numel())); + } + if (!is_sorted) { + phi::VisitDataType( + dtype, + phi::funcs::UniqueOpFunctor(context, out, index, &x)); + return; + } + + if (x.numel() == 0) { + context.template Alloc(out); + return; + } + if (axis.empty()) { + phi::VisitDataTypeTiny( + dtype, + phi::funcs::UniqueFlattendTensorFunctor(context, + x, + out, + indices, + index, + counts, + return_index, + return_inverse, + return_counts)); + } else { + int axis_value = axis[0]; + phi::VisitDataTypeTiny( + dtype, + phi::funcs::UniqueDimFunctor(context, + x, + out, + indices, + index, + counts, + axis_value, + return_index, + return_inverse, + return_counts)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(unique, + CPU, + ALL_LAYOUT, + phi::UniqueKernel, + float, + double, + int32_t, + int64_t) {} + +PD_REGISTER_KERNEL(unique_raw, + CPU, + ALL_LAYOUT, + phi::UniqueRawKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/unstack_grad_kernel.cc b/paddle/phi/kernels/cpu/unstack_grad_kernel.cc new file mode 100644 index 0000000000000..9c2dce808dca7 --- /dev/null +++ b/paddle/phi/kernels/cpu/unstack_grad_kernel.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/unstack_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(unstack_grad, + CPU, + ALL_LAYOUT, + phi::UnStackGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/unstack_kernel.cc b/paddle/phi/kernels/cpu/unstack_kernel.cc new file mode 100644 index 0000000000000..3d233e9ec405f --- /dev/null +++ b/paddle/phi/kernels/cpu/unstack_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/unstack_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unstack_kernel_impl.h" + +PD_REGISTER_KERNEL( + unstack, CPU, ALL_LAYOUT, phi::UnStackKernel, float, double, int, int64_t) { +} diff --git a/paddle/phi/kernels/funcs/range_function.h b/paddle/phi/kernels/funcs/range_function.h new file mode 100644 index 0000000000000..5ace32f46ace1 --- /dev/null +++ b/paddle/phi/kernels/funcs/range_function.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace funcs { + +template +void GetSize(T start, T end, T step, int64_t* size) { + PADDLE_ENFORCE_NE( + step, + 0, + phi::errors::InvalidArgument("The step of range op should not be 0.")); + + if (start < end) { + PADDLE_ENFORCE_GT( + step, + 0, + phi::errors::InvalidArgument( + "The step should be greater than 0 while start < end.")); + } + + if (start > end) { + PADDLE_ENFORCE_LT(step, + 0, + phi::errors::InvalidArgument( + "The step should be less than 0 while start > end.")); + } + + *size = std::is_integral::value + ? ((std::abs(end - start) + std::abs(step) - 1) / std::abs(step)) + : std::ceil(std::abs((end - start) / step)); +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/stack_functor.h b/paddle/phi/kernels/funcs/stack_functor.h new file mode 100644 index 0000000000000..68379c27058ad --- /dev/null +++ b/paddle/phi/kernels/funcs/stack_functor.h @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +namespace funcs { + +template +struct StackFunctor { + HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) + : x_(x), y_(y), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + y_[idx] = x_[which_x][x_index]; + } + + private: + VecXType x_; + T *y_; + int n_; + int post_; +}; + +template +struct StackGradFunctor { + HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) + : dx_(dx), dy_(dy), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + if (dx_[which_x] != nullptr) dx_[which_x][x_index] = dy_[idx]; + } + + private: + VecDxType dx_; + const T *dy_; + int n_; + int post_; +}; + +template +static inline void StackFunctorForRange(const DeviceContext &ctx, + const VecXType &x, + T *y, + int total_num, + int n, + int post) { + phi::funcs::ForRange for_range(ctx, total_num); + for_range(StackFunctor(x, y, n, post)); +} + +template +static inline void StackGradFunctorForRange(const DeviceContext &ctx, + const VecDxType &dx, + const T *dy, + int total_num, + int n, + int post) { + phi::funcs::ForRange for_range(ctx, total_num); + for_range(StackGradFunctor(dx, dy, n, post)); +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/unique_functor.h b/paddle/phi/kernels/funcs/unique_functor.h new file mode 100644 index 0000000000000..2bb51cdab65c6 --- /dev/null +++ b/paddle/phi/kernels/funcs/unique_functor.h @@ -0,0 +1,426 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +template +struct UniqueOpFunctor { + const Context& context_; + DenseTensor* out_; + DenseTensor* index_; + const DenseTensor* in_; + DenseTensor* count_; + + UniqueOpFunctor(const Context& context, + DenseTensor* out, + DenseTensor* index, + const DenseTensor* in, + DenseTensor* count = nullptr) + : context_(context), out_(out), index_(index), in_(in), count_(count) {} + + template + void apply() const { + auto* in_data = in_->data(); + auto* index_data = context_.template Alloc(index_); + + int64_t j = 0; + + // TODO(fangzeyang): Should optimize performance here. 
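// Illustrative walk-through of the unsorted unique pass below (input values chosen
// arbitrarily, not taken from the patch): for in_data = [2, 3, 3, 1, 5, 3] a single
// scan builds
//   dict       = {2:0, 3:1, 1:2, 5:3}   (value -> position of its first occurrence)
//   uniq       = [2, 3, 1, 5]           (first-occurrence order, not sorted)
//   index_data = [0, 1, 1, 2, 3, 1]     (inverse index written for every element)
// and, when count_ is requested, count_data = [1, 3, 1, 1] is accumulated from index_data.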
+ std::unordered_map dict; + std::vector uniq; + + PADDLE_ENFORCE_LT( + in_->numel(), + pow(2, 31), + phi::errors::InvalidArgument( + "The num of Input(X) elements should be less then INT_MAX, " + "but received num is %d.", + in_->numel())); + + for (auto i = 0; i < in_->numel(); i++) { + auto it = dict.find(in_data[i]); + if (it == dict.end()) { + dict.emplace(std::make_pair(in_data[i], j)); + uniq.emplace_back(in_data[i]); + index_data[i] = static_cast(j); + j++; + } else { + index_data[i] = static_cast(it->second); + } + } + + if (count_ != nullptr) { + // Resize the count tensor dims to allocate the memory + count_->Resize(phi::make_ddim({static_cast(uniq.size())})); + IndexT* count_data = context_.template Alloc(count_); + // init count_data to 0 + memset(count_data, 0, uniq.size() * sizeof(IndexT)); + + const auto& index_type = index_->dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + phi::errors::InvalidArgument( + "Index holds the wrong type, it holds %s, " + "but desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT64)))); + + if (index_type == DataType::INT32) { + for (auto i = 0; i < in_->numel(); ++i) { + const IndexT& index = index_data[i]; + count_data[static_cast(index)] += static_cast(1); + } + } else { + for (auto i = 0; i < in_->numel(); ++i) { + const IndexT& index = index_data[i]; + count_data[static_cast(index)] += static_cast(1); + } + } + } + + out_->Resize(phi::make_ddim({static_cast(uniq.size())})); + auto* out_data = context_.template Alloc(out_); + std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT)); + } +}; + +static std::vector Unbind(const DenseTensor& in) { + int64_t size = in.dims()[0]; + std::vector tensors(size); + for (int64_t i = 0; i < size; ++i) { + tensors[i] = in.Slice(i, i + 1); + } + return tensors; +} + +template +static bool Equal(const DenseTensor& a, const DenseTensor& b) { + if (a.numel() != b.numel()) { + return false; + } + for (int64_t i = 0; i < a.numel(); ++i) { + if (a.data()[i] != b.data()[i]) { + return false; + } + } + return true; +} + +template +static void UniqueFlattendTensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + bool return_index, + bool return_inverse, + bool return_counts) { + const InT* in_data = in.data(); + std::set unique(in_data, in_data + in.numel()); + out->Resize(phi::make_ddim({static_cast(unique.size())})); + auto* out_data = context.template Alloc(out); + std::copy(unique.begin(), unique.end(), out_data); + + if (return_index) { + indices->Resize(phi::make_ddim({out->numel()})); + auto indices_data = context.template Alloc(indices); + std::unordered_map indices_map; + indices_map.reserve(out->numel()); + for (int64_t i = 0; i < in.numel(); ++i) { + if (indices_map.find(in_data[i]) != indices_map.end()) continue; + indices_map[in_data[i]] = i; + } + for (int64_t i = 0; i < out->numel(); ++i) { + indices_data[i] = indices_map[out_data[i]]; + } + } + + if (return_inverse) { + index->Resize(phi::make_ddim({in.numel()})); + auto inverse_data = context.template Alloc(index); + std::unordered_map inverse_map; + inverse_map.reserve(out->numel()); + for (int64_t i 
= 0; i < out->numel(); ++i) { + inverse_map[out_data[i]] = i; + } + for (int64_t i = 0; i < in.numel(); ++i) { + inverse_data[i] = inverse_map[in_data[i]]; + } + } + + if (return_counts) { + count->Resize(phi::make_ddim({out->numel()})); + auto count_data = context.template Alloc(count); + std::unordered_map counts_map; + counts_map.reserve(out->numel()); + for (int64_t i = 0; i < out->numel(); ++i) { + counts_map[out_data[i]] = 0; + } + for (int64_t i = 0; i < in.numel(); i++) { + counts_map[in_data[i]] += 1; + } + for (int64_t i = 0; i < out->numel(); i++) { + count_data[i] = counts_map[out_data[i]]; + } + } +} + +template +static ForwardIt UniqueDimImpl(const Context& context, + ForwardIt first, + ForwardIt last, + const std::vector& sorted_indices_vec, + std::vector* inverse_vec, + std::vector* counts_vec, + std::vector* indices_vec) { + if (first == last) { + return last; + } + + (*inverse_vec)[sorted_indices_vec[0]] = 0; + (*counts_vec)[0] = 1; + (*indices_vec)[0] = sorted_indices_vec[0]; + + ForwardIt begin = first; + ForwardIt result = first; + + while (++first != last) { + int64_t idx_first = std::distance(begin, first); + int64_t idx_result = std::distance(begin, result); + if (!Equal(*result, *first)) { + if (++result != first) { + *result = std::move(*first); + } + idx_result += 1; + (*indices_vec)[idx_result] = sorted_indices_vec[idx_first]; + } + (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result; + (*counts_vec)[idx_result] += 1; + } + return ++result; +} + +template +static void UniqueDim(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + bool return_index, + bool return_inverse, + bool return_counts, + int axis) { + // transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(phi::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + DenseTensor in_trans; + phi::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + context.template Alloc(&in_trans); + TransCompute(in.dims().size(), context, in, &in_trans, permute); + // reshape tensor: eg. 
[dim1, dim0, dim2] -> [dim1, dim0*dim2] + phi::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + // sort indices + std::vector sorted_indices_vec(in_trans.dims()[0]); + std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0); + int64_t col = in_trans.dims()[1]; + const InT* in_trans_data = in_trans.data(); + std::sort(sorted_indices_vec.begin(), + sorted_indices_vec.end(), + [&](int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + // sort tensor according to indices + DenseTensor input_sorted; + input_sorted.Resize(in_trans_dims); + context.template Alloc(&input_sorted); + InT* input_sorted_data = input_sorted.data(); + for (size_t i = 0; i < sorted_indices_vec.size(); ++i) { + memcpy(input_sorted_data + i * col, + in_trans_data + static_cast(sorted_indices_vec[i]) * col, + col * sizeof(InT)); + } + + std::vector input_unbind = Unbind(input_sorted); + std::vector inverse_vec(sorted_indices_vec.size(), 0); + std::vector counts_vec(sorted_indices_vec.size(), 0); + std::vector indices_vec(sorted_indices_vec.size(), 0); + auto last = UniqueDimImpl::iterator, InT>( + context, + input_unbind.begin(), + input_unbind.end(), + sorted_indices_vec, + &inverse_vec, + &counts_vec, + &indices_vec); + input_unbind.erase(last, input_unbind.end()); + counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); + indices_vec.erase(indices_vec.begin() + input_unbind.size(), + indices_vec.end()); + + phi::funcs::ConcatFunctor concat_functor; + DenseTensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = input_unbind.size(); + out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(&out_trans); + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(out); + concat_functor(context, input_unbind, 0, &out_trans); + TransCompute( + out_trans.dims().size(), context, out_trans, out, permute); + + if (return_inverse) { + paddle::framework::TensorFromVector(inverse_vec, context, index); + } + + if (return_counts) { + paddle::framework::TensorFromVector(counts_vec, context, count); + } + + if (return_index) { + paddle::framework::TensorFromVector(indices_vec, context, indices); + } +} + +template +struct UniqueFlattendTensorFunctor { + const Context& ctx_; /* */ + const DenseTensor& in_; + DenseTensor* out_; + DenseTensor* indices_; + DenseTensor* index_; + DenseTensor* count_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueFlattendTensorFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + bool return_index, + bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + indices_(indices), + index_(index), + count_(count), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueFlattendTensor(ctx_, + in_, + out_, + indices_, + index_, + count_, + return_index_, + return_inverse_, + return_counts_); + } +}; + +template +struct UniqueDimFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + DenseTensor* 
indices_; + DenseTensor* index_; + DenseTensor* count_; + const int axis_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueDimFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + const int axis, + bool return_index, + bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + indices_(indices), + index_(index), + count_(count), + axis_(axis), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueDim(ctx_, + in_, + out_, + indices_, + index_, + count_, + return_index_, + return_inverse_, + return_counts_, + axis_); + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/range_kernel.cu b/paddle/phi/kernels/gpu/range_kernel.cu new file mode 100644 index 0000000000000..65d9b45efbcdd --- /dev/null +++ b/paddle/phi/kernels/gpu/range_kernel.cu @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/range_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/range_function.h" + +namespace phi { + +template +__global__ void Range(T start, T step, int64_t size, T* out) { + CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; } +} + +template +void RangeKernel(const Context& dev_ctx, + const DenseTensor& start, + const DenseTensor& end, + const DenseTensor& step, + DenseTensor* out) { + T start_value = start.data()[0]; + T end_value = end.data()[0]; + T step_value = step.data()[0]; + + int64_t size = 0; + phi::funcs::GetSize(start_value, end_value, step_value, &size); + out->Resize(phi::make_ddim({size})); + T* out_data = dev_ctx.template Alloc(out); + + auto stream = dev_ctx.stream(); + int block = std::min(size, static_cast(256)); + int grid = (size + block - 1) / block; + Range<<>>(start_value, step_value, size, out_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + range, GPU, ALL_LAYOUT, phi::RangeKernel, float, double, int64_t, int) { + kernel->InputAt(0).SetBackend(phi::Backend::CPU); + kernel->InputAt(1).SetBackend(phi::Backend::CPU); + kernel->InputAt(2).SetBackend(phi::Backend::CPU); +} diff --git a/paddle/phi/kernels/gpu/stack_grad_kernel.cu b/paddle/phi/kernels/gpu/stack_grad_kernel.cu new file mode 100644 index 0000000000000..9b754e22692af --- /dev/null +++ b/paddle/phi/kernels/gpu/stack_grad_kernel.cu @@ -0,0 +1,143 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/stack_grad_kernel.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input, + int pre_dim_size, + int split_dim_size, + int suf_dim_size, + int num_split, + T** output_ptrs) { + assert(blockDim.y == 1); + assert(blockDim.z == 1); + // In this case they are equal + assert(split_dim_size % num_split == 0); + + IntType size = pre_dim_size * split_dim_size * suf_dim_size; + IntType each_dim_size = split_dim_size / num_split; + + for (IntType offset = blockIdx.x * blockDim.x + threadIdx.x; offset < size; + offset += blockDim.x * gridDim.x) { + IntType i = offset / (split_dim_size * suf_dim_size); + IntType j = (offset % (split_dim_size * suf_dim_size)) / suf_dim_size; + IntType k = offset % suf_dim_size; + + T* output = output_ptrs[j / each_dim_size]; + if (output == nullptr) { + return; + } + IntType output_ind = i * each_dim_size * suf_dim_size + + (j % each_dim_size) * suf_dim_size + k; + *(output + output_ind) = input[offset]; + } +} + +template +void StackGradKernel(const Context& dev_ctx, + const DenseTensor& out, + int axis, + std::vector x_grad) { + if (axis < 0) axis += out.dims().size(); + + int n = out.dims()[axis]; + PADDLE_ENFORCE_EQ(n, + x_grad.size(), + phi::errors::InvalidArgument( + "Output x_grad size should be equal to n, but" + " received n is:%d x_grad size is:%d.", + n, + x_grad.size())); + + // x_grad is output, so save each data address, then copy each dy into dx_data + std::vector outputs(n); + for (size_t j = 0; j < x_grad.size(); ++j) { + if (x_grad[j] == nullptr) { + outputs[j] = nullptr; + continue; + } + if (x_grad[j]->numel() != 0UL) { + T* ptr = dev_ctx.template Alloc(x_grad[j]); + outputs[j] = ptr; + } else { + outputs[j] = nullptr; + } + } + auto dy_data = out.data(); + // each x_grad should have same shape + int dy_pre = 1, dy_suf = 1; + auto dy_dims = out.dims(); + int split_dim = n; + for (int i = 0; i < axis; ++i) { + dy_pre *= dy_dims[i]; + } + dy_suf = out.numel() / (split_dim * dy_pre); + + auto tmp_out_data = + paddle::memory::Alloc(dev_ctx, outputs.size() * sizeof(T*)); + paddle::memory::Copy(dev_ctx.GetPlace(), + tmp_out_data->ptr(), + phi::CPUPlace(), + reinterpret_cast(outputs.data()), + outputs.size() * sizeof(T*), + dev_ctx.stream()); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, dy_pre * split_dim * dy_suf); + + if (out.numel() < std::numeric_limits::max()) { + UnStackHelperCUDAKernel<<>>( + dy_data, + dy_pre, + split_dim, + dy_suf, + split_dim, + reinterpret_cast(tmp_out_data->ptr())); + } else { + UnStackHelperCUDAKernel<<>>( + dy_data, + dy_pre, + split_dim, + dy_suf, + split_dim, + reinterpret_cast(tmp_out_data->ptr())); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack_grad, + GPU, + ALL_LAYOUT, + phi::StackGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git 
a/paddle/phi/kernels/gpu/stack_kernel.cu b/paddle/phi/kernels/gpu/stack_kernel.cu new file mode 100644 index 0000000000000..cc7d136c95293 --- /dev/null +++ b/paddle/phi/kernels/gpu/stack_kernel.cu @@ -0,0 +1,113 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/stack_kernel.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__global__ void StackCUDAKernel(T** input_ptrs, + int split_size, + int rows, + int cols, + T* __restrict__ output) { + IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x; + + for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) { + IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y; + + IntType split = grid_x / split_size; + const T* input_ptr = input_ptrs[split]; + IntType col_offset = grid_x % split_size; +#pragma unroll + for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) { + output[grid_y * cols + grid_x] = + input_ptr[grid_y * split_size + col_offset]; + } + } +} + +template +void StackKernel(const Context& dev_ctx, + const std::vector& x, + int axis, + DenseTensor* out) { + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + T* y_data = dev_ctx.template Alloc(out); + std::vector x_datas(n); + for (int i = 0; i < n; i++) { + x_datas[i] = x[i]->data(); + } + + auto tmp_x_data = paddle::memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*)); + paddle::memory::Copy(dev_ctx.GetPlace(), + tmp_x_data->ptr(), + phi::CPUPlace(), + reinterpret_cast(x_datas.data()), + x_datas.size() * sizeof(T*), + dev_ctx.stream()); + + // Split x dim from axis to matrix + int x_row = 1, x_col = 1; + for (int i = 0; i < axis; ++i) { + x_row *= x[0]->dims()[i]; + } + x_col = x[0]->numel() / x_row; + int out_col = x_col * n; + + auto config = + phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row); + + if (out->numel() < std::numeric_limits::max()) { + StackCUDAKernel<<>>( + reinterpret_cast(tmp_x_data->ptr()), + x_col, + x_row, + out_col, + y_data); + } else { + StackCUDAKernel<<>>( + reinterpret_cast(tmp_x_data->ptr()), + x_col, + x_row, + out_col, + y_data); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack, + GPU, + ALL_LAYOUT, + phi::StackKernel, + float, + double, + int64_t, + int, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu new file mode 100644 index 0000000000000..c09730ba76a91 --- /dev/null +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -0,0 +1,615 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unique_kernel.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/unique_functor.h" + +namespace phi { + +// Binary function 'less than' +template +struct LessThan { + int col; + const InT* in_trans_data; + + LessThan(int64_t _col, const InT* _in_trans_data) + : col(_col), in_trans_data(_in_trans_data) {} + + __device__ bool operator()(int64_t a, int64_t b) const { + for (int i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + } +}; + +// Binary function 'equal_to' +template +struct BinaryEqual { + int64_t col; + const InT* in_trans_data; + + BinaryEqual(int64_t _col, const InT* _in_trans_data) + : col(_col), in_trans_data(_in_trans_data) {} + + __device__ bool operator()(int64_t a, int64_t b) const { + for (int64_t i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs != rhs) { + return false; + } + } + return true; + } +}; + +// Binary function 'not_equal_to' +template +struct BinaryNotEqual { + int64_t col; + const InT* in_trans_data; + + BinaryNotEqual(int64_t _col, const InT* _in_trans_data) + : col(_col), in_trans_data(_in_trans_data) {} + + __device__ bool operator()(int64_t a, int64_t b) const { + for (int64_t i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs != rhs) { + return true; + } + } + return false; + } +}; + +// index_select() function for DenseTensor +template +void IndexSelect(const Context& context, + const DenseTensor& input, + const DenseTensor& index, + DenseTensor* output, + int dim) { + auto input_dim = input.dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = output->dims(); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto input_width = slice_size * input_dim[dim]; + auto output_width = slice_size * output_dim[dim]; + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + auto index_size = index.dims()[0]; + + std::vector input_vec; + std::vector index_vec; + paddle::framework::TensorToVector(input, context, &input_vec); + paddle::framework::TensorToVector(index, context, &index_vec); + std::vector out_vec(output->numel()); + + for (int i = 0; i < index_size; i++) { + PADDLE_ENFORCE_GE( + index_vec[i], + 0, + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. 
Please check input " + "value.", + input_dim[dim], + index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], + input_dim[dim], + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], + index_vec[i])); + } + + for (auto i = 0; i < outer_nums; i++) { + auto input_start_offset = i * input_width; + auto output_start_offset = i * output_width; + + for (auto j = 0; j < index_size; j++) { + IndexT index_value = index_vec[j]; + for (auto k = 0; k < slice_size; k++) { + out_vec[output_start_offset + j * slice_size + k] = + input_vec[input_start_offset + index_value * slice_size + k]; + } + } + } + context.template Alloc(output); + paddle::framework::TensorFromVector(out_vec, context, output); + output->Resize(output_dim); +} + +// The core logic of computing Unique for a flattend DenseTensor +template +static void UniqueFlattendCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + equal_T equal, + not_equal_T not_equal, + int64_t num_input) { + // 0. Prepration + DenseTensor in_hat; + phi::Copy(context, in, context.GetPlace(), false, &in_hat); + auto* in_data_hat = context.template Alloc(&in_hat); + + indices->Resize(phi::make_ddim({num_input})); + auto* indices_data = context.template Alloc(indices); + + thrust::sequence(thrust::device, indices_data, indices_data + num_input); + thrust::sort_by_key( + thrust::device, in_data_hat, in_data_hat + num_input, indices_data); + + // 1. Calculate op result: 'out' + DenseTensor range; + range.Resize(phi::make_ddim({num_input + 1})); + auto* range_data_ptr = context.template Alloc(&range); + thrust::sequence( + thrust::device, range_data_ptr, range_data_ptr + num_input + 1); + phi::Copy(context, in_hat, context.GetPlace(), false, out); + int num_out; + auto out_data = context.template Alloc(out); + num_out = + thrust::unique_by_key( + thrust::device, out_data, out_data + num_input, range_data_ptr, equal) + .first - + out_data; + out->Resize(phi::make_ddim({num_out})); + + // 3. Calculate inverse index: 'inverse' + if (return_inverse) { + index->Resize(phi::make_ddim({num_input})); + auto* inverse_data = context.template Alloc(index); + DenseTensor inv_loc; + inv_loc.Resize(phi::make_ddim({num_input})); + auto inv_loc_data_ptr = context.template Alloc(&inv_loc); + thrust::adjacent_difference(thrust::device, + in_data_hat, + in_data_hat + num_input, + inv_loc_data_ptr, + not_equal); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault + thrust::inclusive_scan(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + num_input, + inv_loc_data_ptr); + thrust::scatter(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + num_input, + indices_data, + inverse_data); + } + + // 2. Calculate sorted index: 'indices' + if (return_index) { + DenseTensor tmp_indices; + tmp_indices.Resize(phi::make_ddim({num_input})); + auto* tmp_indices_data_ptr = context.template Alloc(&tmp_indices); + thrust::copy(thrust::device, + in_data_hat, + in_data_hat + num_input, + tmp_indices_data_ptr); + thrust::unique_by_key(thrust::device, + tmp_indices_data_ptr, + tmp_indices_data_ptr + num_input, + indices_data, + equal); + indices->Resize(phi::make_ddim({num_out})); + } + + // 4. 
Calculate 'counts' + if (return_counts) { + counts->Resize(phi::make_ddim({num_out})); + auto count_data = context.template Alloc(counts); + // init 'count_data' as 0 + thrust::fill(thrust::device, count_data, count_data + num_out, 0); + thrust::device_ptr range_data_ptr_dev(range_data_ptr); + range_data_ptr_dev[num_out] = num_input; + thrust::adjacent_difference(thrust::device, + range_data_ptr + 1, + range_data_ptr + num_out + 1, + count_data); + } +} + +// The logic of compute unique with axis required, it's a little different +// from above function +template +static void ComputeUniqueDims(const Context& context, + DenseTensor* sorted_indices, + IndexT* sorted_indices_data, + DenseTensor* out, + DenseTensor* inverse, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + equal_T equal, + not_equal_T not_equal, + int64_t row) { + // 1. inverse indices: 'inverse' + inverse->Resize(phi::make_ddim({row})); + auto* inverse_data = context.template Alloc(inverse); + DenseTensor inv_loc; + inv_loc.Resize(phi::make_ddim({row})); + auto inv_loc_data_ptr = context.template Alloc(&inv_loc); + thrust::adjacent_difference(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + inv_loc_data_ptr, + not_equal); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; + thrust::inclusive_scan(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + row, + inv_loc_data_ptr); + thrust::scatter(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + row, + sorted_indices_data, + inverse_data); + + // 2. sorted indices + DenseTensor range; + range.Resize(phi::make_ddim({row + 1})); + auto range_data_ptr = context.template Alloc(&range); + thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); + int num_out; + num_out = thrust::unique_by_key(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + range_data_ptr, + equal) + .first - + sorted_indices_data; + thrust::device_ptr range_data_ptr_dev(range_data_ptr); + range_data_ptr_dev[num_out] = row; + sorted_indices->Resize(phi::make_ddim({num_out})); + + // 3. counts: 'counts' + counts->Resize(phi::make_ddim({num_out})); + auto* count_data = context.template Alloc(counts); + thrust::fill(thrust::device, count_data, count_data + row, 0); + thrust::adjacent_difference( + thrust::device, range_data_ptr + 1, range_data_ptr + row + 1, count_data); +} + +// Calculate unique when 'axis' is set +template +static void UniqueDimsCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + int axis) { + // 1. Transpose & reshape + // Transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(phi::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + DenseTensor in_trans; + auto in_trans_dims = phi::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + context.template Alloc(&in_trans); + phi::funcs::TransCompute( + in.dims().size(), // num of dims + context, // device + in, // original DenseTensor + &in_trans, // DenseTensor after reshape + permute); // index of axis + + // Reshape tensor: eg. 
[dim1, dim0, dim2] -> [dim1, dim0*dim2] + auto in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + // now 'in_trans' is 2D + int64_t col = in_trans.dims()[1]; + int64_t row = in_trans.dims()[0]; + const InT* in_trans_data = in_trans.data(); + + indices->Resize(phi::make_ddim({row})); + auto* sorted_indices_data = context.template Alloc(indices); + + // 2. Calculate 'indices', 'inverse', 'counts' + // Init index and sort + thrust::sequence( + thrust::device, sorted_indices_data, sorted_indices_data + row); + thrust::sort(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + LessThan(col, in_trans_data)); + ComputeUniqueDims( + context, + indices, + sorted_indices_data, + out, + index, + counts, + return_index, + return_inverse, + return_counts, + BinaryEqual(col, in_trans_data), + BinaryNotEqual(col, in_trans_data), + row); + + // 3. Select indices and reshape back to get 'out' + DenseTensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = indices->numel(); + out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(&out_trans); + + IndexSelect(context, in_trans, *indices, &out_trans, 0); + + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(out); + std::vector out_trans_unbind = phi::funcs::Unbind(out_trans); + phi::funcs::ConcatFunctor concat_functor; + concat_functor(context, out_trans_unbind, 0, &out_trans); + phi::funcs::TransCompute( + out_trans.dims().size(), context, out_trans, out, permute); +} + +// functor for processing a flattend DenseTensor +template +struct UniqueFlattendCUDAFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + DenseTensor* indices_; + DenseTensor* index_; + DenseTensor* counts_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueFlattendCUDAFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + indices_(indices), + index_(index), + counts_(counts), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueFlattendCUDATensor(ctx_, + in_, + out_, + indices_, + index_, + counts_, + return_index_, + return_inverse_, + return_counts_, + thrust::equal_to(), + thrust::not_equal_to(), + in_.numel()); + } +}; + +// functor for processing a multi-dimentional DenseTensor +template +struct UniqueDimsCUDAFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + DenseTensor* indices_; + DenseTensor* index_; + DenseTensor* counts_; + const int axis_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueDimsCUDAFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + const int axis, + bool return_index, + bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + indices_(indices), + index_(index), + counts_(counts), + axis_(axis), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueDimsCUDATensor(ctx_, + in_, + out_, + indices_, + 
index_, + counts_, + return_index_, + return_inverse_, + return_counts_, + axis_); + } +}; + +template +void UniqueRawKernel(const Context& context, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts) { + if (dtype == phi::DataType::INT32) { + PADDLE_ENFORCE_LE( + x.numel() + 1, + INT_MAX, + phi::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. Please set `dtype` to " + "int64.", + x.numel())); + } + // if 'axis' is not required, flatten the DenseTensor. + if (axis.empty()) { + phi::VisitDataTypeTiny( + dtype, + UniqueFlattendCUDAFunctor(context, + x, + out, + indices, + index, + counts, + return_index, + return_inverse, + return_counts)); + } else { + // 'axis' is required. + int axis_value = axis[0]; + phi::VisitDataTypeTiny(dtype, + UniqueDimsCUDAFunctor(context, + x, + out, + indices, + index, + counts, + axis_value, + return_index, + return_inverse, + return_counts)); + } +} + +template +void UniqueKernel(const Context& context, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts) { + bool is_sorted = true; + UniqueRawKernel(context, + x, + return_index, + return_inverse, + return_counts, + axis, + dtype, + is_sorted, + out, + indices, + index, + counts); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + unique, GPU, ALL_LAYOUT, phi::UniqueKernel, float, double, int64_t, int) {} + +PD_REGISTER_KERNEL(unique_raw, + GPU, + ALL_LAYOUT, + phi::UniqueRawKernel, + float, + double, + int64_t, + int) {} diff --git a/paddle/phi/kernels/gpu/unstack_grad_kernel.cu b/paddle/phi/kernels/gpu/unstack_grad_kernel.cu new file mode 100644 index 0000000000000..b7c349de0df32 --- /dev/null +++ b/paddle/phi/kernels/gpu/unstack_grad_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unstack_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(unstack_grad, + GPU, + ALL_LAYOUT, + phi::UnStackGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/unstack_kernel.cu b/paddle/phi/kernels/gpu/unstack_kernel.cu new file mode 100644 index 0000000000000..f147f4c0f0edf --- /dev/null +++ b/paddle/phi/kernels/gpu/unstack_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unstack_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unstack_kernel_impl.h" + +PD_REGISTER_KERNEL(unstack, + GPU, + ALL_LAYOUT, + phi::UnStackKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h new file mode 100644 index 0000000000000..0576742e349a8 --- /dev/null +++ b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/stack_functor.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include +#endif +namespace phi { + +template +void UnStackGradKernel(const Context &dev_ctx, + const std::vector &x, + int axis, + DenseTensor *x_grad) { + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + auto *x_grad_data = dev_ctx.template Alloc(x_grad); + std::vector x_datas(n); + for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); + + int pre = 1; + int post = 1; + auto &dim = x[0]->dims(); + for (auto i = 0; i < axis; ++i) pre *= dim[i]; + for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; + +#if defined(__NVCC__) || defined(__HIPCC__) + int total_num = pre * n * post; + + thrust::device_vector device_x_vec(x_datas); + auto x_data_arr = device_x_vec.data().get(); + + phi::funcs::StackFunctorForRange( + dev_ctx, x_data_arr, x_grad_data, total_num, n, post); + + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#else + auto x_data_arr = x_datas.data(); + + size_t x_offset = 0; + size_t y_offset = 0; + for (int i = 0; i < pre; i++) { + for (int j = 0; j < n; j++) { + std::memcpy( + x_grad_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T)); + y_offset += post; + } + x_offset += post; + } +#endif +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/unstack_kernel_impl.h b/paddle/phi/kernels/impl/unstack_kernel_impl.h new file mode 100644 index 0000000000000..030f4a62c6e00 --- /dev/null +++ b/paddle/phi/kernels/impl/unstack_kernel_impl.h @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/stack_functor.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include +#endif + +namespace phi { + +template +void UnStackKernel(const Context &dev_ctx, + const DenseTensor &x, + int axis, + int num, + std::vector outs) { + auto *dy = &x; + auto dx = outs; + if (axis < 0) axis += dy->dims().size(); + + int n = dy->dims()[axis]; + std::vector dx_datas(n); // NOLINT + for (int i = 0; i < n; i++) { + dx_datas[i] = dev_ctx.template Alloc(dx[i]); + } + auto dy_data = dy->data(); + if (dy->numel() == 0) return; + int pre = 1; + for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; + int total_num = dy->numel(); + int post = total_num / (n * pre); + +#if defined(__NVCC__) || defined(__HIPCC__) + thrust::device_vector device_dx_vec(dx_datas); + auto dx_data_arr = device_dx_vec.data().get(); +#else + auto dx_data_arr = dx_datas.data(); +#endif + phi::funcs::StackGradFunctorForRange( + dev_ctx, dx_data_arr, dy_data, total_num, n, post); +#if defined(__NVCC__) || defined(__HIPCC__) + // Wait() must be called because device_dx_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif +} + +} // namespace phi diff --git a/paddle/phi/kernels/range_kernel.h b/paddle/phi/kernels/range_kernel.h new file mode 100644 index 0000000000000..c76308193ae5e --- /dev/null +++ b/paddle/phi/kernels/range_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RangeKernel(const Context& dev_ctx, + const DenseTensor& start, + const DenseTensor& end, + const DenseTensor& step, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/stack_grad_kernel.h b/paddle/phi/kernels/stack_grad_kernel.h new file mode 100644 index 0000000000000..32451e606f26a --- /dev/null +++ b/paddle/phi/kernels/stack_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template <typename T, typename Context> +void StackGradKernel(const Context& dev_ctx, + const DenseTensor& out, + int axis, + std::vector<DenseTensor*> x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/stack_kernel.h b/paddle/phi/kernels/stack_kernel.h new file mode 100644 index 0000000000000..679c74063080e --- /dev/null +++ b/paddle/phi/kernels/stack_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template <typename T, typename Context> +void StackKernel(const Context& dev_ctx, + const std::vector<const DenseTensor*>& x, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/unique_kernel.h b/paddle/phi/kernels/unique_kernel.h new file mode 100644 index 0000000000000..353570c8e7da3 --- /dev/null +++ b/paddle/phi/kernels/unique_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template <typename T, typename Context> +void UniqueKernel(const Context& dev_ctx, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector<int>& axis, + DataType dtype, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts); + +template <typename T, typename Context> +void UniqueRawKernel(const Context& dev_ctx, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector<int>& axis, + DataType dtype, + bool is_sorted, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts); + +} // namespace phi diff --git a/paddle/phi/kernels/unstack_grad_kernel.h b/paddle/phi/kernels/unstack_grad_kernel.h new file mode 100644 index 0000000000000..de0e3004d8038 --- /dev/null +++ b/paddle/phi/kernels/unstack_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template <typename T, typename Context> +void UnStackGradKernel(const Context& dev_ctx, + const std::vector<const DenseTensor*>& x, + int axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/unstack_kernel.h b/paddle/phi/kernels/unstack_kernel.h new file mode 100644 index 0000000000000..0494aa6327c21 --- /dev/null +++ b/paddle/phi/kernels/unstack_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template <typename T, typename Context> +void UnStackKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + int num, + std::vector<DenseTensor*> outs); + +} // namespace phi diff --git a/paddle/phi/ops/compat/stack_sig.cc b/paddle/phi/ops/compat/stack_sig.cc new file mode 100644 index 0000000000000..97768eb89026e --- /dev/null +++ b/paddle/phi/ops/compat/stack_sig.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature StackGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "stack_grad", {GradVarName("Y")}, {"axis"}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(stack_grad, phi::StackGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/unique_sig.cc b/paddle/phi/ops/compat/unique_sig.cc new file mode 100644 index 0000000000000..2a7ba543012f3 --- /dev/null +++ b/paddle/phi/ops/compat/unique_sig.cc @@ -0,0 +1,42 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UniqueOpArgumentMapping(const ArgumentMappingContext& ctx) { + bool is_sorted = paddle::any_cast<bool>(ctx.Attr("is_sorted")); + if (is_sorted) { + return KernelSignature( + "unique", + {"X"}, + {"return_index", "return_inverse", "return_counts", "axis", "dtype"}, + {"Out", "Indices", "Index", "Counts"}); + } else { + return KernelSignature("unique_raw", + {"X"}, + {"return_index", + "return_inverse", + "return_counts", + "axis", + "dtype", + "is_sorted"}, + {"Out", "Indices", "Index", "Counts"}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(unique, phi::UniqueOpArgumentMapping); diff --git a/paddle/phi/ops/compat/unstack_sig.cc b/paddle/phi/ops/compat/unstack_sig.cc new file mode 100644 index 0000000000000..41d7fc120a9ef --- /dev/null +++ b/paddle/phi/ops/compat/unstack_sig.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UnStackGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "unstack_grad", {GradVarName("Y")}, {"axis"}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(unstack_grad, phi::UnStackGradOpArgumentMapping);
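Note on the axis-wise unique path: UniqueDimsCUDATensor in the hunk above transposes the target axis to the front, flattens the data to a [row, col] matrix, sorts row indices with a lexicographic comparator, and then collapses runs of equal rows. The snippet below is a minimal CPU-only sketch of that idea; UniqueRowIndices and its helpers are illustrative stand-ins (not part of this diff), which performs the same steps with thrust on the GPU.

// Illustrative CPU analogue of the row-wise "unique with axis" strategy:
// sort row indices lexicographically, then keep one index per run of
// equal rows. All names here are hypothetical.
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int64_t> UniqueRowIndices(const float* data,
                                      int64_t row,
                                      int64_t col) {
  std::vector<int64_t> idx(row);
  for (int64_t i = 0; i < row; ++i) idx[i] = i;  // like thrust::sequence

  auto row_less = [&](int64_t a, int64_t b) {  // like LessThan(col, data)
    return std::lexicographical_compare(data + a * col,
                                        data + (a + 1) * col,
                                        data + b * col,
                                        data + (b + 1) * col);
  };
  std::sort(idx.begin(), idx.end(), row_less);  // like thrust::sort

  auto row_equal = [&](int64_t a, int64_t b) {  // like BinaryEqual(col, data)
    return std::equal(data + a * col, data + (a + 1) * col, data + b * col);
  };
  // Keep one representative index per group of equal rows.
  idx.erase(std::unique(idx.begin(), idx.end(), row_equal), idx.end());
  return idx;
}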
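Note on the unstack kernels: they reduce the problem to a pre/n/post decomposition, where dimensions before axis fold into pre, the unstacked dimension has extent n, and the remaining dimensions fold into post, so data moves as contiguous post-sized chunks (the CPU branch of UnStackGradKernel walks exactly this pattern in the stacking direction). Below is a hedged CPU sketch of the splitting direction; UnstackByChunks and the plain std::vector buffers are made-up stand-ins for the DenseTensor-based code.

// Minimal CPU sketch of the pre/n/post indexing: a buffer viewed as
// [pre, n, post] is split into n outputs of shape [pre, post] by copying
// post-sized chunks.
#include <cstring>
#include <vector>

void UnstackByChunks(const std::vector<float>& in,
                     int pre, int n, int post,
                     std::vector<std::vector<float>>* outs) {
  outs->assign(n, std::vector<float>(static_cast<size_t>(pre) * post));
  size_t in_offset = 0;
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < n; ++j) {
      // The j-th output receives the i-th post-sized slice of group j.
      std::memcpy((*outs)[j].data() + static_cast<size_t>(i) * post,
                  in.data() + in_offset,
                  sizeof(float) * post);
      in_offset += post;
    }
  }
}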
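Note on the dtype dispatch: UniqueRawKernel hands a functor with a templated apply() to phi::VisitDataTypeTiny so the index dtype (int32 vs int64) is chosen at runtime. The sketch below mimics that visitor pattern with simplified stand-ins (IndexDType, VisitIndexDType, PrintIndexWidthFunctor are not the real phi utilities); it only shows why the functors in the hunk expose apply() as a template.

// Hedged, self-contained sketch of runtime-dtype -> templated-apply dispatch.
#include <cstdint>
#include <iostream>
#include <stdexcept>

enum class IndexDType { kInt32, kInt64 };

template <typename Functor>
void VisitIndexDType(IndexDType dtype, const Functor& functor) {
  switch (dtype) {
    case IndexDType::kInt32: functor.template apply<int32_t>(); break;
    case IndexDType::kInt64: functor.template apply<int64_t>(); break;
    default: throw std::runtime_error("unsupported index dtype");
  }
}

struct PrintIndexWidthFunctor {
  template <typename IndexT>
  void apply() const {
    std::cout << "index width: " << sizeof(IndexT) * 8 << " bits\n";
  }
};

int main() {
  // The real kernel passes UniqueFlattendCUDAFunctor / UniqueDimsCUDAFunctor
  // as the visitor instead of this toy functor.
  VisitIndexDType(IndexDType::kInt64, PrintIndexWidthFunctor{});
  return 0;
}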
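Note on the compat files: they map the legacy fluid ops onto the new phi kernels, and unique_sig.cc in particular routes the "unique" op to the "unique" kernel when is_sorted is true and to "unique_raw" (which still receives the is_sorted attribute) otherwise. A simplified stand-alone illustration of that branching follows; the Signature struct and MapUniqueOp are hypothetical stand-ins for phi::KernelSignature and the real mapping function.

// Toy model of the attribute-driven kernel selection in UniqueOpArgumentMapping.
#include <string>
#include <vector>

struct Signature {
  std::string kernel_name;
  std::vector<std::string> attrs;
};

Signature MapUniqueOp(bool is_sorted) {
  if (is_sorted) {
    return {"unique",
            {"return_index", "return_inverse", "return_counts", "axis",
             "dtype"}};
  }
  return {"unique_raw",
          {"return_index", "return_inverse", "return_counts", "axis", "dtype",
           "is_sorted"}};
}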