[XPU] add top_k op #44656

Merged 4 commits on Jul 28, 2022
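This PR adds the top_k op to the Kunlun2 (XPU2) backend: the kernel is reworked on top of xdnn's sorted_topk, top_k is registered for FP32 and FP16 in the KL2 op list, the XPU SDK dependency is bumped to the 20220727 snapshot, and the unit test is migrated to the XPUOpTestWrapper framework.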
4 changes: 2 additions & 2 deletions cmake/external/xpu.cmake
@@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
 if(NOT DEFINED XPU_BASE_URL)
   set(XPU_BASE_URL_WITHOUT_DATE
       "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220722")
+  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220727")
 else()
   set(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
@@ -19,7 +19,7 @@ endif()
 if(NOT DEFINED XPU_XDNN_BASE_URL)
   set(XPU_XDNN_BASE_URL_WITHOUT_DATE
       "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev")
-  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220722")
+  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220727")
 else()
   set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}")
 endif()
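Both packages follow the same URL scheme, a fixed base plus a date directory, so this change pins the XPU runtime and the XDNN kernel library to the same 2022-07-27 SDK snapshot; the runtime, for example, is fetched from https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20220727.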
77 changes: 41 additions & 36 deletions paddle/fluid/operators/top_k_op_xpu.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <memory>
 
 #include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
 #include "xpu/refactor/math.h"
 
 namespace paddle {
@@ -25,17 +26,26 @@ namespace operators {
 using Tensor = framework::Tensor;
 template <typename T>
 class TopkXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
-    auto* input = ctx.Input<Tensor>("X");
+    const auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     auto* indices = ctx.Output<Tensor>("Indices");
 
-    size_t k = static_cast<int>(ctx.Attr<int>("k"));
+    // get k from attr
+    int k = static_cast<int>(ctx.Attr<int>("k"));
+
+    // get k from input tensor
     auto* k_t = ctx.Input<Tensor>("K");
     if (k_t) {
-      k = k_t->data<int>()[0];
+      memory::Copy(platform::CPUPlace(),
+                   static_cast<void*>(&k),
+                   ctx.GetPlace(),
+                   static_cast<const void*>(k_t->data<int>()),
+                   sizeof(int));
       framework::DDim output_dims = output->dims();
       output_dims[output_dims.size() - 1] = k;
       output->Resize(output_dims);
@@ -44,49 +54,44 @@ class TopkXPUKernel : public framework::OpKernel<T> {
 
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-    Tensor indices_32_data_tensor;
-    int32_t* indices_int_data = indices_32_data_tensor.mutable_data<int32_t>(
-        ctx.GetPlace(), indices->numel());
 
+    auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
+    // allocate temp memory for int32 index
+    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+    int* indices_int_data = RAII_GUARD.alloc_l3_or_gm<int>(indices->numel());
+    PADDLE_ENFORCE_XDNN_NOT_NULL(indices_int_data);
+
     // reshape input to a flattened matrix (like flat_inner_dims)
     framework::DDim inputdims = input->dims();
     const size_t row =
         phi::product(phi::slice_ddim(inputdims, 0, inputdims.size() - 1));
     const size_t col = inputdims[inputdims.size() - 1];
-    auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
 
-    int ret = xpu::sorted_topk<T>(dev_ctx.x_context(),
-                                  input->data<T>(),
-                                  output_data,
-                                  indices_int_data,
-                                  row,
-                                  col,
-                                  k);
-    PADDLE_ENFORCE_EQ(ret,
-                      XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU API return wrong value[%d] in call kernel name "
-                          "[%s], please check "
-                          "where Baidu Kunlun Card is properly installed.",
-                          ret,
-                          "sorted_topk"));
-    ret = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
-                                         (const int32_t*)indices_int_data,
-                                         indices_data,
-                                         indices->numel());
-    PADDLE_ENFORCE_EQ(ret,
-                      XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU API return wrong value[%d] in call kernel name "
-                          "[%s], please check "
-                          "where Baidu Kunlun Card is properly installed.",
-                          ret,
-                          "cast_v2"));
+    // int sorted_topk(Context* ctx, const T* x, T* y, int* index, int m,
+    //                 int n, int k, bool largest = true);
+    int r = xpu::sorted_topk(dev_ctx.x_context(),
+                             reinterpret_cast<const XPUType*>(input->data<T>()),
+                             reinterpret_cast<XPUType*>(output_data),
+                             indices_int_data,
+                             row,
+                             col,
+                             k);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");
+
+    // cast to int64 as final result
+    r = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
+                                       (const int32_t*)indices_int_data,
+                                       indices_data,
+                                       indices->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(top_k, ops::TopkXPUKernel<float>);
+REGISTER_OP_XPU_KERNEL(top_k,
+                       ops::TopkXPUKernel<float>,
+                       ops::TopkXPUKernel<paddle::platform::float16>);
 #endif
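A note on the two steps above. When k is supplied as the optional K tensor it lives in XPU device memory, so its value is staged back to the host with memory::Copy before the outputs are resized; the old k_t->data<int>()[0] dereferenced device memory directly from host code. And since xdnn's sorted_topk writes int32 indices, a final cast_v2 widens them to the int64 layout of Paddle's Indices output. For reference, a minimal NumPy sketch of the contract the kernel has to satisfy (the helper name is illustrative, not part of this PR):

import numpy as np

def topk_reference(x, k):
    # Collapse all leading dimensions into rows, mirroring how the kernel
    # derives `row` and `col` from the input DDim (flat_inner_dims).
    rows = x.reshape(-1, x.shape[-1])
    # Per-row indices of the k largest values, sorted descending: this is
    # what sorted_topk(..., largest=true) produces as int32 before the cast.
    order = np.argsort(rows, axis=1)[:, ::-1][:, :k]
    values = np.take_along_axis(rows, order, axis=1)
    out_shape = x.shape[:-1] + (k,)
    return (values.reshape(out_shape),
            order.astype(np.int64).reshape(out_shape))

This is the same expectation the unit test below builds row by row with np.sort and argsort.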
7 changes: 3 additions & 4 deletions paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -105,13 +105,9 @@ XPUOpMap& get_kl2_ops() {
     {"elementwise_add",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
-    {"elementwise_div_grad",
-     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"elementwise_div_grad",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
-    {"elementwise_div",
-     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"elementwise_div",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
@@ -495,6 +491,9 @@ XPUOpMap& get_kl2_ops() {
     {"transpose",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
+    {"top_k",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                   pOpKernelType(vartype::FP16, XPUPlace())})},
     {"top_k_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"unsqueeze2_grad",
      XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()),
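This registry is also what the test tooling reads: get_xpu_op_support_types('top_k') in the new test discovers the kernel types listed here, so registering both FP32 and FP16 is what makes create_test_class generate a float32 and a float16 variant of every case. The removed elementwise_div and elementwise_div_grad lines were duplicate map keys left over from an earlier edit; only one entry per op can take effect, so the redundant FP32-only copies are dropped in favor of the FP32+FP16 ones.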
126 changes: 85 additions & 41 deletions python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,62 +18,106 @@
 import sys
 
 sys.path.append("..")
-from paddle.fluid.op import Operator
-import paddle.fluid.core as core
-import paddle.fluid as fluid
 import paddle
-from op_test import OpTest
+from op_test_xpu import XPUOpTest
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
 
 paddle.enable_static()
 
 
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestTopkOp(OpTest):
+def random_unique_float(row, k, dtype):
+    # draw 10x as many samples as needed so that enough distinct values
+    # survive the deduplication below
+    arr = np.random.uniform(-10.0, 10.0, int(row * k * 10)).astype(dtype)
+    arr = np.unique(arr)
+    assert arr.shape[0] >= row * k, \
+        "failed to create enough unique values: %d vs %d" % (arr.shape[0],
+                                                             row * k)
+    arr = arr[:row * k]
+    np.random.shuffle(arr)
+    arr = arr.reshape(row, k)
+    return arr

-    def setUp(self):
-        self.variable_k = False
-        self.use_xpu = True
-        self.set_args()
-        self.op_type = "top_k"
-        self.dtype = np.float32
-        self.init_dtype()
-
-        k = self.top_k
-        input = np.random.random((self.row, k)).astype(self.dtype)
-        output = np.ndarray((self.row, k))
-        indices = np.ndarray((self.row, k)).astype("int64")
-        self.inputs = {'X': input}
-
-        if self.variable_k:
-            self.inputs['K'] = np.array([k]).astype("int32")
-        else:
-            self.attrs = {'k': k}
-
-        for rowid in range(self.row):
-            row = input[rowid]
-            output[rowid] = np.sort(row)[::-1][:k]
-            indices[rowid] = row.argsort()[::-1][:k]
-
-        self.outputs = {'Out': output, 'Indices': indices}
-
-    def init_dtype(self):
-        self.dtype = np.float32
-
-    def set_args(self):
-        self.row = 100
-        self.top_k = 1
-
-    def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
-
-    def test_check_grad(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(place, ['X'], 'Out')
+class XPUTestTopkOP(XPUOpTestWrapper):
+
+    def __init__(self):
+        self.op_name = 'top_k'
+        self.use_dynamic_create_class = False
+
+    class TestXPUTopkOP(XPUOpTest):
+
+        def setUp(self):
+            self.place = paddle.XPUPlace(0)
+            self.init_dtype()
+            self.op_type = 'top_k'
+            self.set_case()
+
+            # Generate UNIQUE float values as input to avoid the following
+            # potential problem: if x[i] and x[j] are IDENTICAL float values,
+            # the CPU reference index may be [i, j] while the XPU result is
+            # [j, i]; both are correct but differ in the numpy comparison.
+            k = self.top_k
+            input = random_unique_float(self.row, k, self.dtype)
+            output = np.ndarray((self.row, k))
+            indices = np.ndarray((self.row, k)).astype("int64")
+            self.inputs = {'X': input}
+
+            if self.variable_k:
+                self.inputs['K'] = np.array([k]).astype("int32")
+            else:
+                self.attrs = {'k': k}
+
+            for rowid in range(self.row):
+                row = input[rowid]
+                output[rowid] = np.sort(row)[::-1][:k]
+                indices[rowid] = row.argsort()[::-1][:k]
+
+            self.outputs = {'Out': output, 'Indices': indices}
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def set_case(self):
+            self.variable_k = False
+            self.row = 16
+            self.top_k = 8
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            self.check_grad_with_place(self.place, ['X'], 'Out')

+    class TestTopk1(TestXPUTopkOP):
+
+        def set_case(self):
+            self.variable_k = True
+            self.row = 100
+            self.top_k = 1
+
+    class TestTopk2(TestXPUTopkOP):
+
+        def set_case(self):
+            self.variable_k = False
+            self.row = 16
+            self.top_k = 256
+
+    class TestTopk3(TestXPUTopkOP):
+
+        def set_case(self):
+            self.variable_k = True
+            self.row = 10
+            self.top_k = 512
+
+    class TestTopk4(TestXPUTopkOP):
+
+        def set_case(self):
+            self.variable_k = False
+            self.row = 5
+            self.top_k = 511
 
 
+support_types = get_xpu_op_support_types('top_k')
+for stype in support_types:
+    create_test_class(globals(), XPUTestTopkOP, stype)
 
 if __name__ == "__main__":
     unittest.main()
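Between them, the four subclasses cover both ways of passing k, the 'k' attribute and the runtime K tensor (i.e. both branches of the if (k_t) path in the kernel), with k ranging from 1 to 512 and including the non-power-of-two 511. The unique-value helper is what keeps the index comparison deterministic; an illustrative sanity check using the helper above:

arr = random_unique_float(16, 8, np.float32)
assert arr.shape == (16, 8)
assert np.unique(arr).size == arr.size  # all values distinct, so argsort order is unambiguous

Finally, an end-to-end sketch of exercising the op on a Kunlun card. Everything here is an assumption rather than part of the PR: it presumes a Paddle build compiled with XPU support, an attached XPU device, and that the legacy fluid.layers.topk wrapper (which lowers to this top_k op) is available on this branch:

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
x = fluid.data(name='x', shape=[16, 256], dtype='float32')
values, indices = fluid.layers.topk(x, k=8)  # assumed wrapper; lowers to top_k
exe = fluid.Executor(paddle.XPUPlace(0))  # run on the Kunlun device
vals, idx = exe.run(feed={'x': np.random.rand(16, 256).astype('float32')},
                    fetch_list=[values, indices])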