diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 8021d2a6f80c6..28a9bdfc581bf 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
 if(NOT DEFINED XPU_BASE_URL)
   set(XPU_BASE_URL_WITHOUT_DATE
       "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220722")
+  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220727")
 else()
   set(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
@@ -19,7 +19,7 @@ endif()
 if(NOT DEFINED XPU_XDNN_BASE_URL)
   set(XPU_XDNN_BASE_URL_WITHOUT_DATE
       "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev")
-  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220722")
+  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220727")
 else()
   set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}")
 endif()
diff --git a/paddle/fluid/operators/top_k_op_xpu.cc b/paddle/fluid/operators/top_k_op_xpu.cc
index b4ae07a2bb551..9ffcd4d46fc1a 100644
--- a/paddle/fluid/operators/top_k_op_xpu.cc
+++ b/paddle/fluid/operators/top_k_op_xpu.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <memory>
 
 #include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
 #include "xpu/refactor/math.h"
 
 namespace paddle {
@@ -25,17 +26,26 @@ namespace operators {
 using Tensor = framework::Tensor;
 template <typename T>
 class TopkXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
-    auto* input = ctx.Input<Tensor>("X");
+    const auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     auto* indices = ctx.Output<Tensor>("Indices");
-    size_t k = static_cast<int>(ctx.Attr<int>("k"));
+
+    // get k from attr
+    int k = static_cast<int>(ctx.Attr<int>("k"));
+
+    // get k from input tensor
     auto* k_t = ctx.Input<Tensor>("K");
     if (k_t) {
-      k = k_t->data<int>()[0];
+      memory::Copy(platform::CPUPlace(),
+                   static_cast<void*>(&k),
+                   ctx.GetPlace(),
+                   static_cast<const void*>(k_t->data<int>()),
+                   sizeof(int));
       framework::DDim output_dims = output->dims();
       output_dims[output_dims.size() - 1] = k;
       output->Resize(output_dims);
@@ -44,43 +54,36 @@ class TopkXPUKernel : public framework::OpKernel<T> {
 
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-    Tensor indices_32_data_tensor;
-    int32_t* indices_int_data = indices_32_data_tensor.mutable_data<int32_t>(
-        ctx.GetPlace(), indices->numel());
+
+    auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
+    // allocate temp memory for int32 index
+    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+    int* indices_int_data = RAII_GUARD.alloc_l3_or_gm<int>(indices->numel());
+    PADDLE_ENFORCE_XDNN_NOT_NULL(indices_int_data);
+
     // reshape input to a flattern matrix(like flat_inner_dims)
     framework::DDim inputdims = input->dims();
     const size_t row =
         phi::product(phi::slice_ddim(inputdims, 0, inputdims.size() - 1));
     const size_t col = inputdims[inputdims.size() - 1];
-    auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
-    int ret = xpu::sorted_topk(dev_ctx.x_context(),
-                               input->data<T>(),
-                               output_data,
-                               indices_int_data,
-                               row,
-                               col,
-                               k);
-    PADDLE_ENFORCE_EQ(ret,
-                      XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU API return wrong value[%d] in call kernel name "
-                          "[%s], please check "
-                          "where Baidu Kunlun Card is properly installed.",
-                          ret,
-                          "sorted_topk"));
-    ret = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
-                                         (const int32_t*)indices_int_data,
-                                         indices_data,
-                                         indices->numel());
-    PADDLE_ENFORCE_EQ(ret,
-                      XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU API return wrong value[%d] in call kernel name "
-                          "[%s], please check "
-                          "where Baidu Kunlun Card is properly installed.",
-                          ret,
-                          "cast_v2"));
+    // int sorted_topk(Context* ctx, const T* x, T* y, int* index, int m, int n,
+    //                 int k, bool largest = true);
+    int r = xpu::sorted_topk(dev_ctx.x_context(),
+                             reinterpret_cast<const XPUType*>(input->data<T>()),
+                             reinterpret_cast<XPUType*>(output_data),
+                             indices_int_data,
+                             row,
+                             col,
+                             k);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");
+
+    // cast to int64 as final result
+    r = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
+                                       (const int32_t*)indices_int_data,
+                                       indices_data,
+                                       indices->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
   }
 };
 
@@ -88,5 +91,7 @@ class TopkXPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(top_k, ops::TopkXPUKernel<float>);
+REGISTER_OP_XPU_KERNEL(top_k,
+                       ops::TopkXPUKernel<float>,
+                       ops::TopkXPUKernel<paddle::platform::float16>);
 #endif
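A note on the kernel rewrite above: the old path read `k_t->data<int>()[0]` directly, which dereferences the `K` tensor's device pointer on the host; the new code stages the scalar back to `CPUPlace` via `memory::Copy` before resizing the outputs, and the hand-rolled `PADDLE_ENFORCE_EQ` checks are replaced by the `PADDLE_ENFORCE_XDNN_*` macros pulled in through `device_wrapper.h`. Below is a minimal usage sketch of the legacy `top_k` op this kernel serves; it is illustrative only and assumes an XPU build of Paddle with a Kunlun card attached (`fluid.layers.top_k` is the public wrapper that reaches this kernel):

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    main = fluid.Program()
    with fluid.program_guard(main):
        x = fluid.data(name="x", shape=[4, 16], dtype="float32")
        values, indices = fluid.layers.top_k(x, k=3)

    exe = fluid.Executor(paddle.XPUPlace(0))
    vals, idx = exe.run(main,
                        feed={"x": np.random.rand(4, 16).astype("float32")},
                        fetch_list=[values, indices])
    print(vals.shape, idx.dtype)  # (4, 3) int64 -- indices are cast from int32 on device
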
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index 1d4a5bf74b8df..56da75844e619 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -105,13 +105,9 @@ XPUOpMap& get_kl2_ops() {
     {"elementwise_add",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
-    {"elementwise_div_grad",
-     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"elementwise_div_grad",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
-    {"elementwise_div",
-     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"elementwise_div",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
@@ -495,6 +491,9 @@ XPUOpMap& get_kl2_ops() {
     {"transpose",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
+    {"top_k",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                   pOpKernelType(vartype::FP16, XPUPlace())})},
     {"top_k_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"unsqueeze2_grad",
      XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()),
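The `xpu2_op_list.h` entry above is what the Python test harness keys off: `get_xpu_op_support_types` derives an op's dtype list from this table. A quick way to confirm the registration took effect (a sketch, assuming it runs from `python/paddle/fluid/tests/unittests/xpu` so the same `sys.path` trick as the test applies; the printed list is an expectation based on the FP32/FP16 entries, not a captured output):

    import sys
    sys.path.append("..")
    from xpu.get_test_cover_info import get_xpu_op_support_types

    print(get_xpu_op_support_types('top_k'))  # expected on a KL2 build: ['float32', 'float16']
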
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
index 1fa4a5e8b7d00..5b2177cdbe1aa 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,62 +18,106 @@ import sys
 
 sys.path.append("..")
-from paddle.fluid.op import Operator
-import paddle.fluid.core as core
-import paddle.fluid as fluid
 import paddle
 from op_test import OpTest
+from op_test_xpu import XPUOpTest
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
 
 paddle.enable_static()
 
 
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestTopkOp(OpTest):
+def random_unique_float(row, k, dtype):
+    # create a random float array with 10x length
+    arr = np.random.uniform(-10.0, 10.0, int(row * k * 10)).astype(dtype)
+    arr = np.unique(arr)
+    assert arr.shape[
+        0] >= row * k, "failed to create enough unique values: %d vs %d" % (
+            arr.shape[0], row * k)
+    arr = arr[:row * k]
+    np.random.shuffle(arr)
+    arr = arr.reshape(row, k)
+    return arr
 
-    def setUp(self):
-        self.variable_k = False
-        self.use_xpu = True
-        self.set_args()
-        self.op_type = "top_k"
-        self.dtype = np.float32
-        self.init_dtype()
-        k = self.top_k
-        input = np.random.random((self.row, k)).astype(self.dtype)
-        output = np.ndarray((self.row, k))
-        indices = np.ndarray((self.row, k)).astype("int64")
-        self.inputs = {'X': input}
+class XPUTestTopkOP(XPUOpTestWrapper):
 
-        if self.variable_k:
-            self.inputs['K'] = np.array([k]).astype("int32")
-        else:
-            self.attrs = {'k': k}
+    def __init__(self):
+        self.op_name = 'top_k'
+        self.use_dynamic_create_class = False
 
-        for rowid in range(self.row):
-            row = input[rowid]
-            output[rowid] = np.sort(row)[::-1][:k]
-            indices[rowid] = row.argsort()[::-1][:k]
+    class TestXPUTopkOP(XPUOpTest):
 
-        self.outputs = {'Out': output, 'Indices': indices}
+        def setUp(self):
+            self.place = paddle.XPUPlace(0)
+            self.init_dtype()
+            self.op_type = 'top_k'
+            self.set_case()
 
-    def init_dtype(self):
-        self.dtype = np.float32
+            # generate UNIQUE float values as input, in order to prevent the
+            # following potential problem: x[i] and x[j] are IDENTICAL float
+            # values, the result of cpu index is [i, j] while the xpu result
+            # is [j, i]. Both of them are correct but diff in numpy compare.
+            k = self.top_k
+            input = random_unique_float(self.row, k, self.dtype)
+            output = np.ndarray((self.row, k))
+            indices = np.ndarray((self.row, k)).astype("int64")
+            self.inputs = {'X': input}
 
-    def set_args(self):
-        self.row = 100
-        self.top_k = 1
+            if self.variable_k:
+                self.inputs['K'] = np.array([k]).astype("int32")
+            else:
+                self.attrs = {'k': k}
 
-    def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
+            for rowid in range(self.row):
+                row = input[rowid]
+                output[rowid] = np.sort(row)[::-1][:k]
+                indices[rowid] = row.argsort()[::-1][:k]
 
-    def test_check_grad(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(place, ['X'], 'Out')
+            self.outputs = {'Out': output, 'Indices': indices}
 
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def set_case(self):
+            self.variable_k = False
+            self.row = 16
+            self.top_k = 8
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            self.check_grad_with_place(self.place, ['X'], 'Out')
+
+    class TestTopk1(TestXPUTopkOP):
+
+        def set_case(self):
+            self.variable_k = True
+            self.row = 100
+            self.top_k = 1
+
+    class TestTopk2(TestXPUTopkOP):
+
+        def set_case(self):
+            self.variable_k = False
+            self.row = 16
+            self.top_k = 256
+
+    class TestTopk3(TestXPUTopkOP):
+
+        def set_case(self):
+            self.variable_k = True
+            self.row = 10
+            self.top_k = 512
+
+    class TestTopk4(TestXPUTopkOP):
+
+        def set_case(self):
+            self.variable_k = False
+            self.row = 5
+            self.top_k = 511
+
+
+support_types = get_xpu_op_support_types('top_k')
+for stype in support_types:
+    create_test_class(globals(), XPUTestTopkOP, stype)
 
 if __name__ == "__main__":
     unittest.main()
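The `random_unique_float` helper and the UNIQUE-input comment in `setUp` above guard against a genuine ambiguity rather than an XPU bug: when duplicate values tie for a top-k slot, any permutation of the tied indices is a correct answer, so a bitwise comparison of indices against the NumPy reference can fail spuriously. A self-contained NumPy illustration of the hazard (independent of Paddle):

    import numpy as np

    row = np.array([3.0, 1.0, 3.0])
    print(np.sort(row)[::-1][:2])   # [3. 3.]  -- the top-2 *values* are unambiguous
    print(row.argsort()[::-1][:2])  # [2 0] here, but [0 2] is an equally valid index set

Generating unique inputs sidesteps the tie entirely, which is why the helper draws ten times as many candidates as needed and keeps only distinct values.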