diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 8021d2a6f80c6..28a9bdfc581bf 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
 if(NOT DEFINED XPU_BASE_URL)
   set(XPU_BASE_URL_WITHOUT_DATE
       "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220722")
+  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220727")
 else()
   set(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
@@ -19,7 +19,7 @@ endif()
 if(NOT DEFINED XPU_XDNN_BASE_URL)
   set(XPU_XDNN_BASE_URL_WITHOUT_DATE
       "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev")
-  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220722")
+  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220727")
 else()
   set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}")
 endif()
diff --git a/paddle/fluid/operators/top_k_op_xpu.cc b/paddle/fluid/operators/top_k_op_xpu.cc
index b4ae07a2bb551..9ffcd4d46fc1a 100644
--- a/paddle/fluid/operators/top_k_op_xpu.cc
+++ b/paddle/fluid/operators/top_k_op_xpu.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <memory>
 
 #include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
 #include "xpu/refactor/math.h"
 
 namespace paddle {
@@ -25,17 +26,26 @@ namespace operators {
 using Tensor = framework::Tensor;
 template <typename T>
 class TopkXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
-    auto* input = ctx.Input<Tensor>("X");
+    const auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     auto* indices = ctx.Output<Tensor>("Indices");
-    size_t k = static_cast<int>(ctx.Attr<int>("k"));
+
+    // get k from attr
+    int k = static_cast<int>(ctx.Attr<int>("k"));
+
+    // get k from input tensor
     auto* k_t = ctx.Input<Tensor>("K");
     if (k_t) {
-      k = k_t->data<int>()[0];
+      memory::Copy(platform::CPUPlace(),
+                   static_cast<void*>(&k),
+                   ctx.GetPlace(),
+                   static_cast<const void*>(k_t->data<int>()),
+                   sizeof(int));
       framework::DDim output_dims = output->dims();
       output_dims[output_dims.size() - 1] = k;
       output->Resize(output_dims);
@@ -44,43 +54,36 @@ class TopkXPUKernel : public framework::OpKernel<T> {
 
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-    Tensor indices_32_data_tensor;
-    int32_t* indices_int_data = indices_32_data_tensor.mutable_data<int32_t>(
-        ctx.GetPlace(), indices->numel());
+
+    auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
+    // allocate temp memory for int32 index
+    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+    int* indices_int_data = RAII_GUARD.alloc_l3_or_gm<int>(indices->numel());
+    PADDLE_ENFORCE_XDNN_NOT_NULL(indices_int_data);
+
     // reshape input to a flattern matrix(like flat_inner_dims)
     framework::DDim inputdims = input->dims();
     const size_t row =
         phi::product(phi::slice_ddim(inputdims, 0, inputdims.size() - 1));
     const size_t col = inputdims[inputdims.size() - 1];
-    auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
-    int ret = xpu::sorted_topk(dev_ctx.x_context(),
-                               input->data<T>(),
-                               output_data,
-                               indices_int_data,
-                               row,
-                               col,
-                               k);
-    PADDLE_ENFORCE_EQ(ret,
-                      XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU API return wrong value[%d] in call kernel name "
-                          "[%s], please check "
-                          "where Baidu Kunlun Card is properly installed.",
-                          ret,
-                          "sorted_topk"));
-    ret = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
-                                         (const int32_t*)indices_int_data,
-                                         indices_data,
-                                         indices->numel());
-    PADDLE_ENFORCE_EQ(ret,
-                      XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU API return wrong value[%d] in call kernel name "
-                          "[%s], please check "
-                          "where Baidu Kunlun Card is properly installed.",
-                          ret,
-                          "cast_v2"));
+    // int sorted_topk(Context* ctx, const T* x, T* y, int* index, int m, int n,
+    //                 int k, bool largest = true);
+    int r = xpu::sorted_topk(dev_ctx.x_context(),
+                             reinterpret_cast<const XPUType*>(input->data<T>()),
+                             reinterpret_cast<XPUType*>(output_data),
+                             indices_int_data,
+                             row,
+                             col,
+                             k);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");
+
+    // cast to int64 as final result
+    r = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
+                                       (const int32_t*)indices_int_data,
+                                       indices_data,
+                                       indices->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
   }
 };
 
@@ -88,5 +91,7 @@ class TopkXPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(top_k, ops::TopkXPUKernel<float>);
+REGISTER_OP_XPU_KERNEL(top_k,
+                       ops::TopkXPUKernel<float>,
+                       ops::TopkXPUKernel<paddle::platform::float16>);
 #endif
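A note on the kernel rewrite above: the old path read `k_t->data<int>()[0]` directly, which dereferences the `K` tensor's device pointer on the host; the new code stages the scalar back to `CPUPlace` via `memory::Copy` before resizing the outputs, and the hand-rolled `PADDLE_ENFORCE_EQ` checks are replaced by the `PADDLE_ENFORCE_XDNN_*` macros pulled in through `device_wrapper.h`. Below is a minimal usage sketch of the legacy `top_k` op this kernel serves; it is illustrative only and assumes an XPU build of Paddle with a Kunlun card attached (`fluid.layers.top_k` is the public wrapper that reaches this kernel):

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    main = fluid.Program()
    with fluid.program_guard(main):
        x = fluid.data(name="x", shape=[4, 16], dtype="float32")
        values, indices = fluid.layers.top_k(x, k=3)

    exe = fluid.Executor(paddle.XPUPlace(0))
    vals, idx = exe.run(main,
                        feed={"x": np.random.rand(4, 16).astype("float32")},
                        fetch_list=[values, indices])
    print(vals.shape, idx.dtype)  # (4, 3) int64 -- indices are cast from int32 on device
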
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index 1d4a5bf74b8df..56da75844e619 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -105,13 +105,9 @@ XPUOpMap& get_kl2_ops() {
     {"elementwise_add",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
-    {"elementwise_div_grad",
-     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"elementwise_div_grad",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
-    {"elementwise_div",
-     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"elementwise_div",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
@@ -495,6 +491,9 @@ XPUOpMap& get_kl2_ops() {
     {"transpose",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
+    {"top_k",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                   pOpKernelType(vartype::FP16, XPUPlace())})},
     {"top_k_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"unsqueeze2_grad",
      XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()),
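The `xpu2_op_list.h` entry above is what the Python test harness keys off: `get_xpu_op_support_types` derives an op's dtype list from this table. A quick way to confirm the registration took effect (a sketch, assuming it runs from `python/paddle/fluid/tests/unittests/xpu` so the same `sys.path` trick as the test applies; the printed list is an expectation based on the FP32/FP16 entries, not a captured output):

    import sys
    sys.path.append("..")
    from xpu.get_test_cover_info import get_xpu_op_support_types

    print(get_xpu_op_support_types('top_k'))  # expected on a KL2 build: ['float32', 'float16']
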
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
index 1fa4a5e8b7d00..5b2177cdbe1aa 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,62 +18,106 @@ import sys
 
 sys.path.append("..")
-from paddle.fluid.op import Operator
-import paddle.fluid.core as core
-import paddle.fluid as fluid
 import paddle
 from op_test import OpTest
+from op_test_xpu import XPUOpTest
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
 
 paddle.enable_static()
 
 
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestTopkOp(OpTest):
+def random_unique_float(row, k, dtype):
+    # create a random float array with 10x length
+    arr = np.random.uniform(-10.0, 10.0, int(row * k * 10)).astype(dtype)
+    arr = np.unique(arr)
+    assert arr.shape[
+        0] >= row * k, "failed to create enough unique values: %d vs %d" % (
+            arr.shape[0], row * k)
+    arr = arr[:row * k]
+    np.random.shuffle(arr)
+    arr = arr.reshape(row, k)
+    return arr
 
-    def setUp(self):
-        self.variable_k = False
-        self.use_xpu = True
-        self.set_args()
-        self.op_type = "top_k"
-        self.dtype = np.float32
-        self.init_dtype()
-        k = self.top_k
-        input = np.random.random((self.row, k)).astype(self.dtype)
-        output = np.ndarray((self.row, k))
-        indices = np.ndarray((self.row, k)).astype("int64")
-        self.inputs = {'X': input}
+class XPUTestTopkOP(XPUOpTestWrapper):
 
-        if self.variable_k:
-            self.inputs['K'] = np.array([k]).astype("int32")
-        else:
-            self.attrs = {'k': k}
+    def __init__(self):
+        self.op_name = 'top_k'
+        self.use_dynamic_create_class = False
 
-        for rowid in range(self.row):
-            row = input[rowid]
-            output[rowid] = np.sort(row)[::-1][:k]
-            indices[rowid] = row.argsort()[::-1][:k]
+    class TestXPUTopkOP(XPUOpTest):
 
-        self.outputs = {'Out': output, 'Indices': indices}
+        def setUp(self):
+            self.place = paddle.XPUPlace(0)
+            self.init_dtype()
+            self.op_type = 'top_k'
+            self.set_case()
 
-    def init_dtype(self):
-        self.dtype = np.float32
+            # generate UNIQUE float values as input, in order to prevent the
+            # following potential problem: x[i] and x[j] are IDENTICAL float
+            # values, the result of cpu index is [i, j] while the xpu result
+            # is [j, i]. Both of them are correct but diff in numpy compare.
+            k = self.top_k
+            input = random_unique_float(self.row, k, self.dtype)
+            output = np.ndarray((self.row, k))
+            indices = np.ndarray((self.row, k)).astype("int64")
+            self.inputs = {'X': input}
 
-    def set_args(self):
-        self.row = 100
-        self.top_k = 1
+            if self.variable_k:
+                self.inputs['K'] = np.array([k]).astype("int32")
+            else:
+                self.attrs = {'k': k}
 
-    def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
+            for rowid in range(self.row):
+                row = input[rowid]
+                output[rowid] = np.sort(row)[::-1][:k]
+                indices[rowid] = row.argsort()[::-1][:k]
 
-    def test_check_grad(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(place, ['X'], 'Out')
+            self.outputs = {'Out': output, 'Indices': indices}
 
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def set_case(self):
+            self.variable_k = False
+            self.row = 16
+            self.top_k = 8
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            self.check_grad_with_place(self.place, ['X'], 'Out')
+
+    class TestTopk1(TestXPUTopkOP):
+
+        def set_case(self):
+            self.variable_k = True
+            self.row = 100
+            self.top_k = 1
+
+    class TestTopk2(TestXPUTopkOP):
+
+        def set_case(self):
+            self.variable_k = False
+            self.row = 16
+            self.top_k = 256
+
+    class TestTopk3(TestXPUTopkOP):
+
+        def set_case(self):
+            self.variable_k = True
+            self.row = 10
+            self.top_k = 512
+
+    class TestTopk4(TestXPUTopkOP):
+
+        def set_case(self):
+            self.variable_k = False
+            self.row = 5
+            self.top_k = 511
+
+
+support_types = get_xpu_op_support_types('top_k')
+for stype in support_types:
+    create_test_class(globals(), XPUTestTopkOP, stype)
 
 if __name__ == "__main__":
     unittest.main()
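The `random_unique_float` helper and the UNIQUE-input comment in `setUp` above guard against a genuine ambiguity rather than an XPU bug: when duplicate values tie for a top-k slot, any permutation of the tied indices is a correct answer, so a bitwise comparison of indices against the NumPy reference can fail spuriously. A self-contained NumPy illustration of the hazard (independent of Paddle):

    import numpy as np

    row = np.array([3.0, 1.0, 3.0])
    print(np.sort(row)[::-1][:2])   # [3. 3.]  -- the top-2 *values* are unambiguous
    print(row.argsort()[::-1][:2])  # [2 0] here, but [0 2] is an equally valid index set

Generating unique inputs sidesteps the tie entirely, which is why the helper draws ten times as many candidates as needed and keeps only distinct values.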