From 15700b82dd9d2c2158c58431e6e8bd068fa1fcc6 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Thu, 11 Nov 2021 03:30:35 +0000
Subject: [PATCH 01/41] Add XPU compiler for paddle, test=develop

---
 CMakeLists.txt                                |  17 +
 cmake/configure.cmake                         |   5 +
 cmake/generic.cmake                           |  84 +++++
 cmake/hip.cmake                               |   3 +
 cmake/inference_lib.cmake                     |   7 +
 cmake/operators.cmake                         |  32 +-
 cmake/third_party.cmake                       |   6 +
 cmake/xpu2.cmake                              | 297 ++++++++++++++++++
 paddle/fluid/framework/CMakeLists.txt         |   8 +
 paddle/fluid/framework/op_registry.h          |   4 +
 .../elementwise/elementwise_add_op.h          |   1 +
 .../elementwise/elementwise_add_op.xpu        |  34 ++
 .../elementwise/elementwise_add_op_kps.cc     |  50 +++
 paddle/fluid/pybind/CMakeLists.txt            |   8 +
 14 files changed, 552 insertions(+), 4 deletions(-)
 create mode 100644 cmake/xpu2.cmake
 create mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op.xpu
 create mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 334a6cfcd0ee1..5ed04ed994e15 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,7 @@ option(WITH_ONEMKL      "Compile PaddlePaddle with oneMKL"              OFF)
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_TENSORRT    "Compile PaddlePaddle with NVIDIA TensorRT"     OFF)
 option(WITH_XPU         "Compile PaddlePaddle with BAIDU KUNLUN XPU"    OFF)
+option(WITH_XPU2         "Compile PaddlePaddle with BAIDU KUNLUN XPU2"    OFF)
 option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode"    OFF)
 option(WITH_ASCEND         "Compile PaddlePaddle with ASCEND"        OFF)
 option(WITH_ROCM        "Compile PaddlePaddle with ROCM platform"       OFF)
@@ -57,6 +58,9 @@ include(generic)            # simplify cmake module
 if (WITH_GPU  AND WITH_XPU)
     message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
 endif()
+if (WITH_GPU  AND WITH_XPU2)
+    message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time")
+endif()
 if (WITH_GPU AND WITH_ASCEND)
     message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
 endif()
@@ -265,6 +269,15 @@ if (NOT WITH_GPU AND WITH_NCCL)
         "Disable NCCL when compiling without GPU" FORCE)
 endif()
 
+# XPU XPU2 use BKCL
+# if (NOT (WITH_XPU OR WITH_XPU2) AND WITH_XPU_BKCL)
+#     MESSAGE(WARNING
+#         "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.")
+#     set(WITH_XPU_BKCL OFF CACHE STRING
+#         "Disable BKCL when compiling without XPU" FORCE)
+# endif()
+
+#
 if (NOT WITH_XPU AND WITH_XPU_BKCL)
     MESSAGE(WARNING
         "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.")
@@ -305,6 +318,10 @@ if(WITH_ROCM)
     include(miopen) # set miopen libraries, must before configure
 endif(WITH_ROCM)
 
+if(WITH_XPU2)
+    include(xpu2) # set XPU2 toolchain paths, kernel build macros and runtime libraries
+endif()
+
 if (NOT WITH_ROCM AND WITH_RCCL)
     MESSAGE(WARNING
         "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 7f737cc189510..671cddbeb7907 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -99,6 +99,11 @@ if(WITH_XPU)
     add_definitions(-DPADDLE_WITH_XPU)
 endif()
 
+if(WITH_XPU2)
+    message(STATUS "Compile with XPU2!")
+    add_definitions(-DPADDLE_WITH_XPU2)
+endif()
+
 if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
     add_definitions(-DEIGEN_USE_GPU)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 2004abcbfa1f2..21cf2c0bdc9ed 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -400,6 +400,10 @@ function(cc_binary TARGET_NAME)
   if(WITH_ROCM)
     target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
   endif()
+  if(WITH_XPU2)
+    target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_API_LIB})
+    target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_RT_LIB})
+  endif()
 
   check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS})
 
@@ -424,6 +428,10 @@ function(cc_test_build TARGET_NAME)
     if(WITH_ROCM)
       target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
     endif()
+    # added by lxd
+    # if(WITH_XPU2)
+    #   target_link_libraries(${TARGET_NAME} ${XPU2_CLANGRTC_LIB})
+    # endif()
     check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS})
   endif()
 endfunction()
@@ -654,6 +662,82 @@ function(hip_test TARGET_NAME)
   endif()
 endfunction(hip_test)
 
+function(xpu_library TARGET_NAME)
+  # Build a static library for the XPU2 backend; does nothing unless WITH_XPU2.
+  #   xpu_library(<name> [STATIC] SRCS <sources...> DEPS <targets...>)
+  # With SRCS, the sources go to xpu_add_library() and DEPS are linked in.
+  # With DEPS only, a dummy static library is generated to carry the deps.
+  # SHARED/shared is rejected, and passing neither SRCS nor DEPS is a
+  # configure-time error.
+  if (WITH_XPU2)
+    set(options STATIC static SHARED shared)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(xpu_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if(xpu_library_SRCS)
+      if (xpu_library_SHARED OR xpu_library_shared) # build *.so
+        message(FATAL_ERROR "XPU kernel currently does not support dynamic links")
+      else()
+        xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS})
+        find_fluid_modules(${TARGET_NAME})
+      endif()
+      if (xpu_library_DEPS)
+        add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})
+        target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS})
+      endif()
+    else(xpu_library_SRCS)
+      if (xpu_library_DEPS)
+        list(REMOVE_DUPLICATES xpu_library_DEPS)
+        # Name the dummy source explicitly; target_SRCS was never set in this function.
+        generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c GENERATOR "generic.cmake:xpu_library")
+        target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS})
+        add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})
+      else()
+        # FATAL_ERROR is the mode that aborts configuration; "FATAL" is not a message mode.
+        message(FATAL_ERROR "Please specify source file or library in xpu_library.")
+      endif()
+    endif(xpu_library_SRCS)
+  endif()
+endfunction(xpu_library)
+
+function(xpu_binary TARGET_NAME)  # xpu_binary(<name> SRCS <sources...> DEPS <targets...>)
+  if(WITH_XPU2)
+    set(multiValueArgs SRCS DEPS)
+    set(oneValueArgs "")
+    set(options "")
+    cmake_parse_arguments(xpu_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    add_executable(${TARGET_NAME} ${xpu_binary_SRCS})
+    if(xpu_binary_DEPS)  # link, ordering and common_link() apply only when DEPS is given
+      add_dependencies(${TARGET_NAME} ${xpu_binary_DEPS})
+      target_link_libraries(${TARGET_NAME} ${xpu_binary_DEPS})
+      common_link(${TARGET_NAME})
+    endif()
+  endif()
+endfunction()
+
+function(xpu_test TARGET_NAME)
+  # Gtest binary for XPU2; built only when WITH_XPU2 and WITH_TESTING are on and CI
+  # has not set CI_SKIP_CPP_TEST=ON (which it does when only *.py files changed).
+  if (WITH_XPU2 AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(xpu_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    add_executable(${TARGET_NAME} ${xpu_test_SRCS})
+    # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE
+    target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt)
+    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+    target_link_libraries(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
+    add_dependencies(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
+    add_test(${TARGET_NAME} ${TARGET_NAME})
+    # All three flags go in one call: a later set_property() would replace ENVIRONMENT.
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+      FLAGS_cpu_deterministic=true FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true)
+  endif()
+endfunction(xpu_test)
+
 function(go_library TARGET_NAME)
   set(options STATIC static SHARED shared)
   set(oneValueArgs "")
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 514f5ea9deaa3..f9c90c58c2adf 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -13,11 +13,13 @@ else()
 endif()
 set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
 
+# Find the compiler environment that the build depends on
 find_package(HIP REQUIRED)
 include_directories(${ROCM_PATH}/include)
 message(STATUS "HIP version: ${HIP_VERSION}")
 message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}")
 
+# Add the related environment (find each package and add its include directory)
 macro(find_package_and_include PACKAGE_NAME)
   find_package("${PACKAGE_NAME}" REQUIRED)
   include_directories("${ROCM_PATH}/${PACKAGE_NAME}/include")
@@ -83,6 +85,7 @@ endif()
 message(STATUS "HIP library name: ${hip_library_name}")
 
 # set HIP link libs
+# Find the library named hip_hcc or amdhip64 under the hinted path
 find_library(ROCM_HIPRTC_LIB ${hip_library_name} HINTS ${HIP_PATH}/lib)
 message(STATUS "ROCM_HIPRTC_LIB: ${ROCM_HIPRTC_LIB}")
 
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index e5a7f0d2bef54..5e418d3e33c7d 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -181,6 +181,13 @@ IF(WITH_XPU)
         DSTS ${dst_dir} ${dst_dir})
 ENDIF()
 
+# IF(WITH_XPU2)
+#     set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/install/xpu2")
+#     copy(inference_lib_dist
+#         SRCS ${XPU2_INC_DIR} ${XPU2_LIB_DIR}
+#         DSTS ${dst_dir} ${dst_dir})
+# ENDIF()
+
 # CMakeCache Info
 copy(inference_lib_dist
         SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index a537719cc7582..47181fada781a 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -11,6 +11,7 @@ function(op_library TARGET)
     set(cu_cc_srcs)
     set(hip_cc_srcs)
     set(xpu_cc_srcs)
+    set(xpu2_cc_srcs)
     set(npu_cc_srcs)
     set(cudnn_cu_cc_srcs)
     set(miopen_cu_cc_srcs)
@@ -59,6 +60,7 @@ function(op_library TARGET)
             if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu)
                 list(APPEND cudnn_cu_srcs ${CUDNN_FILE}.cu)
             endif()
+            # TODO(liuxiandong) add .kps file
         endif()
         if(WITH_ROCM)
             if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
@@ -92,6 +94,16 @@ function(op_library TARGET)
                 list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
             endif()
         endif()
+        if(WITH_XPU2)
+            # TODO(liuxiandong) xpu->kps
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu)
+                list(APPEND xpu2_cc_srcs ${TARGET}.xpu)
+            endif()
+            string(REPLACE "_op" "_op_kps" XPU2_FILE "${TARGET}")
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU2_FILE}.cc)
+                list(APPEND xpu2_cc_srcs ${XPU2_FILE}.cc)
+            endif()
+        endif()
         if(WITH_ASCEND_CL)
             string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
             if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
@@ -120,6 +132,10 @@ function(op_library TARGET)
                 list(APPEND cu_cc_srcs ${src})
             elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
                 list(APPEND xpu_cc_srcs ${src})
+            elseif(WITH_XPU2 AND ${src} MATCHES ".*_op_kps.cc$")
+                list(APPEND xpu2_cc_srcs ${src})
+            elseif(WITH_XPU2 AND ${src} MATCHES ".*\\.xpu$")
+                list(APPEND xpu2_cc_srcs ${src})
             elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
                 list(APPEND npu_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cc$")
@@ -192,11 +208,13 @@ function(op_library TARGET)
         list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
         hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
+    elseif (WITH_XPU2)
+        xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu2_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps})
     else()
         # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
         if(WITH_UNITY_BUILD AND op_library_UNITY)
             # Combine the cc source files.
-            compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs})
+            compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu2_cc_srcs} ${npu_cc_srcs})
             if(TARGET ${UNITY_TARGET})
                 # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
                 target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
@@ -207,7 +225,7 @@ function(op_library TARGET)
             # Add alias library to handle dependencies.
             add_library(${TARGET} ALIAS ${UNITY_TARGET})
         else()
-            cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS}
+            cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu2_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
         endif()
     endif()
@@ -260,10 +278,12 @@ function(op_library TARGET)
     list(LENGTH hip_cc_srcs hip_cc_srcs_len)
     list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
     list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
+    list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
     list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
     list(LENGTH npu_cc_srcs npu_cc_srcs_len)
     if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
-        ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0)
+        ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND 
+        ${xpu_cc_srcs_len} EQUAL 0 AND ${xpu2_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0)
         file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
         set(pybind_flag 1)
     endif()
@@ -304,6 +324,10 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
+    if (WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
+    endif()
+
     if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
         file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT)
         # It is different from the logic above, becareful
@@ -362,7 +386,6 @@ function(op_library TARGET)
     endif()
 endfunction()
 
-
 function(register_operators)
     set(options "")
     set(oneValueArgs "")
@@ -373,6 +396,7 @@ function(register_operators)
     file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
     string(REPLACE "_mkldnn" "" OPS "${OPS}")
     string(REPLACE "_xpu" "" OPS "${OPS}")
+    string(REPLACE "_kps" "" OPS "${OPS}")
     string(REPLACE "_npu" "" OPS "${OPS}")
     string(REPLACE ".cc" "" OPS "${OPS}")
     list(REMOVE_DUPLICATES OPS)
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index f2efc974073e5..65de76d2ea637 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -278,6 +278,12 @@ if(WITH_XPU)
     list(APPEND third_party_deps extern_xpu)
 endif(WITH_XPU)
 
+# added by lxd: XPU2 third-party download is not wired up yet (placeholder)
+if(WITH_XPU2)
+    #include(external/xpu2)          # download, build, install xpu
+    #list(APPEND third_party_deps extern_xpu)
+endif()
+
 if(WITH_PSLIB)
     include(external/pslib)          # download, build, install pslib
     list(APPEND third_party_deps extern_pslib)
diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
new file mode 100644
index 0000000000000..62754f41cb0d3
--- /dev/null
+++ b/cmake/xpu2.cmake
@@ -0,0 +1,297 @@
+if(NOT WITH_XPU2)
+    return()
+endif()
+
+# Locate the XTDK toolchain; the built-in default path is resolved through symlinks.
+if(NOT XPU_TOOLCHAIN)
+  get_filename_component(XPU_TOOLCHAIN /workspace/paddle/xpu-demo/XTDK REALPATH)
+endif()
+if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN})
+  message(FATAL_ERROR "Directory ${XPU_TOOLCHAIN} not found!")
+endif()
+set(XPU_CLANG ${XPU_TOOLCHAIN}/bin/clang)
+message(STATUS "Build with XPU_TOOLCHAIN=" ${XPU_TOOLCHAIN})
+message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG})
+
+# Host sysroot: use the caller's HOST_SYSROOT if given; the directory must exist.
+if(NOT HOST_SYSROOT)
+  set(HOST_SYSROOT /opt/compiler/gcc-8.2)
+endif()
+if(NOT IS_DIRECTORY ${HOST_SYSROOT})
+  message(FATAL_ERROR "Directory ${HOST_SYSROOT} not found!")
+endif()
+
+if(NOT API_ARCH)
+  set(API_ARCH x86_64-baidu-linux-gnu)
+endif()
+
+# Pick the host C++ compiler and archiver: on x86_64 prefer the sysroot's g++
+# and fall back to the system one; elsewhere reuse CMake's configured tools.
+if(NOT API_ARCH MATCHES "x86_64")
+  set(HOST_CXX ${CMAKE_CXX_COMPILER})
+  set(HOST_AR ${CMAKE_AR})
+elseif(EXISTS ${HOST_SYSROOT}/bin/g++)
+  set(HOST_CXX ${HOST_SYSROOT}/bin/g++)
+  set(HOST_AR ${HOST_SYSROOT}/bin/ar)
+else()
+  set(HOST_CXX /usr/bin/g++)
+  set(HOST_AR /usr/bin/ar)
+endif()
+
+set(TOOLCHAIN_ARGS )
+# aarch64: record a --gcc-toolchain argument, then re-point HOST_SYSROOT at its aarch64-linux-gnu/libc subdirectory.
+if(API_ARCH MATCHES "aarch64")
+  set(TOOLCHAIN_ARGS --gcc-toolchain=${HOST_SYSROOT} )
+  set(HOST_SYSROOT ${HOST_SYSROOT}/aarch64-linux-gnu/libc )
+endif()
+# sw_64: take the prefixed g++/ar and the C++ header directories from under HOST_SYSROOT/usr.
+if(API_ARCH MATCHES "sw_64")
+  set(SW_API_LINK_FLAGS --sysroot=${HOST_SYSROOT} )
+  if(NOT SW_PREFIX)
+   set(SW_PREFIX sw_64sw6-sunway-linux-gnu)
+  endif()
+  if(NOT GCC_VERSION)
+   set(GCC_VERSION 8.3.0)
+  endif()
+
+  set(XPU_MF_FLAGS -isystem ${HOST_SYSROOT}/usr/include/c++/${GCC_VERSION} -isystem ${HOST_SYSROOT}/usr/include/c++/${GCC_VERSION}/${SW_PREFIX})
+  set(HOST_XPU_FLAGS -fuse-as=${SW_PREFIX}-gcc -isystem ${HOST_SYSROOT}/usr/include/c++/${GCC_VERSION} -isystem ${HOST_SYSROOT}/usr/include/c++/${GCC_VERSION}/${SW_PREFIX})
+  set(HOST_CXX ${HOST_SYSROOT}/usr/bin/${SW_PREFIX}-g++)
+  set(HOST_AR ${HOST_SYSROOT}/usr/bin/${SW_PREFIX}-ar)
+endif()
+
+# Kernel optimisation level: a caller-supplied OPT_LEVEL is kept as is,
+# otherwise it defaults to -O2.
+if(NOT OPT_LEVEL)
+  set(OPT_LEVEL "-O2")
+endif()
+
+# Echo the resolved architecture and host tool settings into the configure
+# log, one "Build with <NAME>=<value>" line per variable.
+foreach(xpu2_setting API_ARCH TOOLCHAIN_ARGS HOST_SYSROOT HOST_CXX HOST_AR)
+  message(STATUS "Build with ${xpu2_setting}=" ${${xpu2_setting}})
+endforeach()
+
+macro(compile_kernel kernel_path kernel_name rule device_o_extra_flags host_o_extra_flags xpu_1_or_2)
+  # Compile the kernel source kernel_path into two objects under kernel_build/:
+  #   <kernel_name>.bin.o  - device code (--xpu-device-only, then xpu2-elfconv)
+  #   <kernel_name>.host.o - host code (--xpu-host-only)
+  # Both object paths are appended to xpu_kernel_depends; because this is a
+  # macro, that list lives in the caller's scope. rule and the two
+  # *_extra_flags are split into arg_* lists but are not passed to the compiler
+  # yet, and xpu_1_or_2 is currently unused.
+  set(kernel_target ${kernel_name}_kernel)
+  add_custom_target(${kernel_target}
+    WORKING_DIRECTORY
+      ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS
+      kernel_build/${kernel_name}.host.o
+      kernel_build/${kernel_name}.bin.o
+    COMMENT
+      ${kernel_target}
+    VERBATIM
+    )
+
+  set(arg_rule ${rule})
+  separate_arguments(arg_rule)
+  set(arg_device_o_extra_flags ${device_o_extra_flags})
+  separate_arguments(arg_device_o_extra_flags)
+  set(arg_host_o_extra_flags ${host_o_extra_flags})
+  separate_arguments(arg_host_o_extra_flags)
+
+  set(XTDK_DIR ${XPU_TOOLCHAIN})
+  set(CXX_DIR ${HOST_SYSROOT})
+
+  # Device object. The source comes from kernel_path (it used to be a hard-coded
+  # absolute path) and is listed in DEPENDS so that edits trigger a rebuild.
+  add_custom_command(
+    OUTPUT
+      kernel_build/${kernel_name}.bin.o
+    COMMAND
+      ${CMAKE_COMMAND} -E make_directory kernel_build
+    COMMAND
+    # TODO(liuxiandong) xpu->kps
+    ${XPU_CLANG} --sysroot=${CXX_DIR}  -std=c++11 -O2 -fno-builtin -g -mcpu=xpu2
+        -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}
+        --xpu-device-only -c -v
+    COMMAND
+      ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec  kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
+    WORKING_DIRECTORY
+      ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS
+      ${kernel_path}
+    COMMENT
+      kernel_build/${kernel_name}.bin.o
+    VERBATIM
+    )
+    list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o)
+
+  # Host object, built from the same source.
+  add_custom_command(
+    OUTPUT
+      kernel_build/${kernel_name}.host.o
+    COMMAND
+      ${CMAKE_COMMAND} -E make_directory kernel_build
+    COMMAND
+    # TODO(liuxiandong) xpu->kps
+    ${XPU_CLANG} --sysroot=${CXX_DIR}  -std=c++11 -O2 -fno-builtin -g -mcpu=xpu2
+        -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.host.o ${kernel_path}
+        --xpu-host-only -c -v
+    WORKING_DIRECTORY
+      ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS
+      ${kernel_path}
+    COMMENT
+      kernel_build/${kernel_name}.host.o
+    VERBATIM
+    )
+    list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o)
+endmacro()
+
+macro(__compile_kernel_with_rules kernel_path kernel_name rules_path device_o_extra_flags host_o_extra_flags xpu_1_or_2)
+  file(STRINGS ${rules_path} rules)
+  foreach(rule IN LISTS rules)
+    message(STATUS "  Instantiate with '${rule}'")
+    # Derive a short per-rule suffix (first 6 hex digits of the rule's MD5) so
+    # that every instantiation gets its own kernel name. Hashing inside CMake
+    # avoids pasting the rule text into a bash command line, where shell
+    # metacharacters from the rules file would be interpreted at configure
+    # time, and removes the dependency on bash/md5sum/cut.
+    string(MD5 rule_md5 "${rule}")
+    string(SUBSTRING "${rule_md5}" 0 6 rule_md5)
+
+    set(kernel_name_md5 ${kernel_name}_${rule_md5})
+    compile_kernel(${kernel_path} ${kernel_name_md5} ${rule} ${device_o_extra_flags} ${host_o_extra_flags} ${xpu_1_or_2})
+  endforeach()
+endmacro()
+
+macro(compile_kernel_with_rules kernel_path kernel_name rules_path device_o_extra_flags host_o_extra_flags xpu_1_or_2)
+  # Register rules_path as a configure dependency so CMake re-runs when it changes, then instantiate per rule.
+  set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${rules_path})
+  __compile_kernel_with_rules(${kernel_path} ${kernel_name} ${rules_path} ${device_o_extra_flags} ${host_o_extra_flags} ${xpu_1_or_2})
+endmacro()
+
+###############################################################################
+# XPU_ADD_LIBRARY
+###############################################################################
+macro(xpu_add_library TARGET_NAME)
+    # Usage: xpu_add_library(<name> STATIC <sources...>)
+    # *.xpu sources are compiled by the XPU toolchain and archived into
+    # lib<name>_xpu.a; all other sources build the static library <name>.
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs STATIC)
+    cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(xpu_srcs ${xpu_add_library_STATIC})
+    set(xpu_target ${TARGET_NAME})
+    file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs})
+    list(LENGTH xpu_srcs_lists xpu_srcs_lists_num)
+
+    set(XPU1_DEVICE_O_EXTRA_FLAGS " ")
+    set(XPU1_HOST_O_EXTRA_FLAGS " ")
+
+    # Distinguish .xpu file from other files
+    foreach(cur_xpu_src IN LISTS xpu_srcs_lists)
+      get_filename_component(language_type_name ${cur_xpu_src} EXT)
+      # TODO(liuxiandong) xpu->kps
+      if("${language_type_name}" STREQUAL ".xpu")  # quoted: EXT is empty for extension-less files
+        list(APPEND xpu_kernel_lists ${cur_xpu_src})
+      else()
+        list(APPEND cc_kernel_lists ${cur_xpu_src})
+      endif()
+    endforeach()
+
+    # Count the .xpu kernels; each one found is compiled in the loop below
+    list(LENGTH xpu_kernel_lists xpu_kernel_lists_num)
+    if(${xpu_kernel_lists_num})
+        foreach(xpu_kernel IN LISTS xpu_kernel_lists)
+            message(STATUS "Process ${xpu_kernel}")
+            get_filename_component(kernel_name ${xpu_kernel} NAME_WE)
+            get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY)
+            #TODO(liuxiandong set default rules)
+            set(kernel_rules ${kernel_dir}/${kernel_name}.rules)
+            if(EXISTS ${kernel_rules})
+                compile_kernel_with_rules(${xpu_kernel} ${kernel_name} ${kernel_rules}
+                    ${XPU1_DEVICE_O_EXTRA_FLAGS} ${XPU1_HOST_O_EXTRA_FLAGS} "xpu2")
+            else()
+                compile_kernel(${xpu_kernel} ${kernel_name} " "
+                    ${XPU1_DEVICE_O_EXTRA_FLAGS} ${XPU1_HOST_O_EXTRA_FLAGS} "xpu2")
+            endif()
+        endforeach()
+
+        add_custom_target(${xpu_target}_src ALL
+            WORKING_DIRECTORY
+                ${CMAKE_CURRENT_BINARY_DIR}
+            DEPENDS
+                ${xpu_kernel_depends}
+                ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+            COMMENT
+                ${xpu_target}_src
+            VERBATIM
+            )
+
+        add_custom_command(
+            OUTPUT
+            ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+            COMMAND
+                ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends}
+            WORKING_DIRECTORY
+                ${CMAKE_CURRENT_BINARY_DIR}
+            DEPENDS
+                ${xpu_kernel_depends}
+            COMMENT
+                ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+            VERBATIM
+            )
+        add_library(${xpu_target} STATIC ${cc_kernel_lists})
+        target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
+        # The archive is produced by the custom target; build it before this library.
+        add_dependencies(${xpu_target} ${xpu_target}_src)
+    else()
+        add_library(${xpu_target} STATIC ${cc_kernel_lists})
+    endif()
+endmacro()
+
+# XPU2 install location: $XPU2_PATH from the environment if set, else the default.
+if(NOT DEFINED ENV{XPU2_PATH})
+    set(XPU2_PATH "/workspace/paddle/xpu-demo/XTDK" CACHE PATH "Path to which XPU2 has been installed")
+else()
+    set(XPU2_PATH $ENV{XPU2_PATH} CACHE PATH "Path to which XPU2 has been installed")
+endif()
+set(XPU_CLANG_PATH ${XPU2_PATH}/bin/clang CACHE PATH "Path to which XPU2 CLANG has been installed")
+# XPU2_CLANG_PATH was never defined (it expanded to "/cmake"); presumably XPU2_PATH was meant - confirm.
+set(CMAKE_MODULE_PATH "${XPU2_PATH}/cmake" ${CMAKE_MODULE_PATH})
+
+# define XPU_CFLAGS; each list item is one compiler argument, so --sysroot=<dir> must be a single token
+list(APPEND XPU_CFLAGS -fPIC)
+list(APPEND XPU_CFLAGS --sysroot=/opt/compiler/gcc-8.2)
+list(APPEND XPU_CFLAGS -std=c++11)
+list(APPEND XPU_CFLAGS -O2)
+list(APPEND XPU_CFLAGS -g)
+list(APPEND XPU_CFLAGS -mcpu=xpu2)
+list(APPEND XPU_CFLAGS --target=x86_64-linux-gnu)
+list(APPEND XPU_CFLAGS -v)
+list(APPEND XPU_CFLAGS --dyld-prefix=/opt/compiler/gcc-8.2)
+list(APPEND XPU_CFLAGS -fno-builtin)
+list(APPEND XPU_CFLAGS -Wno-dev)
+
+set(XPU_XPUCC_FLAGS ${XPU_CFLAGS})
+
+# set XPU2 link libs: locate the xpuapi library under ${XPU2_PATH}/shlib
+set(xpuapi_library_name xpuapi)
+message(STATUS "XPU API library name: ${xpuapi_library_name}")
+# XPU2_CLANG_API_LIB is linked into executables by cc_binary() in generic.cmake
+find_library(XPU2_CLANG_API_LIB ${xpuapi_library_name} HINTS ${XPU2_PATH}/shlib)
+message(STATUS "XPU2_CLANG_API_LIB: ${XPU2_CLANG_API_LIB}")
+
+set(xpurt_library_name xpurt)
+message(STATUS "XPU RT library name: ${xpurt_library_name}")
+# XPU2_CLANG_RT_LIB is linked into executables by cc_binary() in generic.cmake
+find_library(XPU2_CLANG_RT_LIB ${xpurt_library_name} HINTS ${XPU2_PATH}/runtime/shlib)
+message(STATUS "XPU2_CLANG_RT_LIB: ${XPU2_CLANG_RT_LIB}")
+
+# # Ensure that xpu/api.h can be included without dependency errors.
+# file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")
+# add_library(xpu_headers_dummy STATIC ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc)
+# add_dependencies(xpu_headers_dummy extern_xpu)
+# link_libraries(xpu_headers_dummy)
\ No newline at end of file
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 1b45193e3d0a3..edf1b922694e3 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -205,6 +205,14 @@ cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope gl
     pten pten_utils kernel_factory)
 ENDIF()
 
+# IF(WITH_XPU2)
+# cc_library(operator SRCS operator.cc DEPS xpu2_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
+#     shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
+# ELSE()
+# cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
+#     shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
+# ENDIF()
+
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 cc_test(operator_exception_test SRCS operator_exception_test.cc DEPS operator op_registry device_context)
 
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 348ca5b952bfe..1a382ba1ae7d1 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -330,6 +330,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 #define REGISTER_OP_XPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
 
+// XPU2 kernels register under the same XPU library type and XPUPlace as REGISTER_OP_XPU_KERNEL.
+#define REGISTER_OP_XPU2_KERNEL(op_type, ...) \
+  REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
+
 #define REGISTER_OP_NPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index f0292ffe17869..dc618d13888b7 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -58,6 +58,7 @@ template <typename DeviceContext, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    std::cout << "lxd_debug: element_add in CPU !" << std::endl;
     auto *x = ctx.Input<framework::LoDTensor>("X");
     auto *y = ctx.Input<framework::LoDTensor>("Y");
     auto *z = ctx.Output<framework::LoDTensor>("Out");
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
new file mode 100644
index 0000000000000..214f112e4a00c
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
@@ -0,0 +1,34 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU2
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+
+namespace paddle {
+namespace operators {
+
+template<int VecSize, typename InT, typename OutT, typename Functor >
+__global__ void elementwise(InT *in0,
+                            InT *in1, OutT *out,
+                            int size, Functor func) {
+  // TODO: kernel body not implemented yet; this is an empty placeholder.
+}
+
+}  // namespace operators
+}  // namespace paddle
+#endif
\ No newline at end of file
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
new file mode 100644
index 0000000000000..07755a77f4276
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
@@ -0,0 +1,50 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU2
+#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class ElementwiseAddXPU2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    std::cout << "lxd_debug: XPU2 forward element_add !" << std::endl;
+  }
+};
+
+template <typename T>
+class ElementwiseAddGradXPU2Kernel : public ElemwiseGradKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    std::cout << "lxd_debug: XPU2 backward element_add !" << std::endl;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU2_KERNEL(elementwise_add, ops::ElementwiseAddXPU2Kernel<float>);
+
+REGISTER_OP_XPU2_KERNEL(elementwise_add_grad,
+                        ops::ElementwiseAddGradXPU2Kernel<float>);
+#endif
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 47027d69f82e0..a62fbcf990833 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -126,6 +126,10 @@ if(WITH_PYTHON)
     target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB})
   endif()
 
+  # if(WITH_XPU2)
+  #   target_link_libraries(op_function_generator ${XPU2_CLANGRTC_LIB})
+  # endif()
+
   set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
   set(tmp_impl_file ${impl_file}.tmp)
 
@@ -217,6 +221,10 @@ if(WITH_PYTHON)
     target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
   endif()
 
+  # if(WITH_XPU2)
+  #   target_link_libraries(paddle_pybind ${XPU2_CLANGRTC_LIB})
+  # endif()
+
   get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(paddle_pybind ${os_dependency_modules})
   add_dependencies(paddle_pybind op_function_generator_cmd)

From 3ce9a61d8fdfd997eeed51a822e995a168c9e31d Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Thu, 11 Nov 2021 09:14:40 +0000
Subject: [PATCH 02/41] clean code: drop aarch64/sw_64 toolchain setup from xpu2.cmake

---
 cmake/xpu2.cmake | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
index 62754f41cb0d3..ecb5ecfd483af 100644
--- a/cmake/xpu2.cmake
+++ b/cmake/xpu2.cmake
@@ -40,26 +40,6 @@ endif()
 
 set(TOOLCHAIN_ARGS )
 
-if(API_ARCH MATCHES "aarch64")
-  set(TOOLCHAIN_ARGS --gcc-toolchain=${HOST_SYSROOT} )
-  set(HOST_SYSROOT ${HOST_SYSROOT}/aarch64-linux-gnu/libc )
-endif()
-
-if(API_ARCH MATCHES "sw_64")
-  set(SW_API_LINK_FLAGS --sysroot=${HOST_SYSROOT} )
-  if(NOT SW_PREFIX)
-   set(SW_PREFIX sw_64sw6-sunway-linux-gnu)
-  endif()
-  if(NOT GCC_VERSION)
-   set(GCC_VERSION 8.3.0)
-  endif()
-
-  set(XPU_MF_FLAGS -isystem ${HOST_SYSROOT}/usr/include/c++/${GCC_VERSION} -isystem ${HOST_SYSROOT}/usr/include/c++/${GCC_VERSION}/${SW_PREFIX})
-  set(HOST_XPU_FLAGS -fuse-as=${SW_PREFIX}-gcc -isystem ${HOST_SYSROOT}/usr/include/c++/${GCC_VERSION} -isystem ${HOST_SYSROOT}/usr/include/c++/${GCC_VERSION}/${SW_PREFIX})
-  set(HOST_CXX ${HOST_SYSROOT}/usr/bin/${SW_PREFIX}-g++)
-  set(HOST_AR ${HOST_SYSROOT}/usr/bin/${SW_PREFIX}-ar)
-endif()
-
 if(OPT_LEVEL)
   set(OPT_LEVEL ${OPT_LEVEL})
 else()

From b90bc3c696b9aacbeabe534e4add79621844efad Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Thu, 11 Nov 2021 11:01:06 +0000
Subject: [PATCH 03/41] clean useless code: drop dead comments and commented-out WITH_XPU2 blocks

---
 cmake/hip.cmake                    | 1 -
 cmake/inference_lib.cmake          | 7 -------
 paddle/fluid/pybind/CMakeLists.txt | 3 ---
 3 files changed, 11 deletions(-)

diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index f9c90c58c2adf..bd2e251004e7a 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -85,7 +85,6 @@ endif()
 message(STATUS "HIP library name: ${hip_library_name}")
 
 # set HIP link libs
-# 从后面找到叫hip_hcc或者amdhip64的库
 find_library(ROCM_HIPRTC_LIB ${hip_library_name} HINTS ${HIP_PATH}/lib)
 message(STATUS "ROCM_HIPRTC_LIB: ${ROCM_HIPRTC_LIB}")
 
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 5e418d3e33c7d..e5a7f0d2bef54 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -181,13 +181,6 @@ IF(WITH_XPU)
         DSTS ${dst_dir} ${dst_dir})
 ENDIF()
 
-# IF(WITH_XPU2)
-#     set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/install/xpu2")
-#     copy(inference_lib_dist
-#         SRCS ${XPU2_INC_DIR} ${XPU2_LIB_DIR}
-#         DSTS ${dst_dir} ${dst_dir})
-# ENDIF()
-
 # CMakeCache Info
 copy(inference_lib_dist
         SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index a62fbcf990833..27ffa589703cc 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -126,9 +126,6 @@ if(WITH_PYTHON)
     target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB})
   endif()
 
-  # if(WITH_XPU2)
-  #   target_link_libraries(op_function_generator ${XPU2_CLANGRTC_LIB})
-  # endif()
 
   set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
   set(tmp_impl_file ${impl_file}.tmp)

From 9ed1f1bb252068c2625b4729a502a1f027baa1db Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Thu, 11 Nov 2021 11:02:15 +0000
Subject: [PATCH 04/41] clean useless code: drop commented-out operator cc_library block

---
 paddle/fluid/framework/CMakeLists.txt | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index edf1b922694e3..1b45193e3d0a3 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -205,14 +205,6 @@ cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope gl
     pten pten_utils kernel_factory)
 ENDIF()
 
-# IF(WITH_XPU2)
-# cc_library(operator SRCS operator.cc DEPS xpu2_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-#     shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
-# ELSE()
-# cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-#     shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
-# ENDIF()
-
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 cc_test(operator_exception_test SRCS operator_exception_test.cc DEPS operator op_registry device_context)
 

From 99ddb5ed2c7f71aabc53cc2024890d5c7bf45962 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Thu, 11 Nov 2021 11:07:23 +0000
Subject: [PATCH 05/41] clean useless code: drop leftover comments and empty WITH_XPU2 stub

---
 cmake/hip.cmake                    | 2 --
 cmake/third_party.cmake            | 6 ------
 paddle/fluid/pybind/CMakeLists.txt | 5 -----
 3 files changed, 13 deletions(-)

diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index bd2e251004e7a..514f5ea9deaa3 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -13,13 +13,11 @@ else()
 endif()
 set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
 
-#找到相关的编译器依赖环境
 find_package(HIP REQUIRED)
 include_directories(${ROCM_PATH}/include)
 message(STATUS "HIP version: ${HIP_VERSION}")
 message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}")
 
-# 添加相关环境
 macro(find_package_and_include PACKAGE_NAME)
   find_package("${PACKAGE_NAME}" REQUIRED)
   include_directories("${ROCM_PATH}/${PACKAGE_NAME}/include")
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 65de76d2ea637..f2efc974073e5 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -278,12 +278,6 @@ if(WITH_XPU)
     list(APPEND third_party_deps extern_xpu)
 endif(WITH_XPU)
 
-# added bt lxd 
-if(WITH_XPU2)
-    #include(external/xpu2)          # download, build, install xpu
-    #list(APPEND third_party_deps extern_xpu)
-endif(WITH_XPU)
-
 if(WITH_PSLIB)
     include(external/pslib)          # download, build, install pslib
     list(APPEND third_party_deps extern_pslib)
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 27ffa589703cc..47027d69f82e0 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -126,7 +126,6 @@ if(WITH_PYTHON)
     target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB})
   endif()
 
-
   set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
   set(tmp_impl_file ${impl_file}.tmp)
 
@@ -218,10 +217,6 @@ if(WITH_PYTHON)
     target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
   endif()
 
-  # if(WITH_XPU2)
-  #   target_link_libraries(paddle_pybind ${XPU2_CLANGRTC_LIB})
-  # endif()
-
   get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(paddle_pybind ${os_dependency_modules})
   add_dependencies(paddle_pybind op_function_generator_cmd)

From 163f92012bf89e20f88330b792602f87424d6bb2 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Tue, 16 Nov 2021 10:46:41 +0000
Subject: [PATCH 06/41] test: whitespace-only change in cmake/xpu2.cmake

---
 cmake/xpu2.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
index ecb5ecfd483af..efa9c85a528a7 100644
--- a/cmake/xpu2.cmake
+++ b/cmake/xpu2.cmake
@@ -74,6 +74,7 @@ macro(compile_kernel kernel_path kernel_name rule device_o_extra_flags host_o_ex
 
   set(XTDK_DIR ${XPU_TOOLCHAIN})
   set(CXX_DIR ${HOST_SYSROOT})
+  
 
   add_custom_command(
     OUTPUT

From 46bd3cb70d87230448aab1a29b107ab52ccfa1aa Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Tue, 16 Nov 2021 11:14:01 +0000
Subject: [PATCH 07/41] add include path

---
 cmake/generic.cmake                           |   6 +-
 cmake/xpu2.cmake                              | 127 +++++++++++-------
 .../elementwise/elementwise_add_op.xpu        |  49 +++++--
 3 files changed, 121 insertions(+), 61 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 21cf2c0bdc9ed..f5bc16021cef9 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -401,8 +401,8 @@ function(cc_binary TARGET_NAME)
     target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
   endif()
   if(WITH_XPU2)
-    target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_API_LIB})
-    target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_RT_LIB})
+    #target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_API_LIB})
+    #target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_RT_LIB})
   endif()
 
   check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS})
@@ -674,7 +674,7 @@ function(xpu_library TARGET_NAME)
         # xpu_add_library(${TARGET_NAME} SHARED ${xpu_library_SRCS})
         message(FATAL_ERROR "XPU kernel currently does not support dynamic links")
       else()
-        xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS})
+        xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS})
         find_fluid_modules(${TARGET_NAME})
       endif()
       if (xpu_library_DEPS)
diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
index efa9c85a528a7..5d2f9bebd1a3d 100644
--- a/cmake/xpu2.cmake
+++ b/cmake/xpu2.cmake
@@ -52,7 +52,19 @@ message(STATUS "Build with HOST_SYSROOT=" ${HOST_SYSROOT})
 message(STATUS "Build with HOST_CXX=" ${HOST_CXX})
 message(STATUS "Build with HOST_AR=" ${HOST_AR})
 
-macro(compile_kernel kernel_path kernel_name rule device_o_extra_flags host_o_extra_flags xpu_1_or_2)
+#macro(compile_kernel kernel_path kernel_name device_o_extra_flags host_o_extra_flags xpu_1_or_2 cc_depends)
+macro(compile_kernel COMPILE_ARGS)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs KERNEL XNAME DEVICE HOST XPU DEPENDS)
+  cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(kernel_path ${xpu_add_library_KERNEL})
+  set(kernel_name ${xpu_add_library_XNAME})
+  set(device_o_extra_flags ${xpu_add_library_DEVICE})
+  set(host_o_extra_flags ${xpu_add_library_HOST})
+  set(xpu_1_or_2 ${xpu_add_library_XPU})
+  set(cc_depends ${xpu_add_library_DEPENDS})
+
   set(kernel_target ${kernel_name}_kernel)
   add_custom_target(${kernel_target}
     WORKING_DIRECTORY
@@ -65,8 +77,14 @@ macro(compile_kernel kernel_path kernel_name rule device_o_extra_flags host_o_ex
     VERBATIM
     )
 
-  set(arg_rule ${rule})
-  separate_arguments(arg_rule)
+  if(cc_depends)
+    message(STATUS "lxd_debug kernel dependencies: ${xpu_add_library_DEPENDS}")
+    add_dependencies(${kernel_target} ${xpu_add_library_DEPENDS})
+    #target_link_libraries(${kernel_target} ${xpu_add_library_DEPENDS})
+  endif()
+
+  # set(arg_rule ${rule})
+  # separate_arguments(arg_rule)
   set(arg_device_o_extra_flags ${device_o_extra_flags})
   separate_arguments(arg_device_o_extra_flags)
   set(arg_host_o_extra_flags ${host_o_extra_flags})
@@ -74,7 +92,10 @@ macro(compile_kernel kernel_path kernel_name rule device_o_extra_flags host_o_ex
 
   set(XTDK_DIR ${XPU_TOOLCHAIN})
   set(CXX_DIR ${HOST_SYSROOT})
-  
+  set(XPU_CXX_INCLUDES  -I/workspace/paddle/Paddle/build -I/workspace/paddle/Paddle/paddle/fluid/framework/io -I/workspace/paddle/Paddle/build/third_party/install/zlib/include -I/workspace/paddle/Paddle/build/third_party/install -I/workspace/paddle/Paddle/build/third_party/install/gflags/include -I/workspace/paddle/Paddle/build/third_party/install/glog/include -I/workspace/paddle/Paddle/build/third_party/boost/src/extern_boost -I/workspace/paddle/Paddle/build/third_party/eigen3/src/extern_eigen3 -I/workspace/paddle/Paddle/build/third_party/threadpool/src/extern_threadpool -I/workspace/paddle/Paddle/build/third_party/dlpack/src/extern_dlpack/include -I/workspace/paddle/Paddle/build/third_party/install/xxhash/include -I/workspace/paddle/Paddle/build/third_party/install/warpctc/include -I/workspace/paddle/Paddle/build/third_party/install/openblas/include -I/workspace/paddle/Paddle/build/third_party/install/protobuf/include -I/usr/include/python3.7m -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I/workspace/paddle/Paddle/build/third_party/pybind/src/extern_pybind/include -I/workspace/paddle/Paddle/build/third_party/install/gtest/include -I/workspace/paddle/Paddle/build/third_party/install/xpu/include -I/workspace/paddle/Paddle/build/third_party/install/gloo/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include/xbyak -I/workspace/paddle/Paddle/build/third_party/install/cryptopp/include -I/workspace/paddle/Paddle/build/third_party/pocketfft/src -I/workspace/paddle/Paddle)
+  #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -mavx -O3 -DNDEBUG )
+  set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
+  set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU -DPADDLE_WITH_XPU2 -DXBYAK64 -DXBYAK_NO_OP_NAMES)
 
   add_custom_command(
     OUTPUT
@@ -82,9 +103,9 @@ macro(compile_kernel kernel_path kernel_name rule device_o_extra_flags host_o_ex
     COMMAND
       ${CMAKE_COMMAND} -E make_directory kernel_build
     COMMAND
-    # TODO(liuxiandong) xpu->kps
-    ${XPU_CLANG} --sysroot=${CXX_DIR}  -std=c++11 -O2 -fno-builtin -g -mcpu=xpu2  
-        -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.bin.o.sec /workspace/paddle/Paddle/paddle/fluid/operators/elementwise/${kernel_name}.xpu
+    # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11
+    ${XPU_CLANG} --sysroot=${CXX_DIR}   -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
+         -I.  -o kernel_build/${kernel_name}.bin.o.sec /workspace/paddle/Paddle/paddle/fluid/operators/elementwise/${kernel_name}.xpu
         --xpu-device-only -c -v 
     COMMAND
       ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec  kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
@@ -95,7 +116,7 @@ macro(compile_kernel kernel_path kernel_name rule device_o_extra_flags host_o_ex
     WORKING_DIRECTORY
       ${CMAKE_CURRENT_BINARY_DIR}
     DEPENDS
-      #${kernel_path}
+      ${xpu_add_library_DEPENDS}
     COMMENT
       kernel_build/${kernel_name}.bin.o
     VERBATIM
@@ -110,9 +131,9 @@ macro(compile_kernel kernel_path kernel_name rule device_o_extra_flags host_o_ex
     COMMAND
       ${CMAKE_COMMAND} -E make_directory kernel_build
     COMMAND
-    # TODO(liuxiandong) xpu->kps
-    ${XPU_CLANG} --sysroot=${CXX_DIR}  -std=c++11 -O2 -fno-builtin -g -mcpu=xpu2  
-        -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.host.o /workspace/paddle/Paddle/paddle/fluid/operators/elementwise/${kernel_name}.xpu
+    # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11
+    ${XPU_CLANG} --sysroot=${CXX_DIR}   -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
+         -I.  -o kernel_build/${kernel_name}.host.o /workspace/paddle/Paddle/paddle/fluid/operators/elementwise/${kernel_name}.xpu
         --xpu-host-only -c -v 
     # COMMAND
     #   ${CMAKE_COMMAND} -E cmake_depends "Unix Makefiles" ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}
@@ -121,7 +142,7 @@ macro(compile_kernel kernel_path kernel_name rule device_o_extra_flags host_o_ex
     WORKING_DIRECTORY
       ${CMAKE_CURRENT_BINARY_DIR}
     DEPENDS
-      #${kernel_path}
+      ${xpu_add_library_DEPENDS}
     COMMENT
       kernel_build/${kernel_name}.host.o
     VERBATIM
@@ -129,28 +150,28 @@ macro(compile_kernel kernel_path kernel_name rule device_o_extra_flags host_o_ex
     list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o)
 endmacro()
 
-macro(__compile_kernel_with_rules kernel_path kernel_name rules_path device_o_extra_flags host_o_extra_flags xpu_1_or_2)
-  file(STRINGS ${rules_path} rules)
-  foreach(rule IN LISTS rules)
-    message(STATUS "  Instantiate with '${rule}'")
-    execute_process(
-      COMMAND
-        bash "-c" "echo -n ${rule} | md5sum | cut -c1-6"
-      OUTPUT_VARIABLE
-        rule_md5
-      OUTPUT_STRIP_TRAILING_WHITESPACE
-      )
+# macro(__compile_kernel_with_rules kernel_path kernel_name rules_path device_o_extra_flags host_o_extra_flags xpu_1_or_2 cc_depends)
+#   file(STRINGS ${rules_path} rules)
+#   foreach(rule IN LISTS rules)
+#     message(STATUS "  Instantiate with '${rule}'")
+#     execute_process(
+#       COMMAND
+#         bash "-c" "echo -n ${rule} | md5sum | cut -c1-6"
+#       OUTPUT_VARIABLE
+#         rule_md5
+#       OUTPUT_STRIP_TRAILING_WHITESPACE
+#       )
 
-    set(kernel_name_md5 ${kernel_name}_${rule_md5})
-    compile_kernel(${kernel_path} ${kernel_name_md5} ${rule} ${device_o_extra_flags} ${host_o_extra_flags} ${xpu_1_or_2})
-  endforeach()
-endmacro()
+#     set(kernel_name_md5 ${kernel_name}_${rule_md5})
+#     compile_kernel(${kernel_path} ${kernel_name_md5} ${rule} ${device_o_extra_flags} ${host_o_extra_flags} ${xpu_1_or_2} ${cc_depends})
+#   endforeach()
+# endmacro()
 
-macro(compile_kernel_with_rules kernel_path kernel_name rules_path device_o_extra_flags host_o_extra_flags xpu_1_or_2)
-  # XXX: reconfigure if file |rules_path| was modified
-  set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${rules_path})
-  __compile_kernel_with_rules(${kernel_path} ${kernel_name} ${rules_path} ${device_o_extra_flags} ${host_o_extra_flags} ${xpu_1_or_2})
-endmacro()
+# macro(compile_kernel_with_rules kernel_path kernel_name rules_path device_o_extra_flags host_o_extra_flags xpu_1_or_2 cc_depends)
+#   # XXX: reconfigure if file |rules_path| was modified
+#   set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${rules_path})
+#   __compile_kernel_with_rules(${kernel_path} ${kernel_name} ${rules_path} ${device_o_extra_flags} ${host_o_extra_flags} ${xpu_1_or_2} ${cc_depends})
+# endmacro()
 
 ###############################################################################
 # XPU_ADD_LIBRARY
@@ -159,10 +180,12 @@ macro(xpu_add_library TARGET_NAME)
     # Separate the sources from the options
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs STATIC)
+    set(multiValueArgs STATIC DEPENDS)
     cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     set(xpu_srcs ${xpu_add_library_STATIC})
     set(xpu_target ${TARGET_NAME})
+    set(cc_srcs_depends ${xpu_add_library_DEPENDS})
+    #message(STATUS "lxd_debug: ${xpu_add_library_DEPENDS}---------------------------------")
     
     file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs})
     list(LENGTH xpu_srcs_lists xpu_srcs_lists_num)
@@ -183,6 +206,7 @@ macro(xpu_add_library TARGET_NAME)
 
     # Ensure that there is only one xpu kernel
     list(LENGTH xpu_kernel_lists xpu_kernel_lists_num)
+    list(LENGTH cc_srcs_depends cc_srcs_depends_num)
 
     if(${xpu_kernel_lists_num})
         foreach(xpu_kernel IN LISTS xpu_kernel_lists)
@@ -192,13 +216,13 @@ macro(xpu_add_library TARGET_NAME)
             #TODO(liuxiandong set default rules)
             set(kernel_rules ${kernel_dir}/${kernel_name}.rules)
             set(kernel_name ${kernel_name})
-            if(EXISTS ${kernel_rules})
-                compile_kernel_with_rules(${xpu_kernel} ${kernel_name} ${kernel_rules}
-                    ${XPU1_DEVICE_O_EXTRA_FLAGS} ${XPU1_HOST_O_EXTRA_FLAGS} "xpu2")
-            else()
-                compile_kernel(${xpu_kernel} ${kernel_name} " "
-                    ${XPU1_DEVICE_O_EXTRA_FLAGS} ${XPU1_HOST_O_EXTRA_FLAGS} "xpu2")
-            endif()
+            # if(EXISTS ${kernel_rules})
+            #     # compile_kernel_with_rules(${xpu_kernel} ${kernel_name} ${kernel_rules}
+            #     #     ${XPU1_DEVICE_O_EXTRA_FLAGS} ${XPU1_HOST_O_EXTRA_FLAGS} "xpu2" ${cc_srcs_depends})
+            # else()
+            message(STATUS "lxd_debug: ${cc_srcs_depends}>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
+            compile_kernel(KERNEL ${xpu_kernel} XNAME ${kernel_name} DEVICE ${XPU1_DEVICE_O_EXTRA_FLAGS} HOST ${XPU1_HOST_O_EXTRA_FLAGS} XPU "xpu2" DEPENDS ${cc_srcs_depends})
+            # endif()
         endforeach()
 
         add_custom_target(${xpu_target}_src ALL
@@ -226,6 +250,9 @@ macro(xpu_add_library TARGET_NAME)
             VERBATIM
             ) 
         
+        # if(${cc_srcs_depends_num})
+        #   add_dependencies(${xpu_target}_kernel ${cc_srcs_depends})
+        # endif()
         add_library(${xpu_target} STATIC ${cc_kernel_lists})
         target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
     else()
@@ -259,17 +286,17 @@ list(APPEND XPU_CFLAGS -Wno-dev)
 set(XPU_XPUCC_FLAGS ${XPU_CFLAGS})
 
 # set HIP link libs
-set(xpuapi_library_name xpuapi)
-message(STATUS "XPU API library name: ${xpuapi_library_name}")
-# link in the generic.cmake
-find_library(XPU2_CLANG_API_LIB ${xpuapi_library_name} HINTS ${XPU2_PATH}/shlib)
-message(STATUS "XPU2_CLANG_API_LIB: ${XPU2_CLANG_API_LIB}")
+# set(xpuapi_library_name xpuapi)
+# message(STATUS "XPU API library name: ${xpuapi_library_name}")
+# # link in the generic.cmake
+# find_library(XPU2_CLANG_API_LIB ${xpuapi_library_name} HINTS ${XPU2_PATH}/shlib)
+# message(STATUS "XPU2_CLANG_API_LIB: ${XPU2_CLANG_API_LIB}")
 
-set(xpurt_library_name xpurt)
-message(STATUS "XPU RT library name: ${xpurt_library_name}")
-# link in the generic.cmake
-find_library(XPU2_CLANG_RT_LIB ${xpurt_library_name} HINTS ${XPU2_PATH}/runtime/shlib)
-message(STATUS "XPU2_CLANG_RT_LIB: ${XPU2_CLANG_RT_LIB}")
+# set(xpurt_library_name xpurt)
+# message(STATUS "XPU RT library name: ${xpurt_library_name}")
+# # link in the generic.cmake
+# find_library(XPU2_CLANG_RT_LIB ${xpurt_library_name} HINTS ${XPU2_PATH}/runtime/shlib)
+# message(STATUS "XPU2_CLANG_RT_LIB: ${XPU2_CLANG_RT_LIB}")
 
 # # Ensure that xpu/api.h can be included without dependency errors.
 # file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
index 214f112e4a00c..723c33dc811a5 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
@@ -12,23 +12,56 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_XPU2
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/data_type.h"
+//#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/platform/aligned_vector.h"
+#include "paddle/fluid/platform/function_traits.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include "paddle/fluid/framework/tensor.h"
+//#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+
+#include "xpu/kernel/cluster_header.h"
+#include "xpu/kernel/debug.h"
+//#include "xpu/kernel/math.h"
+#include <xpu/runtime.h>
 
 namespace paddle {
 namespace operators {
 
-template<int VecSize, typename InT, typename OutT, typename Functor >
-__global__ void elementwise(InT *in0,
-                            InT *in1, OutT *out,
-                            int size, Functor func) {
+__global__ void elementwise() {
   //TODO
 }
 
+void ComputeTest(){
+
+}
+
+template <typename T>
+class ElementwiseAddXPU2Kernel : public framework::OpKernel<T> {
+
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    std::cout<<"lxd_debug: XPU2 forward element_add !"<<std::endl;
+    ComputeTest();
+  }
+};
+
+template <typename T>
+class ElementwiseAddGradXPU2Kernel : public ElemwiseGradKernel<T> {
+
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    std::cout<<"lxd_debug: XPU2 backward element_add !"<<std::endl;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
-#endif
\ No newline at end of file
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU2_KERNEL(elementwise_add, ops::ElementwiseAddXPU2Kernel<float>);
+                       
+REGISTER_OP_XPU2_KERNEL(
+    elementwise_add_grad, ops::ElementwiseAddGradXPU2Kernel<float>);
\ No newline at end of file

From 931edb504bb2f2e3eb4ce8d4f06d3f36f13accdc Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Fri, 19 Nov 2021 08:30:01 +0000
Subject: [PATCH 08/41] use clang compiler

---
 CMakeLists.txt                                |  5 +--
 cmake/generic.cmake                           | 10 +++---
 cmake/operators.cmake                         |  3 +-
 cmake/xpu2.cmake                              | 31 ++++++++--------
 paddle/fluid/framework/op_registry.h          |  4 ---
 .../elementwise/elementwise_add_op.h          |  1 -
 .../elementwise/elementwise_add_op.xpu        | 35 ++++++++++++++-----
 paddle/fluid/platform/aligned_vector.h        |  2 +-
 8 files changed, 53 insertions(+), 38 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ed04ed994e15..502275bbb3a4a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,10 +37,11 @@ project(paddle CXX C)
 
 # enable language CUDA
 # TODO(Shibo Tao): remove find_package(CUDA) completely.
-find_package(CUDA QUIET)
+# find_package(CUDA QUIET) (TODO:liuxiandong)
 find_package(MKL CONFIG QUIET)
 option(WITH_ONEMKL      "Compile PaddlePaddle with oneMKL"              OFF)
-option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
+#option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
+option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          OFF)
 option(WITH_TENSORRT    "Compile PaddlePaddle with NVIDIA TensorRT"     OFF)
 option(WITH_XPU         "Compile PaddlePaddle with BAIDU KUNLUN XPU"    OFF)
 option(WITH_XPU2         "Compile PaddlePaddle with BAIDU KUNLUN XPU2"    OFF)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index f5bc16021cef9..d5bc9c5294695 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -401,8 +401,8 @@ function(cc_binary TARGET_NAME)
     target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
   endif()
   if(WITH_XPU2)
-    #target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_API_LIB})
-    #target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_RT_LIB})
+    target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_API_LIB})
+    target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_RT_LIB})
   endif()
 
   check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS})
@@ -429,9 +429,9 @@ function(cc_test_build TARGET_NAME)
       target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
     endif()
     # added by lxd
-    # if(WITH_XPU2)
-    #   target_link_libraries(${TARGET_NAME} ${XPU2_CLANGRTC_LIB})
-    # endif()
+    if(WITH_XPU2)
+      target_link_libraries(${TARGET_NAME} ${XPU2_CLANGRTC_LIB})
+    endif()
     check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS})
   endif()
 endfunction()
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 47181fada781a..a271b1c528767 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -325,7 +325,7 @@ function(op_library TARGET)
     endif()
 
     if (WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
+        #file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
     if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
@@ -396,7 +396,6 @@ function(register_operators)
     file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
     string(REPLACE "_mkldnn" "" OPS "${OPS}")
     string(REPLACE "_xpu" "" OPS "${OPS}")
-    string(REPLACE "_kps" "" OPS "${OPS}")
     string(REPLACE "_npu" "" OPS "${OPS}")
     string(REPLACE ".cc" "" OPS "${OPS}")
     list(REMOVE_DUPLICATES OPS)
diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
index 5d2f9bebd1a3d..f81fe3ac09653 100644
--- a/cmake/xpu2.cmake
+++ b/cmake/xpu2.cmake
@@ -92,10 +92,11 @@ macro(compile_kernel COMPILE_ARGS)
 
   set(XTDK_DIR ${XPU_TOOLCHAIN})
   set(CXX_DIR ${HOST_SYSROOT})
-  set(XPU_CXX_INCLUDES  -I/workspace/paddle/Paddle/build -I/workspace/paddle/Paddle/paddle/fluid/framework/io -I/workspace/paddle/Paddle/build/third_party/install/zlib/include -I/workspace/paddle/Paddle/build/third_party/install -I/workspace/paddle/Paddle/build/third_party/install/gflags/include -I/workspace/paddle/Paddle/build/third_party/install/glog/include -I/workspace/paddle/Paddle/build/third_party/boost/src/extern_boost -I/workspace/paddle/Paddle/build/third_party/eigen3/src/extern_eigen3 -I/workspace/paddle/Paddle/build/third_party/threadpool/src/extern_threadpool -I/workspace/paddle/Paddle/build/third_party/dlpack/src/extern_dlpack/include -I/workspace/paddle/Paddle/build/third_party/install/xxhash/include -I/workspace/paddle/Paddle/build/third_party/install/warpctc/include -I/workspace/paddle/Paddle/build/third_party/install/openblas/include -I/workspace/paddle/Paddle/build/third_party/install/protobuf/include -I/usr/include/python3.7m -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I/workspace/paddle/Paddle/build/third_party/pybind/src/extern_pybind/include -I/workspace/paddle/Paddle/build/third_party/install/gtest/include -I/workspace/paddle/Paddle/build/third_party/install/xpu/include -I/workspace/paddle/Paddle/build/third_party/install/gloo/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include/xbyak -I/workspace/paddle/Paddle/build/third_party/install/cryptopp/include -I/workspace/paddle/Paddle/build/third_party/pocketfft/src -I/workspace/paddle/Paddle)
+  set(XPU_CXX_INCLUDES  -I/workspace/paddle/Paddle/build -I/workspace/paddle/Paddle/paddle/fluid/framework/io -I/workspace/paddle/Paddle/build/third_party/install/zlib/include -I/workspace/paddle/Paddle/build/third_party/install -I/workspace/paddle/Paddle/build/third_party/install/gflags/include -I/workspace/paddle/Paddle/build/third_party/install/glog/include -I/workspace/paddle/Paddle/build/third_party/boost/src/extern_boost -I/workspace/paddle/Paddle/build/third_party/eigen3/src/extern_eigen3 -I/workspace/paddle/Paddle/build/third_party/threadpool/src/extern_threadpool -I/workspace/paddle/Paddle/build/third_party/dlpack/src/extern_dlpack/include -I/workspace/paddle/Paddle/build/third_party/install/xxhash/include -I/workspace/paddle/Paddle/build/third_party/install/warpctc/include -I/workspace/paddle/Paddle/build/third_party/install/openblas/include -I/workspace/paddle/Paddle/build/third_party/install/protobuf/include -I/usr/include/python3.7m -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I/workspace/paddle/Paddle/build/third_party/pybind/src/extern_pybind/include -I/workspace/paddle/Paddle/build/third_party/install/gtest/include -I/workspace/paddle/Paddle/build/third_party/install/gloo/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include/xbyak -I/workspace/paddle/Paddle/build/third_party/install/cryptopp/include -I/workspace/paddle/Paddle/build/third_party/pocketfft/src -I/workspace/paddle/Paddle)
   #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -mavx -O3 -DNDEBUG )
-  set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
-  set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU -DPADDLE_WITH_XPU2 -DXBYAK64 -DXBYAK_NO_OP_NAMES)
+  set(XPU_CXX_FLAGS  -Wno-c++11-narrowing -Wno-shift-count-overflow -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
+  #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
+  set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU2 -DXBYAK64 -DXBYAK_NO_OP_NAMES)
 
   add_custom_command(
     OUTPUT
@@ -105,7 +106,7 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
     # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11
     ${XPU_CLANG} --sysroot=${CXX_DIR}   -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
-         -I.  -o kernel_build/${kernel_name}.bin.o.sec /workspace/paddle/Paddle/paddle/fluid/operators/elementwise/${kernel_name}.xpu
+        -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.bin.o.sec /workspace/paddle/Paddle/paddle/fluid/operators/elementwise/${kernel_name}.xpu
         --xpu-device-only -c -v 
     COMMAND
       ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec  kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
@@ -133,7 +134,7 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
     # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11
     ${XPU_CLANG} --sysroot=${CXX_DIR}   -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
-         -I.  -o kernel_build/${kernel_name}.host.o /workspace/paddle/Paddle/paddle/fluid/operators/elementwise/${kernel_name}.xpu
+        -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.host.o /workspace/paddle/Paddle/paddle/fluid/operators/elementwise/${kernel_name}.xpu
         --xpu-host-only -c -v 
     # COMMAND
     #   ${CMAKE_COMMAND} -E cmake_depends "Unix Makefiles" ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}
@@ -286,17 +287,17 @@ list(APPEND XPU_CFLAGS -Wno-dev)
 set(XPU_XPUCC_FLAGS ${XPU_CFLAGS})
 
 # set HIP link libs
-# set(xpuapi_library_name xpuapi)
-# message(STATUS "XPU API library name: ${xpuapi_library_name}")
-# # link in the generic.cmake
-# find_library(XPU2_CLANG_API_LIB ${xpuapi_library_name} HINTS ${XPU2_PATH}/shlib)
-# message(STATUS "XPU2_CLANG_API_LIB: ${XPU2_CLANG_API_LIB}")
+set(xpuapi_library_name xpuapi)
+message(STATUS "XPU API library name: ${xpuapi_library_name}")
+# link in the generic.cmake
+find_library(XPU2_CLANG_API_LIB ${xpuapi_library_name} HINTS ${XPU2_PATH}/shlib)
+message(STATUS "XPU2_CLANG_API_LIB: ${XPU2_CLANG_API_LIB}")
 
-# set(xpurt_library_name xpurt)
-# message(STATUS "XPU RT library name: ${xpurt_library_name}")
-# # link in the generic.cmake
-# find_library(XPU2_CLANG_RT_LIB ${xpurt_library_name} HINTS ${XPU2_PATH}/runtime/shlib)
-# message(STATUS "XPU2_CLANG_RT_LIB: ${XPU2_CLANG_RT_LIB}")
+set(xpurt_library_name xpurt)
+message(STATUS "XPU RT library name: ${xpurt_library_name}")
+# link in the generic.cmake
+find_library(XPU2_CLANG_RT_LIB ${xpurt_library_name} HINTS ${XPU2_PATH}/runtime/shlib)
+message(STATUS "XPU2_CLANG_RT_LIB: ${XPU2_CLANG_RT_LIB}")
 
 # # Ensure that xpu/api.h can be included without dependency errors.
 # file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 1a382ba1ae7d1..348ca5b952bfe 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -330,10 +330,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 #define REGISTER_OP_XPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
 
-// added by lxd
-#define REGISTER_OP_XPU2_KERNEL(op_type, ...) \
-  REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
-
 #define REGISTER_OP_NPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index dc618d13888b7..f0292ffe17869 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -58,7 +58,6 @@ template <typename DeviceContext, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    std::cout << "lxd_debug: element_add in CPU !" << std::endl;
     auto *x = ctx.Input<framework::LoDTensor>("X");
     auto *y = ctx.Input<framework::LoDTensor>("Y");
     auto *z = ctx.Output<framework::LoDTensor>("Out");
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
index 723c33dc811a5..5cbeafd7ec606 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
@@ -12,26 +12,44 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#if defined(__CUDA_ARCH__)
+  #undef __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC__)
+  #undef __CUDACC__
+#endif
+
+#if defined(__CUDA__)
+  #undef __CUDA__
+#endif
+
+#if defined(__NVCC__)
+  #undef __NVCC__
+#endif
+
+
 #include <string>
 #include <vector>
-//#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/function_traits.h"
 #include "paddle/fluid/framework/op_registry.h"
+/*
 #include "paddle/fluid/framework/tensor.h"
-//#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 
-#include "xpu/kernel/cluster_header.h"
-#include "xpu/kernel/debug.h"
+//#include "xpu/kernel/cluster_header.h"
+//#include "xpu/kernel/debug.h"
 //#include "xpu/kernel/math.h"
-#include <xpu/runtime.h>
+//#include <xpu/runtime.h>
+
 
 namespace paddle {
 namespace operators {
 
-__global__ void elementwise() {
+//__global__ void elementwise() {
   //TODO
-}
+//}
 
 void ComputeTest(){
 
@@ -64,4 +82,5 @@ namespace ops = paddle::operators;
 REGISTER_OP_XPU2_KERNEL(elementwise_add, ops::ElementwiseAddXPU2Kernel<float>);
                        
 REGISTER_OP_XPU2_KERNEL(
-    elementwise_add_grad, ops::ElementwiseAddGradXPU2Kernel<float>);
\ No newline at end of file
+    elementwise_add_grad, ops::ElementwiseAddGradXPU2Kernel<float>);
+*/
diff --git a/paddle/fluid/platform/aligned_vector.h b/paddle/fluid/platform/aligned_vector.h
index 7d014f6bdcb0b..a1ca602996d79 100644
--- a/paddle/fluid/platform/aligned_vector.h
+++ b/paddle/fluid/platform/aligned_vector.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/platform/hostdevice.h"
-
+#define CHAR_BIT 8
 namespace paddle {
 namespace platform {
 

From 408419c6aef60970c68b03baf6e0f40c32dc5c75 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Fri, 19 Nov 2021 08:51:41 +0000
Subject: [PATCH 09/41] xpu2.cmake: use -Wno-error= for c++11-narrowing, shift-count-overflow and constant-conversion

---
 cmake/xpu2.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
index f81fe3ac09653..822c9c97f8c00 100644
--- a/cmake/xpu2.cmake
+++ b/cmake/xpu2.cmake
@@ -94,7 +94,7 @@ macro(compile_kernel COMPILE_ARGS)
   set(CXX_DIR ${HOST_SYSROOT})
   set(XPU_CXX_INCLUDES  -I/workspace/paddle/Paddle/build -I/workspace/paddle/Paddle/paddle/fluid/framework/io -I/workspace/paddle/Paddle/build/third_party/install/zlib/include -I/workspace/paddle/Paddle/build/third_party/install -I/workspace/paddle/Paddle/build/third_party/install/gflags/include -I/workspace/paddle/Paddle/build/third_party/install/glog/include -I/workspace/paddle/Paddle/build/third_party/boost/src/extern_boost -I/workspace/paddle/Paddle/build/third_party/eigen3/src/extern_eigen3 -I/workspace/paddle/Paddle/build/third_party/threadpool/src/extern_threadpool -I/workspace/paddle/Paddle/build/third_party/dlpack/src/extern_dlpack/include -I/workspace/paddle/Paddle/build/third_party/install/xxhash/include -I/workspace/paddle/Paddle/build/third_party/install/warpctc/include -I/workspace/paddle/Paddle/build/third_party/install/openblas/include -I/workspace/paddle/Paddle/build/third_party/install/protobuf/include -I/usr/include/python3.7m -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I/workspace/paddle/Paddle/build/third_party/pybind/src/extern_pybind/include -I/workspace/paddle/Paddle/build/third_party/install/gtest/include -I/workspace/paddle/Paddle/build/third_party/install/gloo/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include/xbyak -I/workspace/paddle/Paddle/build/third_party/install/cryptopp/include -I/workspace/paddle/Paddle/build/third_party/pocketfft/src -I/workspace/paddle/Paddle)
   #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -mavx -O3 -DNDEBUG )
-  set(XPU_CXX_FLAGS  -Wno-c++11-narrowing -Wno-shift-count-overflow -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
+  set(XPU_CXX_FLAGS  -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
   #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
   set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU2 -DXBYAK64 -DXBYAK_NO_OP_NAMES)
 

From dd0aef3f99ef14e2389a52f5d20f18a2549e3018 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Fri, 19 Nov 2021 10:46:14 +0000
Subject: [PATCH 10/41] XPU2 compiler passed

---
 cmake/operators.cmake                                    | 2 +-
 cmake/xpu2.cmake                                         | 5 +++--
 .../fluid/operators/elementwise/elementwise_add_op.xpu   | 9 +++++----
 paddle/fluid/platform/enforce.h                          | 2 +-
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index a271b1c528767..753ff5c76407c 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -325,7 +325,7 @@ function(op_library TARGET)
     endif()
 
     if (WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)
-        #file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
     if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
index 822c9c97f8c00..1d698aa82ab5b 100644
--- a/cmake/xpu2.cmake
+++ b/cmake/xpu2.cmake
@@ -10,7 +10,7 @@ if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN})
   message(FATAL_ERROR "Directory ${XPU_TOOLCHAIN} not found!")
 endif()
 message(STATUS "Build with XPU_TOOLCHAIN=" ${XPU_TOOLCHAIN})
-set(XPU_CLANG ${XPU_TOOLCHAIN}/bin/clang)
+set(XPU_CLANG ${XPU_TOOLCHAIN}/bin/clang++)
 message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG})
 
 if(NOT HOST_SYSROOT)
@@ -92,9 +92,10 @@ macro(compile_kernel COMPILE_ARGS)
 
   set(XTDK_DIR ${XPU_TOOLCHAIN})
   set(CXX_DIR ${HOST_SYSROOT})
+  #-Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow
   set(XPU_CXX_INCLUDES  -I/workspace/paddle/Paddle/build -I/workspace/paddle/Paddle/paddle/fluid/framework/io -I/workspace/paddle/Paddle/build/third_party/install/zlib/include -I/workspace/paddle/Paddle/build/third_party/install -I/workspace/paddle/Paddle/build/third_party/install/gflags/include -I/workspace/paddle/Paddle/build/third_party/install/glog/include -I/workspace/paddle/Paddle/build/third_party/boost/src/extern_boost -I/workspace/paddle/Paddle/build/third_party/eigen3/src/extern_eigen3 -I/workspace/paddle/Paddle/build/third_party/threadpool/src/extern_threadpool -I/workspace/paddle/Paddle/build/third_party/dlpack/src/extern_dlpack/include -I/workspace/paddle/Paddle/build/third_party/install/xxhash/include -I/workspace/paddle/Paddle/build/third_party/install/warpctc/include -I/workspace/paddle/Paddle/build/third_party/install/openblas/include -I/workspace/paddle/Paddle/build/third_party/install/protobuf/include -I/usr/include/python3.7m -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I/workspace/paddle/Paddle/build/third_party/pybind/src/extern_pybind/include -I/workspace/paddle/Paddle/build/third_party/install/gtest/include -I/workspace/paddle/Paddle/build/third_party/install/gloo/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include/xbyak -I/workspace/paddle/Paddle/build/third_party/install/cryptopp/include -I/workspace/paddle/Paddle/build/third_party/pocketfft/src -I/workspace/paddle/Paddle)
   #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -mavx -O3 -DNDEBUG )
-  set(XPU_CXX_FLAGS  -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
+  set(XPU_CXX_FLAGS  -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
   #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
   set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU2 -DXBYAK64 -DXBYAK_NO_OP_NAMES)
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
index 5cbeafd7ec606..1f1577003940d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
@@ -34,9 +34,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/function_traits.h"
 #include "paddle/fluid/framework/op_registry.h"
-/*
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
 //#include "xpu/kernel/cluster_header.h"
 //#include "xpu/kernel/debug.h"
@@ -79,8 +79,9 @@ class ElementwiseAddGradXPU2Kernel : public ElemwiseGradKernel<T> {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_XPU2_KERNEL(elementwise_add, ops::ElementwiseAddXPU2Kernel<float>);
+
+REGISTER_OP_XPU_KERNEL(elementwise_add, ops::ElementwiseAddXPU2Kernel<float>);
                        
-REGISTER_OP_XPU2_KERNEL(
+REGISTER_OP_XPU_KERNEL(
     elementwise_add_grad, ops::ElementwiseAddGradXPU2Kernel<float>);
-*/
+
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index bdb901f583e26..6a5e4e8c52d82 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -382,7 +382,7 @@ struct EnforceNotMet : public std::exception {
 
   EnforceNotMet(const ErrorSummary& error, const char* file, int line)
       : code_(error.code()),
-        err_str_(GetTraceBackString(error.to_string(), file, line)) {
+        err_str_(GetTraceBackString("lxd_debug error summy", file, line)) {
     simple_err_str_ = SimplifyErrorTypeFormat(err_str_);
   }
 

From 8a9cdada8b98288a235dc5d982817b111f536314 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Mon, 22 Nov 2021 02:48:23 +0000
Subject: [PATCH 11/41] operators.cmake: comment out USE_OP_DEVICE_KERNEL(XPU) pybind registration for WITH_XPU

---
 cmake/operators.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 753ff5c76407c..ee1c398f0dfb3 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -321,7 +321,7 @@ function(op_library TARGET)
     endif()
 
     if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0)
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
+        #file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
     if (WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)

From a49660e9b83e34c56686dd04fd3995f2e164af40 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Mon, 22 Nov 2021 12:28:01 +0000
Subject: [PATCH 12/41] update after pten

---
 cmake/xpu2.cmake                              | 10 ++--
 .../elementwise/elementwise_add_op.xpu        | 10 ++--
 .../elementwise/elementwise_add_op_kps.cc     | 50 -------------------
 paddle/pten/common/data_type.h                | 12 +++--
 4 files changed, 18 insertions(+), 64 deletions(-)
 delete mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc

diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
index 1d698aa82ab5b..85a7fce0ab171 100644
--- a/cmake/xpu2.cmake
+++ b/cmake/xpu2.cmake
@@ -95,7 +95,7 @@ macro(compile_kernel COMPILE_ARGS)
   #-Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow
   set(XPU_CXX_INCLUDES  -I/workspace/paddle/Paddle/build -I/workspace/paddle/Paddle/paddle/fluid/framework/io -I/workspace/paddle/Paddle/build/third_party/install/zlib/include -I/workspace/paddle/Paddle/build/third_party/install -I/workspace/paddle/Paddle/build/third_party/install/gflags/include -I/workspace/paddle/Paddle/build/third_party/install/glog/include -I/workspace/paddle/Paddle/build/third_party/boost/src/extern_boost -I/workspace/paddle/Paddle/build/third_party/eigen3/src/extern_eigen3 -I/workspace/paddle/Paddle/build/third_party/threadpool/src/extern_threadpool -I/workspace/paddle/Paddle/build/third_party/dlpack/src/extern_dlpack/include -I/workspace/paddle/Paddle/build/third_party/install/xxhash/include -I/workspace/paddle/Paddle/build/third_party/install/warpctc/include -I/workspace/paddle/Paddle/build/third_party/install/openblas/include -I/workspace/paddle/Paddle/build/third_party/install/protobuf/include -I/usr/include/python3.7m -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I/workspace/paddle/Paddle/build/third_party/pybind/src/extern_pybind/include -I/workspace/paddle/Paddle/build/third_party/install/gtest/include -I/workspace/paddle/Paddle/build/third_party/install/gloo/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include/xbyak -I/workspace/paddle/Paddle/build/third_party/install/cryptopp/include -I/workspace/paddle/Paddle/build/third_party/pocketfft/src -I/workspace/paddle/Paddle)
   #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -mavx -O3 -DNDEBUG )
-  set(XPU_CXX_FLAGS  -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
+  set(XPU_CXX_FLAGS  -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer  -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
   #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
   set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU2 -DXBYAK64 -DXBYAK_NO_OP_NAMES)
 
@@ -105,8 +105,8 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
       ${CMAKE_COMMAND} -E make_directory kernel_build
     COMMAND
-    # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11
-    ${XPU_CLANG} --sysroot=${CXX_DIR}   -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
+    # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11 
+    ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
         -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.bin.o.sec /workspace/paddle/Paddle/paddle/fluid/operators/elementwise/${kernel_name}.xpu
         --xpu-device-only -c -v 
     COMMAND
@@ -133,8 +133,8 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
       ${CMAKE_COMMAND} -E make_directory kernel_build
     COMMAND
-    # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11
-    ${XPU_CLANG} --sysroot=${CXX_DIR}   -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
+    # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11 
+    ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
         -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.host.o /workspace/paddle/Paddle/paddle/fluid/operators/elementwise/${kernel_name}.xpu
         --xpu-host-only -c -v 
     # COMMAND
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
index 1f1577003940d..95989e8c4200a 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
@@ -33,15 +33,15 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/function_traits.h"
-#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_registry.h" //TODO
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
-//#include "xpu/kernel/cluster_header.h"
-//#include "xpu/kernel/debug.h"
-//#include "xpu/kernel/math.h"
-//#include <xpu/runtime.h>
+#include "xpu/kernel/cluster_header.h"
+#include "xpu/kernel/debug.h"
+#include "xpu/kernel/math.h"
+#include <xpu/runtime.h>
 
 
 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
deleted file mode 100644
index 07755a77f4276..0000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_XPU2
-#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class ElementwiseAddXPU2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    std::cout << "lxd_debug: XPU2 forward element_add !" << std::endl;
-  }
-};
-
-template <typename T>
-class ElementwiseAddGradXPU2Kernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    std::cout << "lxd_debug: XPU2 backward element_add !" << std::endl;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_XPU2_KERNEL(elementwise_add, ops::ElementwiseAddXPU2Kernel<float>);
-
-REGISTER_OP_XPU2_KERNEL(elementwise_add_grad,
-                        ops::ElementwiseAddGradXPU2Kernel<float>);
-#endif
diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h
index 1ddee0746d4d1..9e66fbe1ea85b 100644
--- a/paddle/pten/common/data_type.h
+++ b/paddle/pten/common/data_type.h
@@ -14,10 +14,14 @@ limitations under the License. */
 
 #pragma once
 
-#include "bfloat16.h"  // NOLINT
-#include "complex.h"   // NOLINT
-#include "float16.h"   // NOLINT
-
+//#include "bfloat16.h"  // NOLINT
+//#include "complex.h"   // NOLINT
+//#include "float16.h"   // NOLINT
+
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/pten/api/ext/exception.h"
 
 namespace paddle {

From 9c0a1ddbda2d13b8a34034025b74d9fdd4c59d1e Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Tue, 23 Nov 2021 06:08:31 +0000
Subject: [PATCH 13/41] Combine WITH_XPU and WITH_XPU2

---
 CMakeLists.txt        | 14 ++++-----
 cmake/generic.cmake   |  6 ++--
 cmake/operators.cmake | 55 +++++++++++++++++++++++++++++-----
 cmake/xpu2.cmake      | 68 +++++++++++++++++++++----------------------
 4 files changed, 91 insertions(+), 52 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 502275bbb3a4a..3226d5a52461c 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -270,13 +270,13 @@ if (NOT WITH_GPU AND WITH_NCCL)
         "Disable NCCL when compiling without GPU" FORCE)
 endif()
 
-# XPU XPU2 use BKCL
-# if (NOT (WITH_XPU OR WITH_XPU2) AND WITH_XPU_BKCL)
-#     MESSAGE(WARNING
-#         "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.")
-#     set(WITH_XPU_BKCL OFF CACHE STRING
-#         "Disable BKCL when compiling without XPU" FORCE)
-# endif()
+# force XPU on when WITH_XPU2
+if (WITH_XPU2 AND NOT WITH_XPU)
+    MESSAGE(WARNING
+        "Enable XPU when compiling with XPU2. Force WITH_XPU=ON.")
+    set(WITH_XPU ON CACHE STRING
+        "Enable XPU when compiling with XPU2" FORCE)
+endif()
 
 #
 if (NOT WITH_XPU AND WITH_XPU_BKCL)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index d5bc9c5294695..cac92483136db 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -401,8 +401,8 @@ function(cc_binary TARGET_NAME)
     target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
   endif()
   if(WITH_XPU2)
-    target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_API_LIB})
-    target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_RT_LIB})
+    # target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_API_LIB})
+    # target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_RT_LIB})
   endif()
 
   check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS})
@@ -430,7 +430,7 @@ function(cc_test_build TARGET_NAME)
     endif()
     # added by lxd
     if(WITH_XPU2)
-      target_link_libraries(${TARGET_NAME} ${XPU2_CLANGRTC_LIB})
+      #target_link_libraries(${TARGET_NAME} ${XPU2_CLANGRTC_LIB})
     endif()
     check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS})
   endif()
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index ee1c398f0dfb3..60181f22a1f7b 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -132,8 +132,6 @@ function(op_library TARGET)
                 list(APPEND cu_cc_srcs ${src})
             elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
                 list(APPEND xpu_cc_srcs ${src})
-            elseif(WITH_XPU2 AND ${src} MATCHES ".*_op_kps.cc$")
-                list(APPEND xpu2_cc_srcs ${src})
             elseif(WITH_XPU2 AND ${src} MATCHES ".*\\.xpu$")
                 list(APPEND xpu2_cc_srcs ${src})
             elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
@@ -141,11 +139,46 @@ function(op_library TARGET)
             elseif(${src} MATCHES ".*\\.cc$")
                 list(APPEND cc_srcs ${src})
             else()
-                message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
+                message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu or .xpu")
+            endif()
+        endforeach()
+    endif()
+    
+    list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
+    list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
+    if(WITH_XPU AND WITH_XPU2)
+        if(${xpu2_cc_srcs_len})
+            # message(STATUS "lxd_debug: >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>delete duplicate op in xpu")
+            # message(STATUS " xpu ${xpu_cc_srcs} xpu2 ${xpu2_cc_srcs}")
+        endif()
+    endif()
+
+    #TODO(liuxiandong) 
+    if(WITH_XPU2 AND ${xpu2_cc_srcs_len})
+        foreach(src ${xpu2_cc_srcs})
+            get_filename_component(op_name ${src} NAME_WE)
+            message(STATUS "lxd_debug op_name ${op_name}")
+            if(WITH_XPU)
+                #TODO
+                if(${xpu_cc_srcs} MATCHES "elementwise_add_op_xpu.cc")
+                    # delete it from the xpu_cc_srcs
+                    #list(REMOVE_ITEM ${xpu_cc_srcs} ${op_name}_xpu.cc)
+                    list(REMOVE_ITEM xpu_cc_srcs "elementwise_add_op_xpu.cc")
+                endif()
             endif()
         endforeach()
     endif()
 
+    if(WITH_XPU AND WITH_XPU2)
+        if(${xpu2_cc_srcs_len})
+            # message(STATUS "lxd_debug: <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<delete duplicate op in xpu")
+            # message(STATUS " xpu ${xpu_cc_srcs} xpu2 ${xpu2_cc_srcs}")
+        endif()
+    endif()
+    list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
+    list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
+    # message(STATUS "lxd_debug: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! xpu ${xpu_cc_srcs} xpu2 ${xpu2_cc_srcs}")
+
     list(LENGTH cc_srcs cc_srcs_len)
     if (${cc_srcs_len} EQUAL 0)
         message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
@@ -208,9 +241,12 @@ function(op_library TARGET)
         list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
         hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
-    elseif (WITH_XPU2)
+    elseif (WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)
         xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu2_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps})
     else()
+        # if(WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)
+        #     xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu2_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps})
+        # endif()
         # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
         if(WITH_UNITY_BUILD AND op_library_UNITY)
             # Combine the cc source files.
@@ -277,8 +313,8 @@ function(op_library TARGET)
     list(LENGTH cu_cc_srcs cu_cc_srcs_len)
     list(LENGTH hip_cc_srcs hip_cc_srcs_len)
     list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
-    list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
-    list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
+    #list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
+    #list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
     list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
     list(LENGTH npu_cc_srcs npu_cc_srcs_len)
     if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
@@ -320,10 +356,13 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
     endif()
 
-    if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0)
-        #file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
+    if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0 AND ${xpu2_cc_srcs_len} EQUAL 0)
+        #message(STATUS "lxd_debug: ${TARGET} op in XPU1")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
+    #message(STATUS "lxd_debug: cmake source dir is: ${CMAKE_SOURCE_DIR}")
+
     if (WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
index 85a7fce0ab171..ca97ae25315a4 100644
--- a/cmake/xpu2.cmake
+++ b/cmake/xpu2.cmake
@@ -93,7 +93,7 @@ macro(compile_kernel COMPILE_ARGS)
   set(XTDK_DIR ${XPU_TOOLCHAIN})
   set(CXX_DIR ${HOST_SYSROOT})
   #-Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow
-  set(XPU_CXX_INCLUDES  -I/workspace/paddle/Paddle/build -I/workspace/paddle/Paddle/paddle/fluid/framework/io -I/workspace/paddle/Paddle/build/third_party/install/zlib/include -I/workspace/paddle/Paddle/build/third_party/install -I/workspace/paddle/Paddle/build/third_party/install/gflags/include -I/workspace/paddle/Paddle/build/third_party/install/glog/include -I/workspace/paddle/Paddle/build/third_party/boost/src/extern_boost -I/workspace/paddle/Paddle/build/third_party/eigen3/src/extern_eigen3 -I/workspace/paddle/Paddle/build/third_party/threadpool/src/extern_threadpool -I/workspace/paddle/Paddle/build/third_party/dlpack/src/extern_dlpack/include -I/workspace/paddle/Paddle/build/third_party/install/xxhash/include -I/workspace/paddle/Paddle/build/third_party/install/warpctc/include -I/workspace/paddle/Paddle/build/third_party/install/openblas/include -I/workspace/paddle/Paddle/build/third_party/install/protobuf/include -I/usr/include/python3.7m -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I/workspace/paddle/Paddle/build/third_party/pybind/src/extern_pybind/include -I/workspace/paddle/Paddle/build/third_party/install/gtest/include -I/workspace/paddle/Paddle/build/third_party/install/gloo/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include -I/workspace/paddle/Paddle/build/third_party/install/xbyak/include/xbyak -I/workspace/paddle/Paddle/build/third_party/install/cryptopp/include -I/workspace/paddle/Paddle/build/third_party/pocketfft/src -I/workspace/paddle/Paddle)
+  set(XPU_CXX_INCLUDES  -I${CMAKE_SOURCE_DIR}/build -I${CMAKE_SOURCE_DIR}/paddle/fluid/framework/io -I${CMAKE_SOURCE_DIR}/build/third_party/install/zlib/include -I${CMAKE_SOURCE_DIR}/build/third_party/install -I${CMAKE_SOURCE_DIR}/build/third_party/install/gflags/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/glog/include -I${CMAKE_SOURCE_DIR}/build/third_party/boost/src/extern_boost -I${CMAKE_SOURCE_DIR}/build/third_party/eigen3/src/extern_eigen3 -I${CMAKE_SOURCE_DIR}/build/third_party/threadpool/src/extern_threadpool -I${CMAKE_SOURCE_DIR}/build/third_party/dlpack/src/extern_dlpack/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xxhash/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/warpctc/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/utf8proc/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/openblas/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/protobuf/include -I/usr/include/python3.7m -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I${CMAKE_SOURCE_DIR}/build/third_party/pybind/src/extern_pybind/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/gtest/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xpu/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/gloo/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xbyak/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xbyak/include/xbyak -I${CMAKE_SOURCE_DIR}/build/third_party/install/cryptopp/include -I${CMAKE_SOURCE_DIR}/build/third_party/pocketfft/src -I${CMAKE_SOURCE_DIR} -I${CMAKE_SOURCE_DIR}/paddle/fluid/platform)
   #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -mavx -O3 -DNDEBUG )
   set(XPU_CXX_FLAGS  -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer  -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
   #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
@@ -107,7 +107,7 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
     # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11 
     ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
-        -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.bin.o.sec /workspace/paddle/Paddle/paddle/fluid/operators/elementwise/${kernel_name}.xpu
+        -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.bin.o.sec ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/elementwise/${kernel_name}.xpu
         --xpu-device-only -c -v 
     COMMAND
       ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec  kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
@@ -135,7 +135,7 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
     # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11 
     ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
-        -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.host.o /workspace/paddle/Paddle/paddle/fluid/operators/elementwise/${kernel_name}.xpu
+        -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.host.o ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/elementwise/${kernel_name}.xpu
         --xpu-host-only -c -v 
     # COMMAND
     #   ${CMAKE_COMMAND} -E cmake_depends "Unix Makefiles" ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}
@@ -263,42 +263,42 @@ macro(xpu_add_library TARGET_NAME)
 endmacro()
 
 # XPU2 PATH
-if(NOT DEFINED ENV{XPU2_PATH})
-    set(XPU2_PATH "/workspace/paddle/xpu-demo/XTDK" CACHE PATH "Path to which XPU2 has been installed")
-    set(XPU_CLANG_PATH ${XPU2_PATH}/bin/clang CACHE PATH "Path to which XPU2 CLANG has been installed")
-else()
-    set(XPU2_PATH $ENV{XPU2_PATH} CACHE PATH "Path to which ROCm has been installed")
-    set(XPU_CLANG_PATH ${XPU2_PATH}/bin/clang CACHE PATH "Path to which XPU2 CLANG has been installed")
-endif()
-set(CMAKE_MODULE_PATH "${XPU2_CLANG_PATH}/cmake" ${CMAKE_MODULE_PATH})
+# if(NOT DEFINED ENV{XPU2_PATH})
+#     set(XPU2_PATH "/workspace/paddle/xpu-demo/XTDK" CACHE PATH "Path to which XPU2 has been installed")
+#     set(XPU_CLANG_PATH ${XPU2_PATH}/bin/clang CACHE PATH "Path to which XPU2 CLANG has been installed")
+# else()
+#     set(XPU2_PATH $ENV{XPU2_PATH} CACHE PATH "Path to which ROCm has been installed")
+#     set(XPU_CLANG_PATH ${XPU2_PATH}/bin/clang CACHE PATH "Path to which XPU2 CLANG has been installed")
+# endif()
+# set(CMAKE_MODULE_PATH "${XPU2_CLANG_PATH}/cmake" ${CMAKE_MODULE_PATH})
 
-# define XPU_CXX_FLAGS
-list(APPEND XPU_CFLAGS -fPIC)
-list(APPEND XPU_CFLAGS --sysroot = /opt/compiler/gcc-8.2)
-list(APPEND XPU_CFLAGS -std=c++11)
-list(APPEND XPU_CFLAGS -O2)
-list(APPEND XPU_CFLAGS -g)
-list(APPEND XPU_CFLAGS -mcpu=xpu2)
-list(APPEND XPU_CFLAGS --target=x86_64-linux-gnu)
-list(APPEND XPU_CFLAGS -v)
-list(APPEND XPU_CFLAGS --dyld-prefix=/opt/compiler/gcc-8.2)
-list(APPEND XPU_CFLAGS -fno-builtin)
-list(APPEND XPU_CFLAGS -Wno-dev)
+# # define XPU_CXX_FLAGS
+# list(APPEND XPU_CFLAGS -fPIC)
+# list(APPEND XPU_CFLAGS --sysroot = /opt/compiler/gcc-8.2)
+# list(APPEND XPU_CFLAGS -std=c++11)
+# list(APPEND XPU_CFLAGS -O2)
+# list(APPEND XPU_CFLAGS -g)
+# list(APPEND XPU_CFLAGS -mcpu=xpu2)
+# list(APPEND XPU_CFLAGS --target=x86_64-linux-gnu)
+# list(APPEND XPU_CFLAGS -v)
+# list(APPEND XPU_CFLAGS --dyld-prefix=/opt/compiler/gcc-8.2)
+# list(APPEND XPU_CFLAGS -fno-builtin)
+# list(APPEND XPU_CFLAGS -Wno-dev)
 
-set(XPU_XPUCC_FLAGS ${XPU_CFLAGS})
+# set(XPU_XPUCC_FLAGS ${XPU_CFLAGS})
 
 # set HIP link libs
-set(xpuapi_library_name xpuapi)
-message(STATUS "XPU API library name: ${xpuapi_library_name}")
-# link in the generic.cmake
-find_library(XPU2_CLANG_API_LIB ${xpuapi_library_name} HINTS ${XPU2_PATH}/shlib)
-message(STATUS "XPU2_CLANG_API_LIB: ${XPU2_CLANG_API_LIB}")
+# set(xpuapi_library_name xpuapi)
+# message(STATUS "XPU API library name: ${xpuapi_library_name}")
+# # link in the generic.cmake
+# find_library(XPU2_CLANG_API_LIB ${xpuapi_library_name} HINTS ${XPU2_PATH}/shlib)
+# message(STATUS "XPU2_CLANG_API_LIB: ${XPU2_CLANG_API_LIB}")
 
-set(xpurt_library_name xpurt)
-message(STATUS "XPU RT library name: ${xpurt_library_name}")
-# link in the generic.cmake
-find_library(XPU2_CLANG_RT_LIB ${xpurt_library_name} HINTS ${XPU2_PATH}/runtime/shlib)
-message(STATUS "XPU2_CLANG_RT_LIB: ${XPU2_CLANG_RT_LIB}")
+# set(xpurt_library_name xpurt)
+# message(STATUS "XPU RT library name: ${xpurt_library_name}")
+# # link in the generic.cmake
+# find_library(XPU2_CLANG_RT_LIB ${xpurt_library_name} HINTS ${XPU2_PATH}/runtime/shlib)
+# message(STATUS "XPU2_CLANG_RT_LIB: ${XPU2_CLANG_RT_LIB}")
 
 # # Ensure that xpu/api.h can be included without dependency errors.
 # file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")

From d473766cd70edd5db79f59dfc3596a535d93985f Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Fri, 26 Nov 2021 03:01:08 +0000
Subject: [PATCH 14/41] update the fuse operation in WITH_XPU and WITH_XPU2

---
 CMakeLists.txt        | 7 ++++++-
 cmake/operators.cmake | 9 +++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3226d5a52461c..f3dd9deb67fc1 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -278,6 +278,11 @@ if (WITH_XPU2 AND NOT WITH_XPU)
         "Enable XPU when compiling with XPU2" FORCE)
 endif()
 
+if (WITH_XPU AND NOT WITH_XPU2)
+    set(WITH_XPU2 OFF CACHE STRING
+        "Disable XPU2 when compiling with XPU" FORCE)
+endif()
+
 #
 if (NOT WITH_XPU AND WITH_XPU_BKCL)
     MESSAGE(WARNING
@@ -321,7 +326,7 @@ endif(WITH_ROCM)
 
 if(WITH_XPU2)
     include(xpu2)
-endif(WITH_ROCM)
+endif()
 
 if (NOT WITH_ROCM AND WITH_RCCL)
     MESSAGE(WARNING
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 60181f22a1f7b..c2f44c8d024dc 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -356,17 +356,18 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
     endif()
 
-    if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0 AND ${xpu2_cc_srcs_len} EQUAL 0)
-        #message(STATUS "lxd_debug: ${TARGET} op in XPU1")
+    if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0 AND ${xpu2_cc_srcs_len} EQUAL 0) 
+        message(STATUS "lxd_debug: ${TARGET} op in XPU1")
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
-    #message(STATUS "lxd_debug: cmake source dir is: ${CMAKE_SOURCE_DIR}")
-
     if (WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)
+        message(STATUS "lxd_debug: ${TARGET} op in XPU2")
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
+    
+
     if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
         file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT)
         # It is different from the logic above, becareful

From 78af7ff523c940126ee10971c6e35ac0320d7fbb Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Mon, 6 Dec 2021 02:53:23 +0000
Subject: [PATCH 15/41] update

---
 cmake/operators.cmake                         | 22 ++---
 cmake/xpu2.cmake                              | 74 +++++------------
 paddle/fluid/framework/tensor_impl.h          |  8 +-
 paddle/fluid/imperative/prepared_operator.cc  |  2 +
 .../elementwise/elementwise_add_op.h          | 21 ++---
 .../elementwise/elementwise_add_op.xpu        | 58 ++++---------
 .../compute_primitives_xpu2.h                 | 82 +++++++++----------
 .../datamover_primitives_xpu2.h               | 68 ++++++++-------
 .../kernel_primitives/functor_primitives.h    | 69 ++++++++--------
 9 files changed, 159 insertions(+), 245 deletions(-)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index c2f44c8d024dc..b77644081c345 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -134,6 +134,8 @@ function(op_library TARGET)
                 list(APPEND xpu_cc_srcs ${src})
             elseif(WITH_XPU2 AND ${src} MATCHES ".*\\.xpu$")
                 list(APPEND xpu2_cc_srcs ${src})
+            elseif(WITH_XPU2 AND ${src} MATCHES ".*_op_kps.cc$")
+                list(APPEND xpu2_cc_srcs ${src})
             elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
                 list(APPEND npu_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cc$")
@@ -148,8 +150,6 @@ function(op_library TARGET)
     list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
     if(WITH_XPU AND WITH_XPU2)
         if(${xpu2_cc_srcs_len})
-            # message(STATUS "lxd_debug: >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>delete duplicate op in xpu")
-            # message(STATUS " xpu ${xpu_cc_srcs} xpu2 ${xpu2_cc_srcs}")
         endif()
     endif()
 
@@ -161,8 +161,6 @@ function(op_library TARGET)
             if(WITH_XPU)
                 #TODO
                 if(${xpu_cc_srcs} MATCHES "elementwise_add_op_xpu.cc")
-                    # delete it from the xpu_cc_srcs
-                    #list(REMOVE_ITEM ${xpu_cc_srcs} ${op_name}_xpu.cc)
                     list(REMOVE_ITEM xpu_cc_srcs "elementwise_add_op_xpu.cc")
                 endif()
             endif()
@@ -171,13 +169,10 @@ function(op_library TARGET)
 
     if(WITH_XPU AND WITH_XPU2)
         if(${xpu2_cc_srcs_len})
-            # message(STATUS "lxd_debug: <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<delete duplicate op in xpu")
-            # message(STATUS " xpu ${xpu_cc_srcs} xpu2 ${xpu2_cc_srcs}")
         endif()
     endif()
     list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
     list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
-    # message(STATUS "lxd_debug: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! xpu ${xpu_cc_srcs} xpu2 ${xpu2_cc_srcs}")
 
     list(LENGTH cc_srcs cc_srcs_len)
     if (${cc_srcs_len} EQUAL 0)
@@ -250,7 +245,7 @@ function(op_library TARGET)
         # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
         if(WITH_UNITY_BUILD AND op_library_UNITY)
             # Combine the cc source files.
-            compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu2_cc_srcs} ${npu_cc_srcs})
+            compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs})
             if(TARGET ${UNITY_TARGET})
                 # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
                 target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
@@ -261,7 +256,7 @@ function(op_library TARGET)
             # Add alias library to handle dependencies.
             add_library(${TARGET} ALIAS ${UNITY_TARGET})
         else()
-            cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu2_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS}
+            cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
         endif()
     endif()
@@ -356,18 +351,17 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
     endif()
 
-    if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0 AND ${xpu2_cc_srcs_len} EQUAL 0) 
-        message(STATUS "lxd_debug: ${TARGET} op in XPU1")
+    if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0 AND ${xpu2_cc_srcs_len} EQUAL 0)
+        #message(STATUS "lxd_debug: ${TARGET} op in XPU1")
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
+    #message(STATUS "lxd_debug: cmake source dir is: ${CMAKE_SOURCE_DIR}")
+
     if (WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)
-        message(STATUS "lxd_debug: ${TARGET} op in XPU2")
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
-    
-
     if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
         file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT)
         # It is different from the logic above, becareful
diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
index ca97ae25315a4..e5eba69e29584 100644
--- a/cmake/xpu2.cmake
+++ b/cmake/xpu2.cmake
@@ -83,8 +83,6 @@ macro(compile_kernel COMPILE_ARGS)
     #target_link_libraries(${kernel_target} ${xpu_add_library_DEPENDS})
   endif()
 
-  # set(arg_rule ${rule})
-  # separate_arguments(arg_rule)
   set(arg_device_o_extra_flags ${device_o_extra_flags})
   separate_arguments(arg_device_o_extra_flags)
   set(arg_host_o_extra_flags ${host_o_extra_flags})
@@ -97,7 +95,7 @@ macro(compile_kernel COMPILE_ARGS)
   #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -mavx -O3 -DNDEBUG )
   set(XPU_CXX_FLAGS  -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer  -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
   #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
-  set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU2 -DXBYAK64 -DXBYAK_NO_OP_NAMES)
+  set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU2 -DPADDLE_WITH_XPU -DXBYAK64 -DXBYAK_NO_OP_NAMES)
 
   add_custom_command(
     OUTPUT
@@ -107,14 +105,10 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
     # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11 
     ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
-        -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.bin.o.sec ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/elementwise/${kernel_name}.xpu
+       -I.  -o kernel_build/${kernel_name}.bin.o.sec ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/elementwise/${kernel_name}.xpu
         --xpu-device-only -c -v 
     COMMAND
       ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec  kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
-    # COMMAND
-    #   ${CMAKE_COMMAND} -E cmake_depends "Unix Makefiles" ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}
-    #     ${CMAKE_BINARY_DIR} ${CMAKE_BINARY_DIR}
-    #     ${CMAKE_BINARY_DIR}/CMakeFiles/${kernel_target}.dir/DependInfo.cmake --color=$(COLOR)
     WORKING_DIRECTORY
       ${CMAKE_CURRENT_BINARY_DIR}
     DEPENDS
@@ -124,7 +118,6 @@ macro(compile_kernel COMPILE_ARGS)
     VERBATIM
     )
     # TODO attention here
-    #set(xpu_kernel_depends ${kernel_name}_depends)
     list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o)
 
   add_custom_command(
@@ -135,12 +128,8 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
     # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11 
     ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
-        -I${XTDK_DIR}/include -I.  -o kernel_build/${kernel_name}.host.o ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/elementwise/${kernel_name}.xpu
+        -I.  -o kernel_build/${kernel_name}.host.o ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/elementwise/${kernel_name}.xpu
         --xpu-host-only -c -v 
-    # COMMAND
-    #   ${CMAKE_COMMAND} -E cmake_depends "Unix Makefiles" ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}
-    #     ${CMAKE_BINARY_DIR} ${CMAKE_BINARY_DIR}
-    #     ${CMAKE_BINARY_DIR}/CMakeFiles/${kernel_target}.dir/DependInfo.cmake --color=$(COLOR)
     WORKING_DIRECTORY
       ${CMAKE_CURRENT_BINARY_DIR}
     DEPENDS
@@ -152,29 +141,6 @@ macro(compile_kernel COMPILE_ARGS)
     list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o)
 endmacro()
 
-# macro(__compile_kernel_with_rules kernel_path kernel_name rules_path device_o_extra_flags host_o_extra_flags xpu_1_or_2 cc_depends)
-#   file(STRINGS ${rules_path} rules)
-#   foreach(rule IN LISTS rules)
-#     message(STATUS "  Instantiate with '${rule}'")
-#     execute_process(
-#       COMMAND
-#         bash "-c" "echo -n ${rule} | md5sum | cut -c1-6"
-#       OUTPUT_VARIABLE
-#         rule_md5
-#       OUTPUT_STRIP_TRAILING_WHITESPACE
-#       )
-
-#     set(kernel_name_md5 ${kernel_name}_${rule_md5})
-#     compile_kernel(${kernel_path} ${kernel_name_md5} ${rule} ${device_o_extra_flags} ${host_o_extra_flags} ${xpu_1_or_2} ${cc_depends})
-#   endforeach()
-# endmacro()
-
-# macro(compile_kernel_with_rules kernel_path kernel_name rules_path device_o_extra_flags host_o_extra_flags xpu_1_or_2 cc_depends)
-#   # XXX: reconfigure if file |rules_path| was modified
-#   set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${rules_path})
-#   __compile_kernel_with_rules(${kernel_path} ${kernel_name} ${rules_path} ${device_o_extra_flags} ${host_o_extra_flags} ${xpu_1_or_2} ${cc_depends})
-# endmacro()
-
 ###############################################################################
 # XPU_ADD_LIBRARY
 ###############################################################################
@@ -232,31 +198,33 @@ macro(xpu_add_library TARGET_NAME)
                 ${CMAKE_CURRENT_BINARY_DIR}
             DEPENDS
                 ${xpu_kernel_depends}
-                ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+                #${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
             COMMENT
                 ${xpu_target}_src
             VERBATIM
             )
 
-        add_custom_command(
-            OUTPUT
-            ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
-            COMMAND
-                ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends}
-            WORKING_DIRECTORY
-                ${CMAKE_CURRENT_BINARY_DIR}
-            DEPENDS
-                ${xpu_kernel_depends}
-            COMMENT
-                ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
-            VERBATIM
-            ) 
+        # add_custom_command(
+        #     OUTPUT
+        #     ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+        #     COMMAND
+        #         ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends}
+        #     WORKING_DIRECTORY
+        #         ${CMAKE_CURRENT_BINARY_DIR}
+        #     DEPENDS
+        #         ${xpu_kernel_depends}
+        #     COMMENT
+        #         ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+        #     VERBATIM
+        #     ) 
         
         # if(${cc_srcs_depends_num})
         #   add_dependencies(${xpu_target}_kernel ${cc_srcs_depends})
         # endif()
         add_library(${xpu_target} STATIC ${cc_kernel_lists})
-        target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
+        add_dependencies(${xpu_target} ${xpu_target}_src)
+        #target_link_libraries(${xpu_target} ${xpu_target}_src)
+        #target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
     else()
         add_library(${xpu_target} STATIC ${cc_kernel_lists})
     endif()
@@ -304,4 +272,4 @@ endmacro()
 # file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")
 # add_library(xpu_headers_dummy STATIC ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc)
 # add_dependencies(xpu_headers_dummy extern_xpu)
-# link_libraries(xpu_headers_dummy)
\ No newline at end of file
+# link_libraries(xpu_headers_dummy)
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 986551b935e88..c7746e7e04cc0 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -28,9 +28,7 @@ inline const T* Tensor::data() const {
   PADDLE_ENFORCE_EQ(
       valid, true,
       platform::errors::InvalidArgument(
-          "Tensor holds the wrong type, it holds %s, but desires to be %s.",
-          DataTypeToString(type_),
-          DataTypeToString(DataTypeTrait<T>::DataType())));
+          "Tensor holds the wrong type, it holds, but desires to be."));
 
   return reinterpret_cast<const T*>(
       reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
@@ -46,9 +44,7 @@ inline T* Tensor::data() {
   PADDLE_ENFORCE_EQ(
       valid, true,
       platform::errors::InvalidArgument(
-          "Tensor holds the wrong type, it holds %s, but desires to be %s",
-          DataTypeToString(type_),
-          DataTypeToString(DataTypeTrait<T>::DataType())));
+          "Tensor holds the wrong type, it holds, but desires to be"));
 
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                               offset_);
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 32ee8aceee85c..7310ad02898e1 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -202,6 +202,8 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
             << ", fallbacking to CPU one!";
     expected_kernel_key.place_ = platform::CPUPlace();
     kernel_iter = kernels.find(expected_kernel_key);
+  } else {
+    VLOG(3) << "This is XPU : " << op.Type();
   }
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index f0292ffe17869..0ce4ca665dd9d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -20,13 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
-#include "paddle/fluid/framework/pten_utils.h"
-
-// only can include the headers in paddle/pten/include dirs
-#include "paddle/pten/api/lib/utils/tensor_utils.h"
-#include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
-
 namespace paddle {
 namespace operators {
 
@@ -62,14 +55,12 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
     auto *y = ctx.Input<framework::LoDTensor>("Y");
     auto *z = ctx.Output<framework::LoDTensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
-
-    auto &dev_ctx = ctx.device_context<DeviceContext>();
-    int axis = ctx.Attr<int>("axis");
-    auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
-    auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
-    auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::ElementwiseAdd<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis,
-                            pt_z.get());
+    if (x->dims() == y->dims()) {
+      SameDimsElemwiseAdd<DeviceContext, T> LaunchElementwiseCpuKernel;
+      LaunchElementwiseCpuKernel(ctx, x, y, z);
+    } else {
+      LaunchBroadcastElementwiseCpuKernel<DeviceContext, T>(ctx, x, y, z);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
index 95989e8c4200a..ceeedaeab1ab1 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef PADDLE_WITH_XPU
 
 #if defined(__CUDA_ARCH__)
   #undef __CUDA_ARCH__
@@ -28,60 +29,33 @@ limitations under the License. */
   #undef __NVCC__
 #endif
 
+#if defined(EIGEN_HAS_BUILTIN_INT128)
+  #undef EIGEN_HAS_BUILTIN_INT128
+#endif
 
-#include <string>
-#include <vector>
-#include "paddle/fluid/platform/aligned_vector.h"
-#include "paddle/fluid/platform/function_traits.h"
-#include "paddle/fluid/framework/op_registry.h" //TODO
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-
+#include <xpu/runtime.h>
 #include "xpu/kernel/cluster_header.h"
 #include "xpu/kernel/debug.h"
 #include "xpu/kernel/math.h"
-#include <xpu/runtime.h>
+#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include <memory>
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
 
 
 namespace paddle {
 namespace operators {
 
-//__global__ void elementwise() {
-  //TODO
-//}
-
-void ComputeTest(){
-
+void ElementwiseAddXPU2Compute(const framework::ExecutionContext& ctx){
+  std::cout<<"lxd_debug: XPU2 forward element_add !"<<std::endl;
 }
 
-template <typename T>
-class ElementwiseAddXPU2Kernel : public framework::OpKernel<T> {
-
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    std::cout<<"lxd_debug: XPU2 forward element_add !"<<std::endl;
-    ComputeTest();
-  }
-};
-
-template <typename T>
-class ElementwiseAddGradXPU2Kernel : public ElemwiseGradKernel<T> {
-
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    std::cout<<"lxd_debug: XPU2 backward element_add !"<<std::endl;
-  }
-};
+void ElementwiseAddGradXPU2Compute(const framework::ExecutionContext& ctx){
+  std::cout<<"lxd_debug: XPU2 backward element_add !"<<std::endl;
+}
 
 }  // namespace operators
 }  // namespace paddle
 
-namespace ops = paddle::operators;
-
-
-REGISTER_OP_XPU_KERNEL(elementwise_add, ops::ElementwiseAddXPU2Kernel<float>);
-                       
-REGISTER_OP_XPU_KERNEL(
-    elementwise_add_grad, ops::ElementwiseAddGradXPU2Kernel<float>);
-
+#endif
diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h b/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h
index 3235591580916..26678dd7d644c 100644
--- a/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h
+++ b/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h
@@ -48,31 +48,31 @@ static inline __device__ void sync_all() {
 #define ncores 64
 template <typename T, typename OpFunc, int VecSize>
 __device__ void BlockXReduce(T* data, OpFunc reducer) {
-  __shared__ T sum_array[ncores * VecSize];
-  int core_idx = core_id() * VecSize;
-  mfence();
-  sync_all();
-
-#pragma unroll
-  for (int i = 0; i < VecSize; i++) {
-    mfence();
-    sum_array[core_idx + i] = data[i];
-    mfence();
-    data[i] = 0;
-  }
-  sync_all();
-#pragma unroll
-  for (int i = 0; i < VecSize; i++) {
-#pragma unroll
-    for (int j = 0; j < ncores; j++) {
-      mfence();
-      T tmp = sum_array[j * VecSize + i];
-      mfence();
-      data[i] = reducer(data[i], tmp);
-      mfence();
-    }
-  }
-  sync_all();
+  //  __shared__ T sum_array[ncores * VecSize];
+  //  int core_idx = core_id() * VecSize;
+  //  mfence();
+  //  sync_all();
+  //
+  // #pragma unroll
+  //  for (int i = 0; i < VecSize; i++) {
+  //    mfence();
+  //    sum_array[core_idx + i] = data[i];
+  //    mfence();
+  //    data[i] = 0;
+  //  }
+  //  sync_all();
+  // #pragma unroll
+  //  for (int i = 0; i < VecSize; i++) {
+  // #pragma unroll
+  //    for (int j = 0; j < ncores; j++) {
+  //      mfence();
+  //      T tmp = sum_array[j * VecSize + i];
+  //      mfence();
+  //      data[i] = reducer(data[i], tmp);
+  //      mfence();
+  //    }
+  //  }
+  //  sync_all();
 }
 #undef ncores
 
@@ -104,8 +104,7 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) {
  */
 template <typename InT, typename OutT, int NX, int NY, int BlockSize,
           class OpFunc>
-__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
-                                                 OpFunc compute) {
+__device__ void ElementwiseUnary(OutT* out, const InT* in, OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; idx++) {
     out[idx] = static_cast<OutT>(compute(in[idx]));
@@ -139,9 +138,8 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
  */
 template <typename InT, typename OutT, int NX, int NY, int BlockSize,
           class OpFunc>
-__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
-                                                  const InT* in2,
-                                                  OpFunc compute) {
+__device__ void ElementwiseBinary(OutT* out, const InT* in1, const InT* in2,
+                                  OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; ++idx) {
     out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx]));
@@ -177,10 +175,8 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
  */
 template <typename InT, typename OutT, int NX, int NY, int BlockSize,
           class OpFunc>
-__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
-                                                   const InT* in2,
-                                                   const InT* in3,
-                                                   OpFunc compute) {
+__device__ void ElementwiseTernary(OutT* out, const InT* in1, const InT* in2,
+                                   const InT* in3, OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; ++idx) {
     out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));
@@ -214,8 +210,7 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
  */
 template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity,
           class OpFunc>
-__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
-                                               OpFunc compute) {
+__device__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], OpFunc compute) {
   __local__ InT args[Arity];
 #pragma unroll
   for (int idx = 0; idx < NX * NY; ++idx) {
@@ -255,8 +250,8 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
  */
 template <typename InT, typename OutT, int NX, int NY, int BlockSize,
           class OpFunc>
-__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
-                                            const InT* in2, OpFunc compute) {
+__device__ void CycleBinary(OutT* out, const InT* in1, const InT* in2,
+                            OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX; idx++) {
 #pragma unroll
@@ -294,12 +289,11 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
  * reducer: Compute function which was declared like ReduceFunctor<InT>().
  * reduce_last_dim: if the last dim gets involved in reduction.
  */
-template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor,
+template <typename T, int NX, int NY, int BlockSize, typename ReduceFunctor,
           details::ReduceMode Mode>
-__device__ __forceinline__ void Reduce(T* out, const T* in,
-                                       ReduceFunctor reducer,
-                                       bool reduce_last_dim) {
-  if (Mode == kGlobalMode) {
+__device__ void Reduce(T* out, const T* in, ReduceFunctor reducer,
+                       bool reduce_last_dim) {
+  if (Mode == details::kGlobalMode) {
 #pragma unroll
     for (int i = 0; i < NY; ++i) {
 #pragma unroll
@@ -307,7 +301,7 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
         out[i] = reducer(out[i], in[i * NX + j]);
       }
     }
-    BlockXReduce<T, OpFunc, NY>(out, reducer);
+    // BlockXReduce<T, ReduceFunctor, NY>(out, reducer);
   } else {  // else  kLocalMode
 #pragma unroll
     for (int i = 0; i < NY; ++i) {
diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h
index b27ba27b3c6f1..6aef07d1bd463 100644
--- a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h
+++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h
@@ -56,11 +56,11 @@ struct BroadcastConfig {
     }
 
     for (int i = 1; i < dim_size - 1; ++i) {
-      strides_out[dim_size - i - 1] = std::accumulate(
-          out_dims.begin(), out_dims.begin() + i, 1, std::multiplies<int64_t>())
-          strides_in[dim_size - i - 1] =
-              std::accumulate(in_dims.begin(), in_dims.begin() + i, 1,
-                              std::multiplies<int64_t>())
+      strides_out[dim_size - i - 1] =
+          std::accumulate(out_dims.begin(), out_dims.begin() + i, 1,
+                          std::multiplies<int64_t>());
+      strides_in[dim_size - i - 1] = std::accumulate(
+          in_dims.begin(), in_dims.begin() + i, 1, std::multiplies<int64_t>());
     }
 
     memcpy(stride_in, strides_in.data(), kDims * sizeof(uint32_t));
@@ -99,12 +99,11 @@ struct BroadcastConfig {
  */
 template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
           bool IsBoundary = false>
-__device__ __forceinline__ void ReadData(Ty* dst, const Tx _global_ptr_* src,
-                                         int size_nx, int size_ny,
-                                         int stride_nx, int stride_ny) {
+__device__ void ReadData(Ty* dst, const Tx _global_ptr_* src, int size_nx,
+                         int size_ny, int stride_nx, int stride_ny) {
   int thread_offset = core_id();
   int left_size_nx = size_nx - thread_offset;
-  __local__ T in_temp[1];
+  __local__ Tx in_temp[1];
   // Each branch is added for better performance
   if (NX == 1 && NY == 1) {  // for NX == 1 and NY == 1
     if (IsBoundary) {
@@ -168,7 +167,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx _global_ptr_* src,
  * init_data: Initial value.
  */
 template <typename T, int NX>
-__device__ __forceinline__ void Init(T* dst, T init_data) {
+__device__ void Init(T* dst, T init_data) {
 #pragma unroll
   for (int i = 0; i < NX; i++) {
     dst[i] = init_data;
@@ -197,8 +196,7 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
  * size: The current block needs to load size data continuously.
  */
 template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
-__device__ __forceinline__ void ReadData(T* dst, const T _global_ptr_* src,
-                                         int num) {
+__device__ void ReadData(T* dst, const T _global_ptr_* src, int num) {
   int thread_offset = core_id() * NX;
   __local__ T in_temp[1];
   if (IsBoundary) {  // core_num() * NX > num
@@ -241,10 +239,10 @@ __device__ __forceinline__ void ReadData(T* dst, const T _global_ptr_* src,
  */
 template <typename T, int NX, int NY, int BlockSize, int Rank,
           bool IsBoundary = false>
-__device__ __forceinline__ void ReadDataBc(
-    T* dst, const T _global_ptr_* src, uint32_t block_offset,
-    details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx,
-    int stride_ny) {
+__device__ void ReadDataBc(T* dst, const T _global_ptr_* src,
+                           uint32_t block_offset,
+                           details::BroadcastConfig<Rank> config,
+                           int total_num_output, int stride_nx, int stride_ny) {
   uint32_t thread_offset = block_offset + core_id();
   uint32_t index_src = 0;
   __local__ T in_temp[1];
@@ -256,7 +254,7 @@ __device__ __forceinline__ void ReadDataBc(
       uint32_t index_output = thread_offset + ny * stride_ny + nx * stride_nx;
       index_src = 0;
       if (IsBoundary) {
-        if (index_output >= total_num_output) {
+        if (index_output >= (uint32_t)total_num_output) {
           break;
         }
       }
@@ -305,10 +303,10 @@ __device__ __forceinline__ void ReadDataBc(
  */
 template <typename T, int NX, int NY, int BlockSize, int Rank,
           typename IndexCal, bool IsBoundary = false>
-__device__ __forceinline__ void ReadDataReduce(
-    T* dst, const T _global_ptr_* src, int block_offset,
-    const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
-    int stride_ny, bool reduce_last_dim) {
+__device__ void ReadDataReduce(T* dst, const T _global_ptr_* src,
+                               int block_offset, const IndexCal& index_cal,
+                               int size_nx, int size_ny, int stride_nx,
+                               int stride_ny, bool reduce_last_dim) {
   __local__ T in_temp[1];
   int thread_offset = 0;
   int left_size_nx = size_nx;
@@ -421,9 +419,8 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
  */
 template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
           bool IsBoundary = false>
-__device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
-                                          int size_nx, int size_ny,
-                                          int stride_nx, int stride_ny) {
+__device__ void WriteData(Ty _global_ptr_* dst, const Tx* src, int size_nx,
+                          int size_ny, int stride_nx, int stride_ny) {
   int thread_offset = core_id();
   int left_size_nx = size_nx - thread_offset;
   __local__ Ty in_temp[1];
@@ -433,11 +430,11 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
     if (IsBoundary) {
       if (left_size_nx > 0) {
         in_temp[0] = static_cast<Ty>(src[0]);
-        LM2GM(in_temp, dst + thread_offset, sizeof(T));
+        LM2GM(in_temp, dst + thread_offset, sizeof(Ty));
       }
     } else {
       in_temp[0] = static_cast<Ty>(src[0]);
-      LM2GM(in_temp, dst + thread_offset, sizeof(T));
+      LM2GM(in_temp, dst + thread_offset, sizeof(Ty));
     }
   } else if (NX == 1) {
 #pragma unroll
@@ -449,7 +446,7 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
       }
 
       in_temp[0] = static_cast<Ty>(src[idy]);
-      LM2GM(in_temp, dst + thread_offset + idy * stride_ny, sizeof(T));
+      LM2GM(in_temp, dst + thread_offset + idy * stride_ny, sizeof(Ty));
     }
   } else if (NY == 1) {  // for NY == 1 and NX != 1
 #pragma unroll
@@ -461,7 +458,7 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
       }
 
       in_temp[0] = static_cast<Ty>(src[idx]);
-      LM2GM(in_temp, dst + thread_offset + idx * stride_nx, sizeof(T));
+      LM2GM(in_temp, dst + thread_offset + idx * stride_nx, sizeof(Ty));
     }
   } else {  // for NX != 1 and NY != 1
 #pragma unroll
@@ -480,7 +477,7 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
         }
         in_temp[0] = static_cast<Ty>(src[idx + idy * NX]);
         LM2GM(in_temp, dst + thread_offset + idx * stride_nx + idy * stride_ny,
-              sizeof(T));
+              sizeof(Ty));
       }
     }
   }
@@ -498,7 +495,7 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
  * init_data: The register pointer of init data, the size is NX.
  */
 template <typename T, int NX, bool IsBoundary = false>
-__device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
+__device__ void Init(T* dst, T* init_data, int num) {
 #pragma unroll
   for (int i = 0; i < NX; i++) {
     if (IsBoundary) {
@@ -535,9 +532,10 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
  */
 template <typename T, int NX, int NY, int BlockSize, int Rank,
           bool IsBoundary = false>
-__device__ __forceinline__ void ReadDataBc(
-    T* dst, const T _global_ptr_* src, uint32_t block_offset,
-    details::BroadcastConfig<Rank> config, int total_num_output) {
+__device__ void ReadDataBc(T* dst, const T _global_ptr_* src,
+                           uint32_t block_offset,
+                           details::BroadcastConfig<Rank> config,
+                           int total_num_output) {
   uint32_t thread_offset = block_offset + core_id() * NX;
   uint32_t index_src = 0;
   __local__ T in_temp[1];
@@ -547,7 +545,7 @@ __device__ __forceinline__ void ReadDataBc(
     uint32_t index_output = thread_offset + nx;
     index_src = 0;
     if (IsBoundary) {
-      if (index_output >= total_num_output) {
+      if (index_output >= (uint32_t)total_num_output) {
         break;
       }
     }
@@ -558,7 +556,7 @@ __device__ __forceinline__ void ReadDataBc(
       index_src += (tmp % config.shape_in[i]) * config.stride_in[i];
     }
     GM2LM(src + index_src, in_temp, sizeof(T));
-    dst[nx + ny * NX] = in_temp[0];
+    dst[nx] = in_temp[0];
   }
 }
 
diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h
index 3fce3b1c0920a..5a609bd513ab1 100644
--- a/paddle/fluid/operators/kernel_primitives/functor_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/functor_primitives.h
@@ -15,27 +15,31 @@
 #pragma once
 
 #include "paddle/fluid/platform/eigen_ext.h"
+#include "xpu/kernel/cluster_header.h"
+#include "xpu/kernel/debug.h"
+#include "xpu/kernel/math.h"
 
 namespace paddle {
 namespace operators {
 namespace kernel_primitives {
 namespace details {
 
-static __device__ __forceinline__ platform::float16 Exp(platform::float16 x) {
-  return ::Eigen::numext::exp(x);
-}
+// static __device__ platform::float16 Exp(platform::float16 x) {
+//  // return 1;//::Eigen::numext::exp(x);
+//  //return static_cast<platform::float16>(1);//::Eigen::numext::log(x);
+//}
 
-static __device__ __forceinline__ float Exp(float x) { return expf(x); }
+// static __device__  float Exp(float x) { return expf(x); }
 
-static __device__ __forceinline__ double Exp(double x) { return exp(x); }
+// static __device__  double Exp(double x) { return exp(x); }
 
-static __device__ __forceinline__ platform::float16 Log(platform::float16 x) {
-  return ::Eigen::numext::log(x);
-}
+// static __device__  platform::float16 Log(platform::float16 x) {
+//  return static_cast<platform::float16>(1);//::Eigen::numext::log(x);
+//}
 
-static __device__ __forceinline__ float Log(float x) { return logf(x); }
+// static __device__  float Log(float x) { return logf(x); }
 
-static __device__ __forceinline__ double Log(double x) { return log(x); }
+// static __device__  double Log(double x) { return log(x); }
 
 }  // namespace details
 
@@ -44,16 +48,16 @@ static __device__ __forceinline__ double Log(double x) { return log(x); }
 /**
  * @brief Default unary exp functor
  */
-template <typename Tx, typename Ty = Tx>
-struct ExpFunctor {
-  HOSTDEVICE inline ExpFunctor() {}
-
-  HOSTDEVICE explicit inline ExpFunctor(int n) {}
-
-  HOSTDEVICE inline Ty operator()(const Tx& x) const {
-    return static_cast<Ty>(details::Exp(x));
-  }
-};
+// template <typename Tx, typename Ty = Tx>
+// struct ExpFunctor {
+//  HOSTDEVICE inline ExpFunctor() {}
+//
+//  HOSTDEVICE explicit inline ExpFunctor(int n) {}
+//
+//  HOSTDEVICE inline Ty operator()(const Tx& x) const {
+//    return static_cast<Ty>(details::Exp(x));
+//  }
+//};
 
 /**
  * @brief Default unary identity functor
@@ -107,9 +111,10 @@ struct SquareFunctor {
  */
 template <typename T>
 struct MinFunctor {
-  inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }
+  inline T initial() { /*return static_cast<T>(std::numeric_limits<T>::max());*/
+  }
 
-  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
+  __device__ T operator()(const T& a, const T& b) const {
     return (b < a) ? b : a;
   }
 };
@@ -120,10 +125,10 @@ struct MinFunctor {
 template <typename T>
 struct MaxFunctor {
   inline T initial() {
-    return static_cast<T>(std::numeric_limits<T>::lowest());
+    // return static_cast<T>(std::numeric_limits<T>::lowest());
   }
 
-  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
+  __device__ T operator()(const T& a, const T& b) const {
     return (b > a) ? b : a;
   }
 };
@@ -135,9 +140,7 @@ template <typename T>
 struct AddFunctor {
   inline T initial() { return static_cast<T>(0.0f); }
 
-  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
-    return b + a;
-  }
+  __device__ T operator()(const T& a, const T& b) const { return b + a; }
 };
 
 /**
@@ -147,9 +150,7 @@ template <typename T>
 struct MulFunctor {
   inline T initial() { return static_cast<T>(1.0f); }
 
-  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
-    return b * a;
-  }
+  __device__ T operator()(const T& a, const T& b) const { return b * a; }
 };
 
 /**
@@ -159,9 +160,7 @@ template <typename T>
 struct LogicalOrFunctor {
   inline T initial() { return static_cast<T>(false); }
 
-  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
-    return b || a;
-  }
+  __device__ T operator()(const T& a, const T& b) const { return b || a; }
 };
 
 /**
@@ -171,9 +170,7 @@ template <typename T>
 struct LogicalAndFunctor {
   inline T initial() { return static_cast<T>(true); }
 
-  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
-    return b && a;
-  }
+  __device__ T operator()(const T& a, const T& b) const { return b && a; }
 };
 
 /**

From bdaa02ce8d138cbc056d984d85f2a845dfd8ccf6 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Tue, 7 Dec 2021 08:51:41 +0000
Subject: [PATCH 16/41] update

---
 cmake/operators.cmake | 20 +++------
 cmake/xpu2.cmake      | 97 ++++++++++++++++---------------------------
 2 files changed, 41 insertions(+), 76 deletions(-)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index b77644081c345..83ba56535147e 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -148,29 +148,22 @@ function(op_library TARGET)
     
     list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
     list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
-    if(WITH_XPU AND WITH_XPU2)
-        if(${xpu2_cc_srcs_len})
-        endif()
-    endif()
 
     #TODO(liuxiandong) 
     if(WITH_XPU2 AND ${xpu2_cc_srcs_len})
         foreach(src ${xpu2_cc_srcs})
+            #message(STATUS "lxd_debug src----------- ${src}")
             get_filename_component(op_name ${src} NAME_WE)
-            message(STATUS "lxd_debug op_name ${op_name}")
+            #message(STATUS "lxd_debug op_name ${op_name}")
             if(WITH_XPU)
-                #TODO
-                if(${xpu_cc_srcs} MATCHES "elementwise_add_op_xpu.cc")
-                    list(REMOVE_ITEM xpu_cc_srcs "elementwise_add_op_xpu.cc")
+                if(xpu_cc_srcs MATCHES ".*_op_xpu.cc$") 
+                    #message(STATUS "the target is matched")
+                    list(REMOVE_ITEM xpu_cc_srcs "${op_name}_xpu.cc")
                 endif()
             endif()
         endforeach()
     endif()
 
-    if(WITH_XPU AND WITH_XPU2)
-        if(${xpu2_cc_srcs_len})
-        endif()
-    endif()
     list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
     list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
 
@@ -239,9 +232,6 @@ function(op_library TARGET)
     elseif (WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)
         xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu2_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps})
     else()
-        # if(WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)
-        #     xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu2_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps})
-        # endif()
         # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
         if(WITH_UNITY_BUILD AND op_library_UNITY)
             # Combine the cc source files.
diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
index e5eba69e29584..550d64b7a33f8 100644
--- a/cmake/xpu2.cmake
+++ b/cmake/xpu2.cmake
@@ -52,13 +52,12 @@ message(STATUS "Build with HOST_SYSROOT=" ${HOST_SYSROOT})
 message(STATUS "Build with HOST_CXX=" ${HOST_CXX})
 message(STATUS "Build with HOST_AR=" ${HOST_AR})
 
-#macro(compile_kernel kernel_path kernel_name device_o_extra_flags host_o_extra_flags xpu_1_or_2 cc_depends)
 macro(compile_kernel COMPILE_ARGS)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs KERNEL XNAME DEVICE HOST XPU DEPENDS)
+  set(multiValueArgs KERNEL DIRPATH XNAME DEVICE HOST XPU DEPENDS)
   cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  set(kernel_path ${xpu_add_library_KERNEL})
+  set(kernel_path ${xpu_add_library_DIRPATH})
   set(kernel_name ${xpu_add_library_XNAME})
   set(device_o_extra_flags ${xpu_add_library_DEVICE})
   set(host_o_extra_flags ${xpu_add_library_HOST})
@@ -105,7 +104,7 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
     # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11 
     ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
-       -I.  -o kernel_build/${kernel_name}.bin.o.sec ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/elementwise/${kernel_name}.xpu
+       -I.  -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu
         --xpu-device-only -c -v 
     COMMAND
       ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec  kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
@@ -128,7 +127,7 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
     # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11 
     ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
-        -I.  -o kernel_build/${kernel_name}.host.o ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/elementwise/${kernel_name}.xpu
+        -I.  -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu
         --xpu-host-only -c -v 
     WORKING_DIRECTORY
       ${CMAKE_CURRENT_BINARY_DIR}
@@ -153,7 +152,6 @@ macro(xpu_add_library TARGET_NAME)
     set(xpu_srcs ${xpu_add_library_STATIC})
     set(xpu_target ${TARGET_NAME})
     set(cc_srcs_depends ${xpu_add_library_DEPENDS})
-    #message(STATUS "lxd_debug: ${xpu_add_library_DEPENDS}---------------------------------")
     
     file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs})
     list(LENGTH xpu_srcs_lists xpu_srcs_lists_num)
@@ -181,16 +179,11 @@ macro(xpu_add_library TARGET_NAME)
             message(STATUS "Process ${xpu_kernel}")
             get_filename_component(kernel_name ${xpu_kernel} NAME_WE)
             get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY)
+            #message(STATUS "lxd_debug PATH ${kernel_dir}")
             #TODO(liuxiandong set default rules)
             set(kernel_rules ${kernel_dir}/${kernel_name}.rules)
-            set(kernel_name ${kernel_name})
-            # if(EXISTS ${kernel_rules})
-            #     # compile_kernel_with_rules(${xpu_kernel} ${kernel_name} ${kernel_rules}
-            #     #     ${XPU1_DEVICE_O_EXTRA_FLAGS} ${XPU1_HOST_O_EXTRA_FLAGS} "xpu2" ${cc_srcs_depends})
-            # else()
-            message(STATUS "lxd_debug: ${cc_srcs_depends}>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
-            compile_kernel(KERNEL ${xpu_kernel} XNAME ${kernel_name} DEVICE ${XPU1_DEVICE_O_EXTRA_FLAGS} HOST ${XPU1_HOST_O_EXTRA_FLAGS} XPU "xpu2" DEPENDS ${cc_srcs_depends})
-            # endif()
+            set(kernel_name ${kernel_name}) #DIRPATH ${kernel_dir}
+            compile_kernel( KERNEL ${xpu_kernel} DIRPATH ${kernel_dir} XNAME ${kernel_name} DEVICE ${XPU1_DEVICE_O_EXTRA_FLAGS} HOST ${XPU1_HOST_O_EXTRA_FLAGS} XPU "xpu2" DEPENDS ${cc_srcs_depends})
         endforeach()
 
         add_custom_target(${xpu_target}_src ALL
@@ -218,58 +211,40 @@ macro(xpu_add_library TARGET_NAME)
         #     VERBATIM
         #     ) 
         
-        # if(${cc_srcs_depends_num})
-        #   add_dependencies(${xpu_target}_kernel ${cc_srcs_depends})
-        # endif()
         add_library(${xpu_target} STATIC ${cc_kernel_lists})
         add_dependencies(${xpu_target} ${xpu_target}_src)
         #target_link_libraries(${xpu_target} ${xpu_target}_src)
         #target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
+
+        # TEST
+        # add_library(${xpu_target}_obj OBJECT ${cc_kernel_lists})
+        #add_dependencies(${xpu_target} ${xpu_target}_src)
+
+        # add_custom_target(${xpu_target} ALL
+        #   WORKING_DIRECTORY
+        #     ${CMAKE_CURRENT_BINARY_DIR}
+        #   DEPENDS
+        #     ${xpu_kernel_depends}
+        #     ${xpu_target}_obj
+        #     ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}.a
+        #   COMMENT
+        #     ${xpu_target}
+        #   VERBATIM
+        #   )
+        # add_custom_command(
+        #   OUTPUT
+        #     ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}.a
+        #   COMMAND
+        #     ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}.a ${xpu_kernel_depends} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${xpu_target}.dir/*.o
+        #   WORKING_DIRECTORY
+        #     ${CMAKE_CURRENT_BINARY_DIR}
+        #   DEPENDS
+        #     ${xpuapi_wrapper_a_depends}
+        #   COMMENT
+        #     ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}.a
+        #   VERBATIM
+        #   )
     else()
         add_library(${xpu_target} STATIC ${cc_kernel_lists})
     endif()
 endmacro()
-
-# XPU2 PATH
-# if(NOT DEFINED ENV{XPU2_PATH})
-#     set(XPU2_PATH "/workspace/paddle/xpu-demo/XTDK" CACHE PATH "Path to which XPU2 has been installed")
-#     set(XPU_CLANG_PATH ${XPU2_PATH}/bin/clang CACHE PATH "Path to which XPU2 CLANG has been installed")
-# else()
-#     set(XPU2_PATH $ENV{XPU2_PATH} CACHE PATH "Path to which ROCm has been installed")
-#     set(XPU_CLANG_PATH ${XPU2_PATH}/bin/clang CACHE PATH "Path to which XPU2 CLANG has been installed")
-# endif()
-# set(CMAKE_MODULE_PATH "${XPU2_CLANG_PATH}/cmake" ${CMAKE_MODULE_PATH})
-
-# # define XPU_CXX_FLAGS
-# list(APPEND XPU_CFLAGS -fPIC)
-# list(APPEND XPU_CFLAGS --sysroot = /opt/compiler/gcc-8.2)
-# list(APPEND XPU_CFLAGS -std=c++11)
-# list(APPEND XPU_CFLAGS -O2)
-# list(APPEND XPU_CFLAGS -g)
-# list(APPEND XPU_CFLAGS -mcpu=xpu2)
-# list(APPEND XPU_CFLAGS --target=x86_64-linux-gnu)
-# list(APPEND XPU_CFLAGS -v)
-# list(APPEND XPU_CFLAGS --dyld-prefix=/opt/compiler/gcc-8.2)
-# list(APPEND XPU_CFLAGS -fno-builtin)
-# list(APPEND XPU_CFLAGS -Wno-dev)
-
-# set(XPU_XPUCC_FLAGS ${XPU_CFLAGS})
-
-# set HIP link libs
-# set(xpuapi_library_name xpuapi)
-# message(STATUS "XPU API library name: ${xpuapi_library_name}")
-# # link in the generic.cmake
-# find_library(XPU2_CLANG_API_LIB ${xpuapi_library_name} HINTS ${XPU2_PATH}/shlib)
-# message(STATUS "XPU2_CLANG_API_LIB: ${XPU2_CLANG_API_LIB}")
-
-# set(xpurt_library_name xpurt)
-# message(STATUS "XPU RT library name: ${xpurt_library_name}")
-# # link in the generic.cmake
-# find_library(XPU2_CLANG_RT_LIB ${xpurt_library_name} HINTS ${XPU2_PATH}/runtime/shlib)
-# message(STATUS "XPU2_CLANG_RT_LIB: ${XPU2_CLANG_RT_LIB}")
-
-# # Ensure that xpu/api.h can be included without dependency errors.
-# file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")
-# add_library(xpu_headers_dummy STATIC ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc)
-# add_dependencies(xpu_headers_dummy extern_xpu)
-# link_libraries(xpu_headers_dummy)

From d8185636be45a28ba484ae419bfdac0a5f317e29 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Tue, 21 Dec 2021 04:12:46 +0000
Subject: [PATCH 17/41] update

---
 cmake/operators.cmake | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 83ba56535147e..470989d38e9f1 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -150,19 +150,19 @@ function(op_library TARGET)
     list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
 
     #TODO(liuxiandong) 
-    if(WITH_XPU2 AND ${xpu2_cc_srcs_len})
-        foreach(src ${xpu2_cc_srcs})
-            #message(STATUS "lxd_debug src----------- ${src}")
-            get_filename_component(op_name ${src} NAME_WE)
-            #message(STATUS "lxd_debug op_name ${op_name}")
-            if(WITH_XPU)
-                if(xpu_cc_srcs MATCHES ".*_op_xpu.cc$") 
-                    #message(STATUS "the target is matched")
-                    list(REMOVE_ITEM xpu_cc_srcs "${op_name}_xpu.cc")
-                endif()
-            endif()
-        endforeach()
-    endif()
+    # if(WITH_XPU2 AND ${xpu2_cc_srcs_len})
+    #     foreach(src ${xpu2_cc_srcs})
+    #         #message(STATUS "lxd_debug src----------- ${src}")
+    #         get_filename_component(op_name ${src} NAME_WE)
+    #         #message(STATUS "lxd_debug op_name ${op_name}")
+    #         if(WITH_XPU)
+    #             if(xpu_cc_srcs MATCHES ".*_op_xpu.cc$") 
+    #                 #message(STATUS "the target is matched")
+    #                 list(REMOVE_ITEM xpu_cc_srcs "${op_name}_xpu.cc")
+    #             endif()
+    #         endif()
+    #     endforeach()
+    # endif()
 
     list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
     list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)

From d461b0a08f1a37f16571d9c1eceb54d28ae21052 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Tue, 21 Dec 2021 04:45:31 +0000
Subject: [PATCH 18/41] fix the merge error

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8edbcd1a49d2f..3b721be887540 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,6 +45,7 @@ option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          OFF)
 option(WITH_TENSORRT    "Compile PaddlePaddle with NVIDIA TensorRT"     OFF)
 option(WITH_XPU         "Compile PaddlePaddle with BAIDU KUNLUN XPU"    OFF)
 option(WITH_XPU2         "Compile PaddlePaddle with BAIDU KUNLUN XPU2"    OFF)
+option(WITH_MLU    "Compile PaddlePaddle with CAMBRICON MLU"     OFF)
 option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode"    OFF)
 option(WITH_ASCEND         "Compile PaddlePaddle with ASCEND"        OFF)
 option(WITH_ROCM        "Compile PaddlePaddle with ROCM platform"       OFF)

From 27ff9d80d64c7d4f0e0c9eb2f6c3b5afa0fdd08e Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Tue, 21 Dec 2021 07:35:15 +0000
Subject: [PATCH 19/41] update

---
 fuck_log | 1349 ------------------------------------------------------
 1 file changed, 1349 deletions(-)
 delete mode 100644 fuck_log

diff --git a/fuck_log b/fuck_log
deleted file mode 100644
index 2678e7a6c5660..0000000000000
--- a/fuck_log
+++ /dev/null
@@ -1,1349 +0,0 @@
-[INFO] Checking merge-conflict files only.
-CRLF end-lines remover...............................(no files to check)Skipped
-yapf.................................................(no files to check)Skipped
-Check for added large files..............................................Passed
-Check for merge conflicts................................................Passed
-Check for broken symlinks................................................Passed
-Detect Private Key...................................(no files to check)Skipped
-Fix End of Files.........................................................Passed
-clang-format.............................................................Passed
-cpplint..................................................................Failed
-hookid: cpplint-cpp-source
-
-Done processing CMakeLists.txt
-Done processing cmake/configure.cmake
-Done processing cmake/external/cinn.cmake
-Done processing cmake/external/concurrentqueue.cmake
-Done processing cmake/external/cryptopp.cmake
-Done processing cmake/external/gtest.cmake
-Done processing cmake/external/llvm.cmake
-Done processing cmake/external/mkldnn.cmake
-Done processing cmake/inference_lib.cmake
-Done processing cmake/infrt_lib.cmake
-Done processing cmake/neuware.cmake
-Done processing cmake/operators.cmake
-Done processing cmake/pten.cmake
-Done processing cmake/third_party.cmake
-Done processing paddle/CMakeLists.txt
-Done processing paddle/fluid/distributed/common/cost_timer.h
-Done processing paddle/fluid/distributed/fleet.cc
-Done processing paddle/fluid/distributed/fleet_executor/CMakeLists.txt
-Done processing paddle/fluid/distributed/fleet_executor/carrier.cc
-Done processing paddle/fluid/distributed/fleet_executor/carrier.h
-Done processing paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
-Done processing paddle/fluid/distributed/fleet_executor/compute_interceptor.h
-Done processing paddle/fluid/distributed/fleet_executor/fleet_executor.cc
-Done processing paddle/fluid/distributed/fleet_executor/fleet_executor.h
-Done processing paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto
-Done processing paddle/fluid/distributed/fleet_executor/interceptor.cc
-Done processing paddle/fluid/distributed/fleet_executor/interceptor.h
-Done processing paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc
-Done processing paddle/fluid/distributed/fleet_executor/message_bus.cc
-Done processing paddle/fluid/distributed/fleet_executor/message_bus.h
-Done processing paddle/fluid/distributed/fleet_executor/runtime_graph.cc
-Done processing paddle/fluid/distributed/fleet_executor/runtime_graph.h
-Done processing paddle/fluid/distributed/fleet_executor/task_node.cc
-Done processing paddle/fluid/distributed/fleet_executor/task_node.h
-Done processing paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
-Done processing paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc
-Done processing paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc
-Done processing paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc
-Done processing paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc
-Done processing paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc
-Done processing paddle/fluid/distributed/service/communicator.cc
-Done processing paddle/fluid/distributed/service/heter_server.h
-Done processing paddle/fluid/eager/accumulation/gradient_accumulation.cc
-Done processing paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc
-Done processing paddle/fluid/eager/api/utils/CMakeLists.txt
-Done processing paddle/fluid/eager/api/utils/global_utils.h
-Done processing paddle/fluid/eager/auto_code_generator/CMakeLists.txt
-Done processing paddle/fluid/eager/auto_code_generator/eager_generator.cc
-Done processing paddle/fluid/eager/backward.cc
-Done processing paddle/fluid/eager/eager_tensor.h
-Done processing paddle/fluid/eager/legacy/amp_auto_cast.cc
-Done processing paddle/fluid/eager/legacy/amp_auto_cast.h
-Done processing paddle/fluid/eager/legacy/infer_shape_context.h
-Done processing paddle/fluid/eager/legacy/op_runner.cc
-Done processing paddle/fluid/eager/tests/performance_tests/CMakeLists.txt
-Done processing paddle/fluid/eager/utils.cc
-Done processing paddle/fluid/eager/utils.h
-Done processing paddle/fluid/framework/custom_operator.cc
-Done processing paddle/fluid/framework/details/nan_inf_utils_detail.cc
-Done processing paddle/fluid/framework/distributed_strategy.proto
-Done processing paddle/fluid/framework/dlpack_tensor.cc
-Done processing paddle/fluid/framework/executor.cc
-Done processing paddle/fluid/framework/fleet/box_wrapper.cu
-Done processing paddle/fluid/framework/fleet/box_wrapper_impl.h
-Done processing paddle/fluid/framework/fleet/heter_ps/heter_comm.h
-Done processing paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
-Done processing paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
-Done processing paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
-Done processing paddle/fluid/framework/garbage_collector.cc
-Done processing paddle/fluid/framework/garbage_collector.h
-Done processing paddle/fluid/framework/heter_pipeline_trainer_test.cc
-Done processing paddle/fluid/framework/heter_section_worker.cc
-Done processing paddle/fluid/framework/ir/CMakeLists.txt
-Done processing paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
-Done processing paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
-Done processing paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
-Done processing paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc
-Done processing paddle/fluid/framework/ir/graph_pattern_detector.cc
-Done processing paddle/fluid/framework/ir/graph_pattern_detector.h
-Done processing paddle/fluid/framework/ir/ipu/avg_shard_pass.cc
-Done processing paddle/fluid/framework/ir/ipu/avg_shard_pass.h
-Done processing paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc
-Done processing paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h
-Done processing paddle/fluid/framework/ir/ipu/infer_shape_pass.cc
-Done processing paddle/fluid/framework/ir/ipu/infer_shape_pass.h
-Done processing paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc
-Done processing paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h
-Done processing paddle/fluid/framework/ir/ipu/inference_process_pass.cc
-Done processing paddle/fluid/framework/ir/ipu/inference_process_pass.h
-Done processing paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc
-Done processing paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h
-Done processing paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc
-Done processing paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h
-Done processing paddle/fluid/framework/ir/ipu/ipu_pass_base.cc
-Done processing paddle/fluid/framework/ir/ipu/ipu_pass_base.h
-Done processing paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc
-Done processing paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h
-Done processing paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
-Done processing paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h
-Done processing paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc
-Done processing paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h
-Done processing paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc
-Done processing paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h
-Done processing paddle/fluid/framework/ir/layer_norm_fuse_pass.cc
-Done processing paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc
-Done processing paddle/fluid/framework/ir/matmul_scale_fuse_pass.h
-Done processing paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
-Done processing paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
-Done processing paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
-Done processing paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
-Done processing paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc
-Done processing paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h
-Done processing paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
-Done processing paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
-Done processing paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
-Done processing paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
-Done processing paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
-Done processing paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h
-Done processing paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
-Done processing paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc
-Done processing paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.h
-Done processing paddle/fluid/framework/ir/pass_tester_helper.h
-Done processing paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
-Done processing paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
-Done processing paddle/fluid/framework/library_type.h
-Done processing paddle/fluid/framework/new_executor/CMakeLists.txt
-Done processing paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc
-Done processing paddle/fluid/framework/new_executor/new_executor_defs.cc
-Done processing paddle/fluid/framework/new_executor/new_executor_defs.h
-Done processing paddle/fluid/framework/new_executor/standalone_executor_test.cc
-Done processing paddle/fluid/framework/op_desc.cc
-Done processing paddle/fluid/framework/op_registry.h
-Done processing paddle/fluid/framework/operator.cc
-Done processing paddle/fluid/framework/operator.h
-Done processing paddle/fluid/framework/paddle2cinn/CMakeLists.txt
-Done processing paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc
-Done processing paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
-Done processing paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc
-Done processing paddle/fluid/framework/paddle2cinn/cinn_cache_key.h
-Done processing paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc
-Done processing paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
-Done processing paddle/fluid/framework/paddle2cinn/cinn_compiler.h
-Done processing paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
-Done processing paddle/fluid/framework/parallel_executor.cc
-Done processing paddle/fluid/framework/ps_gpu_trainer.cc
-Done processing paddle/fluid/framework/ps_gpu_worker.cc
-Done processing paddle/fluid/framework/pten_utils.cc
-Done processing paddle/fluid/framework/shape_inference.h
-Done processing paddle/fluid/framework/tensor.cc
-Done processing paddle/fluid/framework/tensor.h
-Done processing paddle/fluid/framework/tensor_util.cc
-Done processing paddle/fluid/framework/tensor_util.h
-Done processing paddle/fluid/framework/variable.h
-Done processing paddle/fluid/imperative/basic_engine.cc
-Done processing paddle/fluid/imperative/dygraph_grad_maker.h
-Done processing paddle/fluid/imperative/gradient_accumulator.cc
-Done processing paddle/fluid/imperative/heter_ccl_context.cc
-Done processing paddle/fluid/imperative/infer_shape_context.h
-Done processing paddle/fluid/imperative/prepared_operator.cc
-Done processing paddle/fluid/imperative/py_layer_fwd.h
-Done processing paddle/fluid/imperative/reducer.cc
-Done processing paddle/fluid/imperative/tracer.cc
-Done processing paddle/fluid/imperative/variable_wrapper.h
-Done processing paddle/fluid/inference/CMakeLists.txt
-Done processing paddle/fluid/inference/analysis/argument.h
-Done processing paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
-Done processing paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
-Done processing paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
-Done processing paddle/fluid/inference/api/analysis_config.cc
-Done processing paddle/fluid/inference/api/analysis_predictor.cc
-Done processing paddle/fluid/inference/api/demo_ci/run.sh
-Done processing paddle/fluid/inference/api/details/zero_copy_tensor.cc
-Done processing paddle/fluid/inference/api/mkldnn_quantizer.cc
-Done processing paddle/fluid/inference/api/mkldnn_quantizer_config.cc
-Done processing paddle/fluid/inference/api/paddle_analysis_config.h
-Done processing paddle/fluid/inference/api/paddle_pass_builder.cc
-Done processing paddle/fluid/inference/api/paddle_pass_builder.h
-Done processing paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
-Done processing paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
-Done processing paddle/fluid/inference/tensorrt/engine.cc
-Done processing paddle/fluid/inference/tensorrt/engine.h
-Done processing paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
-Done processing paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
-Done processing paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
-Done processing paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu
-Done processing paddle/fluid/inference/tests/api/CMakeLists.txt
-Done processing paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc
-Done processing paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc
-Done processing paddle/fluid/inference/tests/api/analyzer_ernie_tester.h
-Done processing paddle/fluid/inference/tests/api/ipu_resnet50_test.cc
-Done processing paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
-Done processing paddle/fluid/memory/CMakeLists.txt
-Done processing paddle/fluid/memory/allocation/CMakeLists.txt
-Done processing paddle/fluid/memory/allocation/aligned_allocator.cc
-Done processing paddle/fluid/memory/allocation/allocator.h
-Done processing paddle/fluid/memory/allocation/allocator_facade.cc
-Done processing paddle/fluid/memory/allocation/allocator_facade.h
-Done processing paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
-Done processing paddle/fluid/memory/allocation/base_ptr_test.cu
-Done processing paddle/fluid/memory/allocation/cuda_device_context_allocator.h
-Done processing paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
-Done processing paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc
-Done processing paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
-Done processing paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
-Done processing paddle/fluid/memory/detail/CMakeLists.txt
-Done processing paddle/fluid/memory/detail/buddy_allocator.cc
-Done processing paddle/fluid/memory/detail/buddy_allocator_test.cc
-Done processing paddle/fluid/memory/detail/system_allocator.cc
-Done processing paddle/fluid/memory/detail/system_allocator.h
-Done processing paddle/fluid/memory/detail/system_allocator_test.cc
-Done processing paddle/fluid/memory/malloc.cc
-Done processing paddle/fluid/memory/malloc.h
-Done processing paddle/fluid/memory/memcpy.cc
-Done processing paddle/fluid/memory/memcpy.h
-Done processing paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
-Done processing paddle/fluid/operators/CMakeLists.txt
-Done processing paddle/fluid/operators/activation_op.cc
-Done processing paddle/fluid/operators/activation_op.cu
-Done processing paddle/fluid/operators/activation_op.h
-Done processing paddle/fluid/operators/activation_op_mlu.cc
-Done processing paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
-Done processing paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc
-Done processing paddle/fluid/operators/broadcast_tensors_op.cu
-Done processing paddle/fluid/operators/cast_op.cu
-Done processing paddle/fluid/operators/cinn/CMakeLists.txt
-Done processing paddle/fluid/operators/cinn/cinn_launch_context.cc
-Done processing paddle/fluid/operators/cinn/cinn_launch_context.h
-Done processing paddle/fluid/operators/cinn/cinn_launch_context_test.cc
-Done processing paddle/fluid/operators/cinn/cinn_launch_op.cc
-Done processing paddle/fluid/operators/cinn/cinn_launch_op.h
-Done processing paddle/fluid/operators/class_center_sample_op.cu
-Done processing paddle/fluid/operators/clip_by_norm_op.cu
-Done processing paddle/fluid/operators/compat/gelu.pbtxt
-Done processing paddle/fluid/operators/compat/matmul_v2.pbtxt
-Done processing paddle/fluid/operators/complex_op.cc
-Done processing paddle/fluid/operators/complex_op.cu
-Done processing paddle/fluid/operators/complex_op.h
-Done processing paddle/fluid/operators/complex_view_op.cc
-Done processing paddle/fluid/operators/complex_view_op.cu
-Done processing paddle/fluid/operators/complex_view_op.h
-Done processing paddle/fluid/operators/conj_op.h
-Done processing paddle/fluid/operators/controlflow/compare_all_op.cu
-Done processing paddle/fluid/operators/detection/generate_proposals_op.cc
-Done processing paddle/fluid/operators/elementwise/elementwise_add_op.cu
-Done processing paddle/fluid/operators/elementwise/elementwise_add_op.h
-Done processing paddle/fluid/operators/elementwise/elementwise_div_op.h
-Done processing paddle/fluid/operators/elementwise/elementwise_functor.h
-Done processing paddle/fluid/operators/elementwise/elementwise_max_op.cc
-Done processing paddle/fluid/operators/elementwise/elementwise_max_op.cu
-Done processing paddle/fluid/operators/elementwise/elementwise_max_op.h
-Done processing paddle/fluid/operators/elementwise/elementwise_min_op.cc
-Done processing paddle/fluid/operators/elementwise/elementwise_min_op.cu
-Done processing paddle/fluid/operators/elementwise/elementwise_min_op.h
-Done processing paddle/fluid/operators/elementwise/elementwise_mul_op.cu
-Done processing paddle/fluid/operators/elementwise/elementwise_mul_op.h
-Done processing paddle/fluid/operators/elementwise/elementwise_op.h
-Done processing paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
-Done processing paddle/fluid/operators/elementwise/elementwise_op_function.h
-Done processing paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
-Done processing paddle/fluid/operators/elementwise/elementwise_sub_op.cu
-Done processing paddle/fluid/operators/elementwise/elementwise_sub_op.h
-Done processing paddle/fluid/operators/fill_any_like_op.cc
-Done processing paddle/fluid/operators/fill_any_like_op.h
-Done processing paddle/fluid/operators/fill_constant_op.cc
-Done processing paddle/fluid/operators/filter_by_instag_op.h
-Done processing paddle/fluid/operators/flatten_op.cc
-Done processing paddle/fluid/operators/flip_op.cu
-Done processing paddle/fluid/operators/fused/attn_bias_add.cu.h
-Done processing paddle/fluid/operators/fused/attn_gemm.h
-Done processing paddle/fluid/operators/ipu_runtime_op.cc
-Done processing paddle/fluid/operators/ipu_runtime_op.h
-Done processing paddle/fluid/operators/kernel_primitives/functor_primitives.h
-Done processing paddle/fluid/operators/kron_op.h
-Done processing paddle/fluid/operators/label_smooth_op.cu
-Done processing paddle/fluid/operators/layer_norm_kernel.cu.h
-Done processing paddle/fluid/operators/layer_norm_op.cc
-Done processing paddle/fluid/operators/layer_norm_op.cu
-Done processing paddle/fluid/operators/lerp_op.cc
-Done processing paddle/fluid/operators/lerp_op.cu
-Done processing paddle/fluid/operators/lerp_op.h
-Done processing paddle/fluid/operators/margin_cross_entropy_op.cu
-Done processing paddle/fluid/operators/masked_select_op_xpu.cc
-Done processing paddle/fluid/operators/math/concat_and_split.cu
-Done processing paddle/fluid/operators/math/math_function.cc
-Done processing paddle/fluid/operators/math/math_function.cu
-Done processing paddle/fluid/operators/math/segment_pooling.cu
-Done processing paddle/fluid/operators/matmul_v2_op.cc
-Done processing paddle/fluid/operators/matmul_v2_op.h
-Done processing paddle/fluid/operators/mean_op.cu
-Done processing paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
-Done processing paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
-Done processing paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
-Done processing paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h
-Done processing paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
-Total errors found: 8
-Done processing paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc
-Done processing paddle/fluid/operators/mlu/CMakeLists.txt
-Done processing paddle/fluid/operators/mlu/activation_op_mlu_test.cc
-Done processing paddle/fluid/operators/mlu/mlu_baseop.cc
-Done processing paddle/fluid/operators/mlu/mlu_baseop.h
-Done processing paddle/fluid/operators/optimizers/lamb_op.cc
-Done processing paddle/fluid/operators/optimizers/lamb_op.cu
-Done processing paddle/fluid/operators/optimizers/lamb_op.h
-Done processing paddle/fluid/operators/p_norm_op.cu
-Done processing paddle/fluid/operators/pool_op.h
-Done processing paddle/fluid/operators/prelu_op.cu
-Done processing paddle/fluid/operators/pscore/send_op.cc
-Done processing paddle/fluid/operators/py_layer_op.cc
-Done processing paddle/fluid/operators/py_layer_op.h
-Done processing paddle/fluid/operators/range_op_xpu.cc
-Done processing paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu
-Done processing paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu
-Done processing paddle/fluid/operators/reduce_ops/reduce_all_op.cu
-Done processing paddle/fluid/operators/reduce_ops/reduce_any_op.cu
-Done processing paddle/fluid/operators/reduce_ops/reduce_functor_op.h
-Done processing paddle/fluid/operators/reduce_ops/reduce_max_op.cu
-Done processing paddle/fluid/operators/reduce_ops/reduce_mean_op.cu
-Done processing paddle/fluid/operators/reduce_ops/reduce_mean_op.h
-Done processing paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
-Done processing paddle/fluid/operators/reduce_ops/reduce_min_op.cu
-Done processing paddle/fluid/operators/reduce_ops/reduce_op.cu.h
-Done processing paddle/fluid/operators/reduce_ops/reduce_op.h
-Done processing paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
-Done processing paddle/fluid/operators/reduce_ops/reduce_sum_op.cu
-Done processing paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
-Done processing paddle/fluid/operators/repeat_interleave_op.cc
-Done processing paddle/fluid/operators/repeat_interleave_op.cu
-Done processing paddle/fluid/operators/repeat_interleave_op.h
-Done processing paddle/fluid/operators/reshape_op.cc
-Done processing paddle/fluid/operators/roi_align_op.cu
-Done processing paddle/fluid/operators/roi_align_op.h
-Total errors found: 4
-Done processing paddle/fluid/operators/scale_op.h
-Done processing paddle/fluid/operators/scatter_op.cu
-Done processing paddle/fluid/operators/slice_op.cc
-Done processing paddle/fluid/operators/softmax_cudnn_op.cu.h
-Done processing paddle/fluid/operators/softmax_with_cross_entropy_op.cu
-Done processing paddle/fluid/operators/solve_op.h
-Done processing paddle/fluid/operators/sparse_attention_op.cc
-Done processing paddle/fluid/operators/sparse_attention_op.cu
-Done processing paddle/fluid/operators/spectral_op.cu
-Done processing paddle/fluid/operators/tensor_formatter.h
-Done processing paddle/fluid/operators/trace_op.cu
-Done processing paddle/fluid/operators/triangular_solve_op.cu
-Done processing paddle/fluid/operators/unity_build_rule.cmake
-Done processing paddle/fluid/operators/where_index_op_xpu.cc
-Done processing paddle/fluid/platform/CMakeLists.txt
-Done processing paddle/fluid/platform/cuda_graph_with_memory_pool.h
-Done processing paddle/fluid/platform/device/CMakeLists.txt
-Done processing paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc
-Done processing paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
-Done processing paddle/fluid/platform/device/gpu/gpu_info.cc
-Done processing paddle/fluid/platform/device/gpu/gpu_info.h
-Done processing paddle/fluid/platform/device/gpu/gpu_primitives.h
-Done processing paddle/fluid/platform/device/ipu/CMakeLists.txt
-Done processing paddle/fluid/platform/device/ipu/device.cc
-Done processing paddle/fluid/platform/device/ipu/ipu_info.cc
-Done processing paddle/fluid/platform/device/ipu/ipu_info.h
-Done processing paddle/fluid/platform/device/ipu/ipu_optimizer.cc
-Done processing paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc
-Done processing paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc
-Done processing paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h
-Done processing paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc
-Done processing paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc
-Done processing paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
-Done processing paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
-Done processing paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc
-Done processing paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h
-Done processing paddle/fluid/platform/device/ipu/popart_canonicalization/reduce_ops.cc
-Done processing paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc
-Done processing paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
-Done processing paddle/fluid/platform/device/ipu/supported_ops_autogen.h
-Done processing paddle/fluid/platform/device/mlu/CMakeLists.txt
-Done processing paddle/fluid/platform/device/mlu/device_context.cc
-Done processing paddle/fluid/platform/device/mlu/device_context.h
-Done processing paddle/fluid/platform/device/mlu/device_context_allocator.h
-Done processing paddle/fluid/platform/device/mlu/device_context_test.cc
-Done processing paddle/fluid/platform/device/mlu/enforce.h
-Done processing paddle/fluid/platform/device/mlu/enforce_test.cc
-Done processing paddle/fluid/platform/device/mlu/mlu_info.cc
-Done processing paddle/fluid/platform/device/mlu/mlu_info.h
-Done processing paddle/fluid/platform/device/mlu/mlu_stream.cc
-Done processing paddle/fluid/platform/device/mlu/mlu_stream.h
-Done processing paddle/fluid/platform/device/xpu/xpu1_op_list.h
-Done processing paddle/fluid/platform/device/xpu/xpu2_op_list.h
-Done processing paddle/fluid/platform/device_context.cc
-Done processing paddle/fluid/platform/device_context.h
-Done processing paddle/fluid/platform/device_tracer.cc
-Done processing paddle/fluid/platform/device_tracer.h
-Done processing paddle/fluid/platform/event.h
-Done processing paddle/fluid/platform/flags.cc
-Done processing paddle/fluid/platform/init.cc
-Done processing paddle/fluid/platform/init_test.cc
-Done processing paddle/fluid/platform/lodtensor_printer.cc
-Done processing paddle/fluid/platform/monitor.cc
-Done processing paddle/fluid/platform/monitor.h
-Done processing paddle/fluid/platform/os_info.cc
-Done processing paddle/fluid/platform/os_info.h
-Done processing paddle/fluid/platform/place.cc
-Done processing paddle/fluid/platform/place.h
-Done processing paddle/fluid/platform/place_test.cc
-Done processing paddle/fluid/platform/profiler.cc
-Done processing paddle/fluid/platform/profiler.h
-Done processing paddle/fluid/platform/profiler_helper.h
-Done processing paddle/fluid/platform/stream_callback_manager.cc
-Done processing paddle/fluid/pybind/.gitignore
-Done processing paddle/fluid/pybind/CMakeLists.txt
-Done processing paddle/fluid/pybind/bind_fleet_executor.cc
-Done processing paddle/fluid/pybind/eager.cc
-Done processing paddle/fluid/pybind/eager_functions.cc
-Done processing paddle/fluid/pybind/eager_method.cc
-Done processing paddle/fluid/pybind/eager_op_function_generator.cc
-Done processing paddle/fluid/pybind/eager_properties.cc
-Done processing paddle/fluid/pybind/eager_utils.cc
-Done processing paddle/fluid/pybind/eager_utils.h
-Done processing paddle/fluid/pybind/exception.h
-Done processing paddle/fluid/pybind/imperative.cc
-Done processing paddle/fluid/pybind/op_function.h
-Done processing paddle/fluid/pybind/op_function_common.cc
-Done processing paddle/fluid/pybind/op_function_common.h
-Done processing paddle/fluid/pybind/op_function_generator.cc
-Done processing paddle/fluid/pybind/op_function_generator.h
-Done processing paddle/fluid/pybind/pybind.cc
-Done processing paddle/fluid/pybind/reader_py.cc
-Done processing paddle/fluid/pybind/tensor_py.h
-Done processing paddle/infrt/CMakeLists.txt
-Done processing paddle/infrt/api/CMakeLists.txt
-Done processing paddle/infrt/api/infrt_api.cc
-Done processing paddle/infrt/api/infrt_api.h
-Done processing paddle/infrt/api/infrt_api_test.cc
-Done processing paddle/infrt/common/CMakeLists.txt
-Done processing paddle/infrt/common/buffer.cc
-Done processing paddle/infrt/common/buffer.h
-Done processing paddle/infrt/common/common.h
-Done processing paddle/infrt/common/dtype.cc
-Done processing paddle/infrt/common/dtype.def
-Done processing paddle/infrt/common/dtype.h
-Done processing paddle/infrt/common/global.cc
-Done processing paddle/infrt/common/global.h
-Done processing paddle/infrt/common/macros.h
-Done processing paddle/infrt/common/memory.cc
-Done processing paddle/infrt/common/memory.h
-Done processing paddle/infrt/common/object.cc
-Done processing paddle/infrt/common/object.h
-Done processing paddle/infrt/common/shared.cc
-Done processing paddle/infrt/common/shared.h
-Done processing paddle/infrt/common/string.cc
-Done processing paddle/infrt/common/string.h
-Done processing paddle/infrt/common/target.cc
-Done processing paddle/infrt/common/target.h
-Done processing paddle/infrt/common/type.cc
-Done processing paddle/infrt/common/type.h
-Done processing paddle/infrt/dialect/CMakeLists.txt
-Done processing paddle/infrt/dialect/basic_kernels.cc
-Done processing paddle/infrt/dialect/basic_kernels.h
-Done processing paddle/infrt/dialect/basic_kernels.td
-Done processing paddle/infrt/dialect/dense_tensor.cc
-Done processing paddle/infrt/dialect/dense_tensor.h
-Done processing paddle/infrt/dialect/dense_tensor.td
-Done processing paddle/infrt/dialect/diagnostic_utils.cc
-Done processing paddle/infrt/dialect/diagnostic_utils.h
-Done processing paddle/infrt/dialect/dialect.cc
-Done processing paddle/infrt/dialect/infrt_base.cc
-Done processing paddle/infrt/dialect/infrt_base.h
-Done processing paddle/infrt/dialect/infrt_base.td
-Done processing paddle/infrt/dialect/init_infrt_dialects.cc
-Done processing paddle/infrt/dialect/init_infrt_dialects.h
-Done processing paddle/infrt/dialect/mlir_loader.cc
-Done processing paddle/infrt/dialect/mlir_loader.h
-Done processing paddle/infrt/dialect/mlir_loader_test.cc
-Done processing paddle/infrt/dialect/mlir_tests/basic.mlir
-Done processing paddle/infrt/dialect/mlir_tests/benchmark.mlir
-Done processing paddle/infrt/dialect/mlir_tests/dense_tensor.mlir
-Done processing paddle/infrt/dialect/mlir_tests/paddle_ops.mlir
-Done processing paddle/infrt/dialect/mlir_tests/rewrite.mlir
-Done processing paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir
-Done processing paddle/infrt/dialect/mlir_tests/tensor_map.mlir
-Done processing paddle/infrt/dialect/mlir_tests/tensor_shape.mlir
-Done processing paddle/infrt/dialect/mlir_tests/tensor_type.mlir
-Done processing paddle/infrt/dialect/ops.td
-Done processing paddle/infrt/dialect/opt.cc
-Done processing paddle/infrt/dialect/pd_op_base.td
-Done processing paddle/infrt/dialect/pd_ops.cc
-Done processing paddle/infrt/dialect/pd_ops.h
-Done processing paddle/infrt/dialect/pd_ops.td
-Done processing paddle/infrt/dialect/pd_types.cc
-Done processing paddle/infrt/dialect/pd_types.h
-Done processing paddle/infrt/dialect/print_ir.cc
-Done processing paddle/infrt/dialect/rewrite.td
-Done processing paddle/infrt/dialect/tensor_shape.cc
-Done processing paddle/infrt/dialect/tensor_shape.h
-Done processing paddle/infrt/dialect/tensor_shape.td
-Done processing paddle/infrt/dialect/tensor_shape_base.td
-Done processing paddle/infrt/dialect/test_kernels.cc
-Done processing paddle/infrt/dialect/test_kernels.h
-Done processing paddle/infrt/dialect/test_kernels.td
-Done processing paddle/infrt/dialect/types.cc
-Done processing paddle/infrt/dialect/types.h
-Done processing paddle/infrt/external_kernels/CMakeLists.txt
-Done processing paddle/infrt/external_kernels/basic.mlir
-Done processing paddle/infrt/external_kernels/basic_kernels.cc
-Done processing paddle/infrt/external_kernels/fc.mlir
-Done processing paddle/infrt/external_kernels/paddle.mlir
-Done processing paddle/infrt/gtest_main.cc
-Done processing paddle/infrt/host_context/CMakeLists.txt
-Done processing paddle/infrt/host_context/core_runtime.cc
-Done processing paddle/infrt/host_context/core_runtime.h
-Done processing paddle/infrt/host_context/core_runtime_test.cc
-Done processing paddle/infrt/host_context/function.cc
-Done processing paddle/infrt/host_context/function.h
-Done processing paddle/infrt/host_context/kernel_frame.cc
-Done processing paddle/infrt/host_context/kernel_frame.h
-Done processing paddle/infrt/host_context/kernel_registry.cc
-Done processing paddle/infrt/host_context/kernel_registry.h
-Done processing paddle/infrt/host_context/kernel_registry_test.cc
-Done processing paddle/infrt/host_context/kernel_utils.cc
-Done processing paddle/infrt/host_context/kernel_utils.h
-Done processing paddle/infrt/host_context/kernel_utils_test.cc
-Done processing paddle/infrt/host_context/mlir_exec.cc
-Done processing paddle/infrt/host_context/mlir_function_executable.cc
-Done processing paddle/infrt/host_context/mlir_function_executable.h
-Done processing paddle/infrt/host_context/mlir_program_executor.cc
-Done processing paddle/infrt/host_context/mlir_program_executor.h
-Done processing paddle/infrt/host_context/mlir_tests/basic.mlir
-Done processing paddle/infrt/host_context/mlir_tests/dense_tensor.mlir
-Done processing paddle/infrt/host_context/mlir_tests/shape.mlir
-Done processing paddle/infrt/host_context/mlir_to_runtime_translate.cc
-Done processing paddle/infrt/host_context/mlir_to_runtime_translate.h
-Done processing paddle/infrt/host_context/mlir_to_runtime_translate_test.cc
-Done processing paddle/infrt/host_context/op_executable.cc
-Done processing paddle/infrt/host_context/op_executable.h
-Done processing paddle/infrt/host_context/op_executable_test.cc
-Done processing paddle/infrt/host_context/symbol_table.cc
-Done processing paddle/infrt/host_context/symbol_table.h
-Done processing paddle/infrt/host_context/value.cc
-Done processing paddle/infrt/host_context/value.h
-Done processing paddle/infrt/host_context/value_test.cc
-Done processing paddle/infrt/kernel/CMakeLists.txt
-Done processing paddle/infrt/kernel/basic_kernels.cc
-Done processing paddle/infrt/kernel/basic_kernels.h
-Done processing paddle/infrt/kernel/control_flow_kernels.cc
-Done processing paddle/infrt/kernel/control_flow_kernels.h
-Done processing paddle/infrt/kernel/tensor_kernels.cc
-Done processing paddle/infrt/kernel/tensor_kernels.h
-Done processing paddle/infrt/kernel/tensor_shape_kernels.cc
-Done processing paddle/infrt/kernel/tensor_shape_kernels.h
-Done processing paddle/infrt/kernel/test_kernels.cc
-Done processing paddle/infrt/kernel/test_kernels.h
-Done processing paddle/infrt/paddle/CMakeLists.txt
-Done processing paddle/infrt/paddle/cpp/CMakeLists.txt
-Done processing paddle/infrt/paddle/cpp/desc_api.h
-Done processing paddle/infrt/paddle/framework.proto
-Done processing paddle/infrt/paddle/model_parser.cc
-Done processing paddle/infrt/paddle/model_parser.h
-Done processing paddle/infrt/paddle/pb/CMakeLists.txt
-Done processing paddle/infrt/paddle/pb/block_desc.cc
-Done processing paddle/infrt/paddle/pb/block_desc.h
-Done processing paddle/infrt/paddle/pb/op_desc.cc
-Done processing paddle/infrt/paddle/pb/op_desc.h
-Done processing paddle/infrt/paddle/pb/program_desc.cc
-Done processing paddle/infrt/paddle/pb/program_desc.h
-Done processing paddle/infrt/paddle/pb/var_desc.cc
-Done processing paddle/infrt/paddle/pb/var_desc.h
-Done processing paddle/infrt/paddle/scope.cc
-Done processing paddle/infrt/paddle/scope.h
-Done processing paddle/infrt/paddle/tensor.cc
-Done processing paddle/infrt/paddle/tensor.h
-Done processing paddle/infrt/support/CMakeLists.txt
-Done processing paddle/infrt/support/type_traits.h
-Done processing paddle/infrt/support/variant.h
-Done processing paddle/infrt/tensor/CMakeLists.txt
-Done processing paddle/infrt/tensor/dense_host_tensor.cc
-Done processing paddle/infrt/tensor/dense_host_tensor.h
-Done processing paddle/infrt/tensor/dense_tensor_view.cc
-Done processing paddle/infrt/tensor/dense_tensor_view.h
-Done processing paddle/infrt/tensor/tensor_map.cc
-Done processing paddle/infrt/tensor/tensor_map.h
-Done processing paddle/infrt/tensor/tensor_metadata.cc
-Done processing paddle/infrt/tensor/tensor_metadata.h
-Done processing paddle/infrt/tensor/tensor_shape.cc
-Done processing paddle/infrt/tensor/tensor_shape.h
-Done processing paddle/pten/CMakeLists.txt
-Done processing paddle/pten/api/all.h
-Done processing paddle/pten/api/ext/dll_decl.h
-Done processing paddle/pten/api/ext/op_meta_info.h
-Done processing paddle/pten/api/ext/tensor_compat.h
-Done processing paddle/pten/api/include/kernel_signature.h
-Done processing paddle/pten/api/include/tensor.h
-Done processing paddle/pten/api/include/utils.h
-Done processing paddle/pten/api/lib/CMakeLists.txt
-Done processing paddle/pten/api/lib/api_registry.h
-Done processing paddle/pten/api/lib/kernel_declare.h
-Done processing paddle/pten/api/lib/kernel_dispatch.cc
-Done processing paddle/pten/api/lib/kernel_dispatch.h
-Done processing paddle/pten/api/lib/tensor.cc
-Done processing paddle/pten/api/lib/utils.cc
-Done processing paddle/pten/api/lib/utils/tensor_utils.cc
-Done processing paddle/pten/api/lib/utils/tensor_utils.h
-Done processing paddle/pten/backends/CMakeLists.txt
-Done processing paddle/pten/backends/all_context.cc
-Done processing paddle/pten/backends/all_context.h
-Done processing paddle/pten/backends/cpu/cpu_context.h
-Done processing paddle/pten/backends/npu/npu_context.h
-Done processing paddle/pten/backends/xpu/xpu_context.h
-Done processing paddle/pten/common/backend.h
-Done processing paddle/pten/common/data_type.h
-Done processing paddle/pten/common/layout.h
-Done processing paddle/pten/common/scalar_array.h
-Done processing paddle/pten/core/CMakeLists.txt
-Done processing paddle/pten/core/convert_utils.cc
-Done processing paddle/pten/core/convert_utils.h
-Done processing paddle/pten/core/kernel_alias_name.h
-Done processing paddle/pten/core/kernel_context.cc
-Done processing paddle/pten/core/kernel_context.h
-Done processing paddle/pten/core/kernel_factory.cc
-Done processing paddle/pten/core/kernel_factory.h
-Done processing paddle/pten/core/kernel_registry.h
-Done processing paddle/pten/core/kernel_utils.h
-Done processing paddle/pten/include/creation.h
-Done processing paddle/pten/include/linalg.h
-Done processing paddle/pten/include/manipulation.h
-Done processing paddle/pten/include/math.h
-Done processing paddle/pten/infermeta/binary.cc
-Done processing paddle/pten/infermeta/unary.cc
-Done processing paddle/pten/infermeta/unary.h
-Done processing paddle/pten/kernels/CMakeLists.txt
-Done processing paddle/pten/kernels/cpu/CMakeLists.txt
-Done processing paddle/pten/kernels/cpu/conj_kernel.h
-Done processing paddle/pten/kernels/cpu/full_kernel.cc
-Done processing paddle/pten/kernels/cpu/linalg.cc
-Done processing paddle/pten/kernels/cpu/linalg.h
-Done processing paddle/pten/kernels/cpu/manipulation.cc
-Done processing paddle/pten/kernels/cpu/manipulation.h
-Done processing paddle/pten/kernels/cpu/math.cc
-Done processing paddle/pten/kernels/cpu/math.h
-Done processing paddle/pten/kernels/cpu/scale_kernel.cc
-Done processing paddle/pten/kernels/cpu/utils.cc
-Done processing paddle/pten/kernels/cpu/utils.h
-Done processing paddle/pten/kernels/cuda/CMakeLists.txt
-Done processing paddle/pten/kernels/cuda/conj_kernel.cu
-Done processing paddle/pten/kernels/cuda/conj_kernel.h
-Done processing paddle/pten/kernels/cuda/full_kernel.cu
-Done processing paddle/pten/kernels/cuda/linalg.cu
-Done processing paddle/pten/kernels/cuda/linalg.h
-Done processing paddle/pten/kernels/cuda/manipulation.cu
-Done processing paddle/pten/kernels/cuda/manipulation.h
-Done processing paddle/pten/kernels/cuda/math.cu
-Done processing paddle/pten/kernels/cuda/math.h
-Done processing paddle/pten/kernels/cuda/scale_kernel.cu
-Done processing paddle/pten/kernels/cuda/utils.cu
-Done processing paddle/pten/kernels/cuda/utils.h
-Done processing paddle/pten/kernels/hybird/CMakeLists.txt
-Done processing paddle/pten/kernels/hybird/math/conj_impl.h
-Done processing paddle/pten/kernels/impl/full_kernel_impl.h
-Done processing paddle/pten/kernels/primitive/CMakeLists.txt
-Done processing paddle/pten/kernels/scale_kernel.h
-Done processing paddle/pten/kernels/xpu/manipulation.cc
-Done processing paddle/pten/kernels/xpu/manipulation.h
-Done processing paddle/pten/kernels/xpu/utils.cc
-Done processing paddle/pten/ops/CMakeLists.txt
-Done processing paddle/pten/tests/api/CMakeLists.txt
-Done processing paddle/pten/tests/api/scale_api.h
-Done processing paddle/pten/tests/api/test_conj_api.cc
-Done processing paddle/pten/tests/api/test_pten_tensor.cc
-Done processing paddle/pten/tests/api/test_reshape_api.cc
-Done processing paddle/pten/tests/api/test_scale_api.cc
-Done processing paddle/pten/tests/api/test_scale_benchmark.cc
-Done processing paddle/pten/tests/api/test_sum_api.cc
-Done processing paddle/pten/tests/common/test_data_layout.cc
-Done processing paddle/pten/tests/core/test_kernel_factory.cc
-Done processing paddle/pten/tests/kernels/CMakeLists.txt
-Done processing paddle/pten/tests/kernels/test_conj_dev_api.cc
-Done processing paddle/pten/tests/kernels/test_fill_dev_api.cc
-Done processing paddle/scripts/docker/root/.bashrc
-Done processing paddle/scripts/infrt_build.sh
-Done processing paddle/scripts/paddle_build.bat
-Done processing paddle/scripts/paddle_build.sh
-Done processing python/paddle/_C_ops.py
-Done processing python/paddle/__init__.py
-Done processing python/paddle/device/__init__.py
-Done processing python/paddle/distributed/auto_parallel/cluster.py
-Done processing python/paddle/distributed/auto_parallel/cost_model.py
-Done processing python/paddle/distributed/auto_parallel/mapper.py
-Done processing python/paddle/distributed/auto_parallel/operators/common.py
-Done processing python/paddle/distributed/auto_parallel/operators/dist_embedding.py
-Done processing python/paddle/distributed/auto_parallel/operators/dist_matmul.py
-Done processing python/paddle/distributed/auto_parallel/operators/dist_reshape.py
-Done processing python/paddle/distributed/auto_parallel/operators/dist_softmax.py
-Done processing python/paddle/distributed/auto_parallel/operators/dist_transpose.py
-Done processing python/paddle/distributed/auto_parallel/parallelizer.py
-Done processing python/paddle/distributed/auto_parallel/planner.py
-Done processing python/paddle/distributed/auto_parallel/process_group.py
-Done processing python/paddle/distributed/auto_parallel/utils.py
-Done processing python/paddle/distributed/collective.py
-Done processing python/paddle/distributed/fleet/base/distributed_strategy.py
-Done processing python/paddle/distributed/fleet/base/fleet_base.py
-Done processing python/paddle/distributed/fleet/base/private_helper_function.py
-Done processing python/paddle/distributed/fleet/fleet_executor_utils.py
-Done processing python/paddle/distributed/fleet/launch.py
-Done processing python/paddle/distributed/fleet/launch_utils.py
-Done processing python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
-Done processing python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py
-Done processing python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
-Done processing python/paddle/distributed/fleet/runtime/the_one_ps.py
-Done processing python/paddle/distributed/fleet/utils/internal_storage.py
-Done processing python/paddle/distributed/passes/pass_base.py
-Done processing python/paddle/distribution.py
-Done processing python/paddle/fluid/__init__.py
-Done processing python/paddle/fluid/clip.py
-Done processing python/paddle/fluid/contrib/mixed_precision/decorator.py
-Done processing python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
-Done processing python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
-Done processing python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
-Done processing python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
-Done processing python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
-Done processing python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
-Done processing python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py
-Done processing python/paddle/fluid/contrib/tests/test_amp_list.py
-Done processing python/paddle/fluid/core.py
-Done processing python/paddle/fluid/dataloader/dataloader_iter.py
-Done processing python/paddle/fluid/dygraph/amp/auto_cast.py
-Done processing python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py
-Done processing python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
-Done processing python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
-Done processing python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py
-Done processing python/paddle/fluid/dygraph/dygraph_to_static/utils.py
-Done processing python/paddle/fluid/dygraph/io.py
-Done processing python/paddle/fluid/dygraph/layers.py
-Done processing python/paddle/fluid/dygraph/varbase_patch_methods.py
-Done processing python/paddle/fluid/eager/eager_tensor_patch_methods.py
-Done processing python/paddle/fluid/executor.py
-Done processing python/paddle/fluid/framework.py
-Done processing python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
-Done processing python/paddle/fluid/layers/control_flow.py
-Done processing python/paddle/fluid/layers/ops.py
-Done processing python/paddle/fluid/layers/tensor.py
-Done processing python/paddle/fluid/optimizer.py
-Done processing python/paddle/fluid/reader.py
-Done processing python/paddle/fluid/tests/custom_op/attr_test_op.cc
-Done processing python/paddle/fluid/tests/unittests/CMakeLists.txt
-Done processing python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
-Done processing python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py
-Done processing python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_planner.py
-Done processing python/paddle/fluid/tests/unittests/auto_parallel/launch.py
-Done processing python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py
-Done processing python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py
-Done processing python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py
-Done processing python/paddle/fluid/tests/unittests/distributed_passes/check_pass_conflict_example.py
-Done processing python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py
-Done processing python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py
-Done processing python/paddle/fluid/tests/unittests/distributed_passes/pass_run_main.py
-Done processing python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_all_reduce_pass.py
-Done processing python/paddle/fluid/tests/unittests/distributed_passes/test_white_lists.py
-Done processing python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py
-Done processing python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
-Done processing python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py
-Done processing python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py
-Done processing python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py
-Done processing python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py
-Done processing python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py
-Done processing python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
-Done processing python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
-Done processing python/paddle/fluid/tests/unittests/ipu/ernie_training.py
-Done processing python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_save_load.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py
-Done processing python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
-Done processing python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/program_config.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_bn_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_layer_norm_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_matmul_scale_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_depthwise_conv_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_simplify_with_basic_ops_pass_autoscan.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py
-Done processing python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py
-Done processing python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
-Done processing python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py
-Done processing python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py
-Done processing python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_base_npu.py
-Done processing python/paddle/fluid/tests/unittests/test_activation_op.py
-Done processing python/paddle/fluid/tests/unittests/test_assign_op.py
-Done processing python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py
-Done processing python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py
-Done processing python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py
-Done processing python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py
-Done processing python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py
-Done processing python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py
-Done processing python/paddle/fluid/tests/unittests/test_base_layer.py
-Done processing python/paddle/fluid/tests/unittests/test_collective_base.py
-Done processing python/paddle/fluid/tests/unittests/test_complex_op.py
-Done processing python/paddle/fluid/tests/unittests/test_complex_view_op.py
-Done processing python/paddle/fluid/tests/unittests/test_cuda_graph.py
-Done processing python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
-Done processing python/paddle/fluid/tests/unittests/test_distribution.py
-Done processing python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py
-Done processing python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py
-Done processing python/paddle/fluid/tests/unittests/test_egr_python_api.py
-Done processing python/paddle/fluid/tests/unittests/test_elementwise_min_op.py
-Done processing python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
-Done processing python/paddle/fluid/tests/unittests/test_fleet_executor.py
-Done processing python/paddle/fluid/tests/unittests/test_fleet_executor_multi_devices.py
-Done processing python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py
-Done processing python/paddle/fluid/tests/unittests/test_fmax_op.py
-Done processing python/paddle/fluid/tests/unittests/test_fmin_op.py
-Done processing python/paddle/fluid/tests/unittests/test_gcd.py
-Done processing python/paddle/fluid/tests/unittests/test_gradient_clip.py
-Done processing python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py
-Done processing python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
-Done processing python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
-Done processing python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
-Done processing python/paddle/fluid/tests/unittests/test_is_complex.py
-Done processing python/paddle/fluid/tests/unittests/test_is_integer.py
-Done processing python/paddle/fluid/tests/unittests/test_lambv2_op.py
-Done processing python/paddle/fluid/tests/unittests/test_layer_norm_op.py
-Done processing python/paddle/fluid/tests/unittests/test_lcm.py
-Done processing python/paddle/fluid/tests/unittests/test_lerp_op.py
-Done processing python/paddle/fluid/tests/unittests/test_logit_op.py
-Done processing python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
-Done processing python/paddle/fluid/tests/unittests/test_max_op.py
-Done processing python/paddle/fluid/tests/unittests/test_mean_op.py
-Done processing python/paddle/fluid/tests/unittests/test_momentum_op.py
-Done processing python/paddle/fluid/tests/unittests/test_pylayer_op.py
-Done processing python/paddle/fluid/tests/unittests/test_repeat_interleave_op.py
-Done processing python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py
-Done processing python/paddle/fluid/tests/unittests/test_rot90_op.py
-Done processing python/paddle/fluid/tests/unittests/test_scatter_op.py
-Done processing python/paddle/fluid/tests/unittests/test_set_value_op.py
-Done processing python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
-Done processing python/paddle/fluid/tests/unittests/test_translated_layer.py
-Done processing python/paddle/fluid/tests/unittests/test_transpose_op.py
-Done processing python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py
-Done processing python/paddle/fluid/transpiler/details/checkport.py
-Done processing python/paddle/framework/__init__.py
-Done processing python/paddle/nn/__init__.py
-Done processing python/paddle/nn/functional/__init__.py
-Done processing python/paddle/nn/functional/loss.py
-Done processing python/paddle/nn/functional/sparse_attention.py
-Done processing python/paddle/nn/layer/__init__.py
-Done processing python/paddle/nn/layer/loss.py
-Done processing python/paddle/nn/layer/pooling.py
-Done processing python/paddle/nn/layer/transformer.py
-Done processing python/paddle/optimizer/lamb.py
-Done processing python/paddle/optimizer/momentum.py
-Done processing python/paddle/optimizer/optimizer.py
-Done processing python/paddle/tensor/__init__.py
-Done processing python/paddle/tensor/attribute.py
-Done processing python/paddle/tensor/creation.py
-Done processing python/paddle/tensor/manipulation.py
-Done processing python/paddle/tensor/math.py
-Done processing python/paddle/tensor/stat.py
-Done processing python/paddle/utils/code_gen/api.yaml
-Done processing python/paddle/utils/code_gen/api_gen.py
-Done processing python/paddle/utils/cpp_extension/cpp_extension.py
-Done processing python/paddle/utils/cpp_extension/extension_utils.py
-Done processing python/requirements.txt
-Done processing python/setup.py.in
-Done processing python/unittest_py/requirements.txt
-Done processing tools/check_file_diff_approvals.sh
-Done processing tools/coverage/paddle_coverage.sh
-Done processing tools/dockerfile/Dockerfile.ipu
-Done processing tools/dockerfile/ci_dockerfile.sh
-Done processing tools/parallel_UT_rule.py
-Done processing tools/static_mode_white_list.py
-Done processing tools/windows/run_unittests.sh
-Ignoring CMakeLists.txt; not a valid file name (cuh, h++, hpp, c++, cc, c, hh, cpp, cu, hxx, cxx, h)
-Ignoring cmake/configure.cmake; not a valid file name (cuh, cpp, c++, h, c, hxx, cu, cxx, hpp, hh, h++, cc)
-Ignoring cmake/external/cinn.cmake; not a valid file name (h, cxx, cu, hxx, hpp, c, h++, hh, cuh, c++, cc, cpp)
-Ignoring cmake/external/concurrentqueue.cmake; not a valid file name (h++, c++, hxx, hh, hpp, cxx, cuh, cu, cpp, h, cc, c)
-Ignoring cmake/external/cryptopp.cmake; not a valid file name (h, h++, cuh, cu, c++, hpp, hh, hxx, cxx, cc, cpp, c)
-Ignoring cmake/external/gtest.cmake; not a valid file name (h++, hpp, cpp, c++, hh, cuh, cc, cu, cxx, h, hxx, c)
-Ignoring cmake/external/llvm.cmake; not a valid file name (cc, hpp, h++, hxx, cpp, cu, h, hh, cuh, c, cxx, c++)
-Ignoring cmake/external/mkldnn.cmake; not a valid file name (hpp, cpp, hxx, c++, hh, cuh, h, cc, c, cu, cxx, h++)
-Ignoring cmake/inference_lib.cmake; not a valid file name (c, hh, hxx, hpp, cc, h++, c++, cuh, cu, h, cxx, cpp)
-Ignoring cmake/infrt_lib.cmake; not a valid file name (cxx, h, cc, hxx, c++, cu, cuh, hh, cpp, hpp, c, h++)
-Ignoring cmake/neuware.cmake; not a valid file name (cc, c++, c, hh, hpp, cpp, cu, cxx, cuh, h++, hxx, h)
-Ignoring cmake/operators.cmake; not a valid file name (hpp, c, cuh, cxx, h++, c++, cu, cc, cpp, h, hh, hxx)
-Ignoring cmake/pten.cmake; not a valid file name (c++, cxx, cpp, hxx, h, c, h++, cc, cuh, hh, hpp, cu)
-Ignoring cmake/third_party.cmake; not a valid file name (hpp, c, cc, h++, cpp, cuh, c++, cxx, hh, cu, h, hxx)
-Ignoring paddle/CMakeLists.txt; not a valid file name (cu, cxx, h++, h, c++, hh, cuh, hxx, hpp, cpp, c, cc)
-Ignoring paddle/fluid/distributed/fleet_executor/CMakeLists.txt; not a valid file name (cc, h++, hpp, cxx, cu, c++, hxx, c, h, cuh, cpp, hh)
-Ignoring paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto; not a valid file name (c++, hh, hxx, cu, cc, cuh, h++, c, hpp, cxx, cpp, h)
-Ignoring paddle/fluid/eager/api/utils/CMakeLists.txt; not a valid file name (cxx, hpp, h, hh, cpp, hxx, h++, cc, cu, cuh, c++, c)
-Ignoring paddle/fluid/eager/auto_code_generator/CMakeLists.txt; not a valid file name (cuh, cpp, h++, c, cc, c++, cxx, cu, h, hxx, hpp, hh)
-Ignoring paddle/fluid/eager/tests/performance_tests/CMakeLists.txt; not a valid file name (hh, cuh, h++, hxx, cu, c++, cc, cxx, c, hpp, cpp, h)
-Ignoring paddle/fluid/framework/distributed_strategy.proto; not a valid file name (cpp, h, cuh, cu, h++, hpp, hh, c, c++, hxx, cc, cxx)
-Ignoring paddle/fluid/framework/ir/CMakeLists.txt; not a valid file name (cc, hh, cuh, h++, hxx, cxx, c, hpp, c++, cpp, cu, h)
-Ignoring paddle/fluid/framework/new_executor/CMakeLists.txt; not a valid file name (hpp, h++, hh, hxx, cpp, cuh, cc, h, c, cu, c++, cxx)
-Ignoring paddle/fluid/framework/paddle2cinn/CMakeLists.txt; not a valid file name (cu, hpp, hh, cxx, cc, h++, cpp, c++, cuh, c, h, hxx)
-Ignoring paddle/fluid/inference/CMakeLists.txt; not a valid file name (c, cpp, hh, c++, cc, cxx, h++, cuh, hpp, h, cu, hxx)
-Ignoring paddle/fluid/inference/api/demo_ci/run.sh; not a valid file name (hxx, cpp, h++, cu, hh, c, h, cuh, c++, cxx, cc, hpp)
-Ignoring paddle/fluid/inference/tests/api/CMakeLists.txt; not a valid file name (cc, h++, cpp, c++, c, h, cu, cxx, hh, hxx, hpp, cuh)
-Ignoring paddle/fluid/memory/CMakeLists.txt; not a valid file name (cuh, c++, h++, hh, hxx, h, hpp, c, cu, cc, cxx, cpp)
-Ignoring paddle/fluid/memory/allocation/CMakeLists.txt; not a valid file name (h, hpp, cpp, cuh, hh, c++, cu, hxx, cc, c, h++, cxx)
-Ignoring paddle/fluid/memory/detail/CMakeLists.txt; not a valid file name (hxx, hpp, c++, h++, cu, cc, cuh, cxx, hh, cpp, c, h)
-Ignoring paddle/fluid/operators/CMakeLists.txt; not a valid file name (c, hpp, h++, cc, hxx, cxx, cuh, cpp, hh, c++, cu, h)
-Ignoring paddle/fluid/operators/cinn/CMakeLists.txt; not a valid file name (hh, cuh, c, cc, cpp, cu, c++, hxx, hpp, cxx, h, h++)
-Skipping input 'paddle/fluid/operators/cinn_launch_op.cu.cc': Can't open for reading
-Skipping input 'paddle/fluid/operators/cinn_launch_op_test.cc': Can't open for reading
-Ignoring paddle/fluid/operators/compat/gelu.pbtxt; not a valid file name (hh, h++, hxx, cc, hpp, cuh, c, cpp, h, cxx, c++, cu)
-Ignoring paddle/fluid/operators/compat/matmul_v2.pbtxt; not a valid file name (c++, c, cc, hpp, cxx, hh, cu, hxx, h++, h, cuh, cpp)
-paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc:237:  Is this a non-const reference? If so, make const or use a pointer: std::vector<int64_t>& x_dims  [runtime/references] [2]
-paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc:238:  Is this a non-const reference? If so, make const or use a pointer: std::vector<int64_t>& y_dims  [runtime/references] [2]
-paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc:239:  Is this a non-const reference? If so, make const or use a pointer: std::vector<int64_t>& out_dims  [runtime/references] [2]
-paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc:288:  Is this a non-const reference? If so, make const or use a pointer: std::vector<int64_t>& x_bd_dims  [runtime/references] [2]
-paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc:289:  Is this a non-const reference? If so, make const or use a pointer: std::vector<int64_t>& y_bd_dims  [runtime/references] [2]
-paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc:290:  Is this a non-const reference? If so, make const or use a pointer: std::vector<int64_t>& out_dims  [runtime/references] [2]
-paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc:367:  Is this a non-const reference? If so, make const or use a pointer: std::vector<int64_t>& dx_bd_dims  [runtime/references] [2]
-paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc:368:  Is this a non-const reference? If so, make const or use a pointer: std::vector<int64_t>& dy_bd_dims  [runtime/references] [2]
-Ignoring paddle/fluid/operators/mlu/CMakeLists.txt; not a valid file name (cpp, cuh, c++, h++, cu, hxx, hpp, cxx, hh, cc, c, h)
-paddle/fluid/operators/roi_align_op.h:26:  Do not use unnamed namespaces in header files.  See https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces for more information.  [build/namespaces_headers] [4]
-paddle/fluid/operators/roi_align_op.h:44:  You don't need a ; after a }  [readability/braces] [4]
-paddle/fluid/operators/roi_align_op.h:134:  Is this a non-const reference? If so, make const or use a pointer: std::vector<T>& interpolated_values  [runtime/references] [2]
-paddle/fluid/operators/roi_align_op.h:170:  Anonymous namespace should be terminated with "// namespace"  [readability/namespace] [5]
-Ignoring paddle/fluid/operators/unity_build_rule.cmake; not a valid file name (cu, cxx, hpp, hh, cuh, cc, c++, c, h, hxx, h++, cpp)
-Ignoring paddle/fluid/platform/CMakeLists.txt; not a valid file name (hxx, cc, c, h++, cu, c++, hpp, cpp, h, cxx, hh, cuh)
-Ignoring paddle/fluid/platform/device/CMakeLists.txt; not a valid file name (c, c++, cpp, cuh, hh, cxx, h, hpp, h++, cu, cc, hxx)
-Ignoring paddle/fluid/platform/device/ipu/CMakeLists.txt; not a valid file name (cxx, hxx, h++, cpp, c++, cuh, cc, cu, hh, hpp, c, h)
-Ignoring paddle/fluid/platform/device/mlu/CMakeLists.txt; not a valid file name (hh, c++, hpp, cu, hxx, h++, cpp, cxx, cc, c, h, cuh)
-Ignoring paddle/fluid/pybind/.gitignore; not a valid file name (cpp, cu, hxx, h, hpp, h++, cc, hh, c, cuh, cxx, c++)
-Ignoring paddle/fluid/pybind/CMakeLists.txt; not a valid file name (h, cpp, cxx, h++, c++, hpp, c, cc, hh, cu, hxx, cuh)
-Ignoring paddle/infrt/CMakeLists.txt; not a valid file name (hpp, c, cpp, cu, c++, cuh, h++, cxx, hh, hxx, h, cc)
-Ignoring paddle/infrt/api/CMakeLists.txt; not a valid file name (hxx, cc, hh, c++, cxx, h++, cpp, cu, cuh, c, hpp, h)
-Ignoring paddle/infrt/common/CMakeLists.txt; not a valid file name (h, cu, hxx, c++, hpp, cxx, c, cc, cuh, cpp, h++, hh)
-Ignoring paddle/infrt/common/dtype.def; not a valid file name (c, hxx, h, cc, c++, hpp, cuh, hh, cxx, cu, h++, cpp)
-Ignoring paddle/infrt/dialect/CMakeLists.txt; not a valid file name (c, h, cuh, cpp, cxx, cu, cc, hpp, hh, hxx, h++, c++)
-Ignoring paddle/infrt/dialect/basic_kernels.td; not a valid file name (h++, cxx, c++, cpp, cc, h, hpp, hxx, cu, hh, c, cuh)
-Ignoring paddle/infrt/dialect/dense_tensor.td; not a valid file name (cu, hh, cpp, cxx, c, h++, hpp, h, cuh, cc, c++, hxx)
-Ignoring paddle/infrt/dialect/infrt_base.td; not a valid file name (cuh, cxx, hxx, h, hh, hpp, h++, c++, cpp, cu, cc, c)
-Ignoring paddle/infrt/dialect/mlir_tests/basic.mlir; not a valid file name (h, cxx, cc, cuh, cu, h++, hxx, cpp, hh, c, hpp, c++)
-Ignoring paddle/infrt/dialect/mlir_tests/benchmark.mlir; not a valid file name (c++, c, hxx, cuh, cxx, h, hh, hpp, h++, cc, cu, cpp)
-Ignoring paddle/infrt/dialect/mlir_tests/dense_tensor.mlir; not a valid file name (cpp, h, hpp, cuh, c, cu, hxx, c++, cxx, h++, cc, hh)
-Ignoring paddle/infrt/dialect/mlir_tests/paddle_ops.mlir; not a valid file name (c++, h, cc, cxx, cu, hpp, cpp, h++, c, cuh, hxx, hh)
-Ignoring paddle/infrt/dialect/mlir_tests/rewrite.mlir; not a valid file name (c, c++, hpp, h, cpp, cxx, h++, cc, cuh, hh, hxx, cu)
-Ignoring paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir; not a valid file name (hh, hpp, h++, cu, c, cuh, c++, hxx, cpp, h, cxx, cc)
-Ignoring paddle/infrt/dialect/mlir_tests/tensor_map.mlir; not a valid file name (cxx, hxx, h, hpp, cu, cc, c++, cpp, c, cuh, h++, hh)
-Ignoring paddle/infrt/dialect/mlir_tests/tensor_shape.mlir; not a valid file name (hxx, hh, cc, hpp, cpp, h++, h, cu, c++, cxx, c, cuh)
-Ignoring paddle/infrt/dialect/mlir_tests/tensor_type.mlir; not a valid file name (cuh, cc, cpp, h++, c, h, c++, hh, hxx, cu, cxx, hpp)
-Ignoring paddle/infrt/dialect/ops.td; not a valid file name (h++, cu, hh, cc, c++, hpp, cxx, cpp, cuh, hxx, h, c)
-Ignoring paddle/infrt/dialect/pd_op_base.td; not a valid file name (c++, h++, cu, hh, h, hpp, cuh, cpp, c, cc, hxx, cxx)
-Ignoring paddle/infrt/dialect/pd_ops.td; not a valid file name (cxx, h, h++, hh, cc, c++, cuh, cu, cpp, hpp, hxx, c)
-Ignoring paddle/infrt/dialect/rewrite.td; not a valid file name (c++, c, cxx, h, cc, hpp, cuh, h++, cpp, hh, cu, hxx)
-Ignoring paddle/infrt/dialect/tensor_shape.td; not a valid file name (hxx, h, h++, c, hpp, c++, cuh, cpp, cc, cxx, hh, cu)
-Ignoring paddle/infrt/dialect/tensor_shape_base.td; not a valid file name (h, cc, cpp, cuh, cu, c, hxx, cxx, hpp, c++, h++, hh)
-Ignoring paddle/infrt/dialect/test_kernels.td; not a valid file name (cpp, hpp, cxx, cc, h++, cuh, h, hh, cu, hxx, c++, c)
-Ignoring paddle/infrt/external_kernels/CMakeLists.txt; not a valid file name (c, cxx, cu, hpp, c++, hxx, cpp, hh, h++, cc, cuh, h)
-Ignoring paddle/infrt/external_kernels/basic.mlir; not a valid file name (cu, h, cuh, hpp, hh, h++, c, cxx, hxx, c++, cc, cpp)
-Ignoring paddle/infrt/external_kernels/fc.mlir; not a valid file name (h, c, cpp, cu, c++, hpp, h++, hh, hxx, cuh, cxx, cc)
-Ignoring paddle/infrt/external_kernels/paddle.mlir; not a valid file name (h, c++, cc, cxx, cuh, hxx, h++, cu, hh, cpp, hpp, c)
-Ignoring paddle/infrt/host_context/CMakeLists.txt; not a valid file name (hh, hxx, cc, c, h++, cxx, hpp, cpp, cu, cuh, c++, h)
-Ignoring paddle/infrt/host_context/mlir_tests/basic.mlir; not a valid file name (cc, h, cpp, c++, hpp, cxx, cuh, hxx, h++, hh, c, cu)
-Ignoring paddle/infrt/host_context/mlir_tests/dense_tensor.mlir; not a valid file name (hxx, hpp, cpp, hh, cc, cuh, cu, c, h++, c++, h, cxx)
-Ignoring paddle/infrt/host_context/mlir_tests/shape.mlir; not a valid file name (hh, h++, cu, cpp, cxx, hxx, cuh, c++, c, hpp, cc, h)
-Ignoring paddle/infrt/kernel/CMakeLists.txt; not a valid file name (hxx, hpp, cuh, c, cc, c++, h++, cpp, cxx, cu, h, hh)
-Ignoring paddle/infrt/paddle/CMakeLists.txt; not a valid file name (cuh, c++, h, hpp, c, h++, cpp, hxx, cu, hh, cxx, cc)
-Ignoring paddle/infrt/paddle/cpp/CMakeLists.txt; not a valid file name (cuh, h, cxx, cu, hxx, cpp, hpp, h++, hh, c++, cc, c)
-Ignoring paddle/infrt/paddle/framework.proto; not a valid file name (c++, h++, hpp, cpp, cxx, hxx, cuh, cc, c, cu, hh, h)
-Ignoring paddle/infrt/paddle/pb/CMakeLists.txt; not a valid file name (cuh, hxx, c, cu, h++, cxx, cc, cpp, c++, h, hpp, hh)
-Ignoring paddle/infrt/support/CMakeLists.txt; not a valid file name (cu, c++, h, cxx, c, cpp, cuh, hpp, cc, hh, h++, hxx)
-Ignoring paddle/infrt/tensor/CMakeLists.txt; not a valid file name (c++, cxx, cu, h, hh, cuh, c, cc, h++, hxx, hpp, cpp)
-Ignoring paddle/pten/CMakeLists.txt; not a valid file name (cpp, hxx, hh, cu, cxx, hpp, c, h, c++, cuh, h++, cc)
-Ignoring paddle/pten/api/lib/CMakeLists.txt; not a valid file name (hpp, cc, cuh, hh, cxx, h, cu, c, h++, cpp, c++, hxx)
-Ignoring paddle/pten/backends/CMakeLists.txt; not a valid file name (cc, hh, c++, cu, c, hxx, cpp, cxx, cuh, h++, hpp, h)
-Skipping input 'paddle/pten/kernels/functions/eigen/fill.h': Can't open for reading
-Ignoring paddle/pten/core/CMakeLists.txt; not a valid file name (cpp, h, h++, c, cxx, hxx, cc, hpp, c++, cuh, hh, cu)
-Ignoring paddle/pten/kernels/CMakeLists.txt; not a valid file name (hxx, cc, h++, c++, hh, cxx, c, cpp, cuh, cu, h, hpp)
-Ignoring paddle/pten/kernels/cpu/CMakeLists.txt; not a valid file name (cu, cpp, c++, cc, hxx, cuh, hpp, h++, cxx, hh, h, c)
-Skipping input 'paddle/pten/kernels/cuda/creation.h': Can't open for reading
-Ignoring paddle/pten/kernels/cuda/CMakeLists.txt; not a valid file name (c, h, hpp, hxx, c++, cpp, cu, cxx, cuh, cc, hh, h++)
-Skipping input 'paddle/pten/kernels/functions/blas/CMakeLists.txt': Can't open for reading
-Skipping input 'paddle/pten/kernels/cpu/creation.h': Can't open for reading
-Ignoring paddle/pten/kernels/hybird/CMakeLists.txt; not a valid file name (h, h++, hxx, c++, c, cxx, cu, hpp, cuh, cpp, hh, cc)
-Skipping input 'paddle/pten/kernels/functions/cpu/CMakeLists.txt': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/blas/elementwise.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/cuda/CMakeLists.txt': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/cpu/elementwise.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/eigen/CMakeLists.txt': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/cuda/cast_kernel_impl.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/cuda/elementwise/elementwise.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/cuda/elementwise/elementwise_broadcast.cu.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/cuda/elementwise/elementwise_common.cu.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/cuda/elementwise/elementwise_no_broadcast.cu.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/cuda/reduce/reduce.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/cuda/reduce/reduce_cuda_impl.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/general/CMakeLists.txt': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/eigen/common.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/eigen/dot.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/eigen/elementwise.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/eigen/reduce.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/eigen/sign.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/mkldnn/CMakeLists.txt': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/general/elementwise_base.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/general/elementwise_functor.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/general/manipulation.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/general/reduce_impl.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/math/cast_func.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/math/matmul_func.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/math/cpu/transpose.cc': Can't open for reading
-Skipping input 'paddle/pten/kernels/math/cuda/transpose.cu': Can't open for reading
-Skipping input 'paddle/pten/kernels/math/transpose.h': Can't open for reading
-Skipping input 'paddle/pten/kernels/functions/eigen/scale.h': Can't open for reading
-Ignoring paddle/pten/kernels/primitive/CMakeLists.txt; not a valid file name (hxx, cxx, cpp, h++, c++, c, hpp, h, hh, cc, cuh, cu)
-Ignoring paddle/pten/ops/CMakeLists.txt; not a valid file name (h++, c, hpp, cxx, cc, hh, h, c++, cu, cpp, cuh, hxx)
-Ignoring paddle/pten/tests/api/CMakeLists.txt; not a valid file name (c, hxx, c++, cu, h++, hh, hpp, cuh, cxx, h, cc, cpp)
-Ignoring paddle/pten/tests/kernels/CMakeLists.txt; not a valid file name (hxx, h, cuh, hh, hpp, c++, cxx, c, cu, cpp, h++, cc)
-Ignoring paddle/scripts/docker/root/.bashrc; not a valid file name (cu, h, hpp, h++, c++, cuh, c, cpp, hxx, cc, hh, cxx)
-Ignoring paddle/scripts/infrt_build.sh; not a valid file name (hh, c++, cc, hxx, h, cxx, cu, hpp, cuh, cpp, c, h++)
-Ignoring paddle/scripts/paddle_build.bat; not a valid file name (hh, cpp, hpp, c++, cc, c, cuh, hxx, cxx, cu, h, h++)
-Ignoring paddle/scripts/paddle_build.sh; not a valid file name (hpp, cxx, cuh, h, hxx, h++, c++, c, cc, hh, cu, cpp)
-Ignoring python/paddle/_C_ops.py; not a valid file name (cxx, cu, cpp, h, hh, hpp, hxx, c++, h++, cc, cuh, c)
-Ignoring python/paddle/__init__.py; not a valid file name (cxx, hpp, c, h, c++, cc, cpp, cu, hxx, cuh, h++, hh)
-Ignoring python/paddle/device/__init__.py; not a valid file name (hxx, c++, cxx, c, cpp, cuh, hpp, h++, h, cu, hh, cc)
-Ignoring python/paddle/distributed/auto_parallel/cluster.py; not a valid file name (c++, hh, hpp, cpp, cxx, cuh, cu, h++, hxx, h, c, cc)
-Ignoring python/paddle/distributed/auto_parallel/cost_model.py; not a valid file name (c, h++, hxx, hpp, h, cc, hh, cpp, cuh, cu, c++, cxx)
-Ignoring python/paddle/distributed/auto_parallel/mapper.py; not a valid file name (cpp, h++, hh, h, hxx, c++, c, cu, hpp, cuh, cxx, cc)
-Ignoring python/paddle/distributed/auto_parallel/operators/common.py; not a valid file name (hh, hpp, c, cuh, h++, cpp, cxx, cc, h, c++, hxx, cu)
-Ignoring python/paddle/distributed/auto_parallel/operators/dist_embedding.py; not a valid file name (h, hh, cpp, cc, hxx, c++, c, cuh, cu, hpp, h++, cxx)
-Ignoring python/paddle/distributed/auto_parallel/operators/dist_matmul.py; not a valid file name (c++, cc, c, cxx, h++, hxx, cuh, hpp, cpp, h, hh, cu)
-Ignoring python/paddle/distributed/auto_parallel/operators/dist_reshape.py; not a valid file name (h++, c, cxx, hpp, cpp, cuh, c++, cu, hh, hxx, h, cc)
-Ignoring python/paddle/distributed/auto_parallel/operators/dist_softmax.py; not a valid file name (hpp, hh, cpp, c++, h++, cxx, cuh, c, h, cc, hxx, cu)
-Ignoring python/paddle/distributed/auto_parallel/operators/dist_transpose.py; not a valid file name (hh, h, c++, cu, cuh, cxx, hpp, c, hxx, cpp, cc, h++)
-Ignoring python/paddle/distributed/auto_parallel/parallelizer.py; not a valid file name (c++, h, cxx, hxx, c, cpp, cu, cc, cuh, h++, hpp, hh)
-Ignoring python/paddle/distributed/auto_parallel/planner.py; not a valid file name (hh, hxx, cpp, h++, c++, hpp, c, h, cu, cxx, cuh, cc)
-Ignoring python/paddle/distributed/auto_parallel/process_group.py; not a valid file name (cuh, h, hxx, cpp, h++, hpp, hh, c++, cxx, c, cu, cc)
-Ignoring python/paddle/distributed/auto_parallel/utils.py; not a valid file name (h, cuh, cu, cc, cpp, h++, c, cxx, hpp, c++, hxx, hh)
-Ignoring python/paddle/distributed/collective.py; not a valid file name (hh, c++, c, hpp, h++, h, cpp, cc, cxx, cu, hxx, cuh)
-Ignoring python/paddle/distributed/fleet/base/distributed_strategy.py; not a valid file name (hxx, cuh, c++, cxx, h++, hpp, cpp, c, h, cu, hh, cc)
-Ignoring python/paddle/distributed/fleet/base/fleet_base.py; not a valid file name (cc, cuh, hxx, cxx, c, cu, hh, cpp, hpp, h, c++, h++)
-Ignoring python/paddle/distributed/fleet/base/private_helper_function.py; not a valid file name (cxx, h, hh, c, cuh, cpp, cc, h++, cu, hxx, hpp, c++)
-Ignoring python/paddle/distributed/fleet/fleet_executor_utils.py; not a valid file name (hpp, cc, h, h++, c++, cxx, cuh, cpp, hh, cu, c, hxx)
-Ignoring python/paddle/distributed/fleet/launch.py; not a valid file name (cuh, cu, hxx, h, cpp, h++, c, hh, hpp, c++, cxx, cc)
-Ignoring python/paddle/distributed/fleet/launch_utils.py; not a valid file name (hpp, h++, hxx, h, c, cxx, cuh, cc, cu, hh, cpp, c++)
-Ignoring python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py; not a valid file name (cu, hpp, h++, hh, hxx, c, cuh, cxx, cpp, cc, h, c++)
-Ignoring python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py; not a valid file name (cuh, cxx, hpp, hxx, cpp, cu, c++, h, hh, h++, cc, c)
-Ignoring python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py; not a valid file name (cpp, cc, hpp, hxx, hh, h++, cxx, cu, h, c++, c, cuh)
-Ignoring python/paddle/distributed/fleet/runtime/the_one_ps.py; not a valid file name (c, h, hxx, cc, cpp, cu, hh, cxx, h++, hpp, c++, cuh)
-Ignoring python/paddle/distributed/fleet/utils/internal_storage.py; not a valid file name (cpp, hh, h++, cxx, cc, c++, c, cuh, h, hpp, cu, hxx)
-Ignoring python/paddle/distributed/passes/pass_base.py; not a valid file name (hxx, cxx, h++, cuh, c++, cpp, h, hpp, cu, cc, c, hh)
-Ignoring python/paddle/distribution.py; not a valid file name (hxx, c++, hpp, cxx, cc, c, cu, h, h++, cpp, hh, cuh)
-Ignoring python/paddle/fluid/__init__.py; not a valid file name (hh, cuh, h++, c++, h, cpp, cxx, hxx, cu, cc, hpp, c)
-Ignoring python/paddle/fluid/clip.py; not a valid file name (c++, h, h++, cu, cc, hh, c, cxx, cpp, hxx, cuh, hpp)
-Ignoring python/paddle/fluid/contrib/mixed_precision/decorator.py; not a valid file name (cuh, c, hpp, hxx, cu, cpp, cxx, c++, hh, h, cc, h++)
-Ignoring python/paddle/fluid/contrib/mixed_precision/fp16_lists.py; not a valid file name (cpp, hxx, c, cxx, cu, hh, cc, h, cuh, c++, hpp, h++)
-Ignoring python/paddle/fluid/contrib/mixed_precision/fp16_utils.py; not a valid file name (c++, cxx, cpp, cuh, hxx, h++, h, hh, cu, hpp, c, cc)
-Ignoring python/paddle/fluid/contrib/slim/quantization/imperative/qat.py; not a valid file name (hxx, h, c++, hpp, h++, hh, cuh, cxx, cu, cpp, cc, c)
-Ignoring python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py; not a valid file name (c++, cpp, c, h++, hpp, h, cuh, cc, hxx, cu, hh, cxx)
-Ignoring python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py; not a valid file name (c, hxx, cuh, h++, hh, cpp, h, c++, cc, cu, cxx, hpp)
-Ignoring python/paddle/fluid/contrib/slim/tests/CMakeLists.txt; not a valid file name (hxx, h++, cuh, cxx, hpp, h, cc, hh, c++, cu, c, cpp)
-Ignoring python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py; not a valid file name (c++, cpp, cuh, cxx, hh, hxx, cc, c, h++, h, cu, hpp)
-Ignoring python/paddle/fluid/contrib/tests/test_amp_list.py; not a valid file name (h++, cc, hh, cpp, cuh, hpp, c, hxx, c++, cxx, cu, h)
-Ignoring python/paddle/fluid/core.py; not a valid file name (h, hh, h++, c, cpp, cu, cuh, hpp, hxx, c++, cc, cxx)
-Ignoring python/paddle/fluid/dataloader/dataloader_iter.py; not a valid file name (hh, h++, hxx, h, c, cc, c++, hpp, cu, cpp, cxx, cuh)
-Ignoring python/paddle/fluid/dygraph/amp/auto_cast.py; not a valid file name (hxx, cu, h, cpp, hpp, cxx, c, h++, c++, cc, hh, cuh)
-Ignoring python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py; not a valid file name (cuh, cpp, h++, cxx, hpp, hh, hxx, c, cu, h, cc, c++)
-Ignoring python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py; not a valid file name (hpp, cxx, h, hh, cpp, hxx, c++, cc, cu, c, cuh, h++)
-Ignoring python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py; not a valid file name (cpp, hh, c, h, cc, cuh, c++, cu, h++, hpp, cxx, hxx)
-Ignoring python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py; not a valid file name (hpp, cc, cpp, c++, cu, h, cuh, cxx, hxx, hh, h++, c)
-Ignoring python/paddle/fluid/dygraph/dygraph_to_static/utils.py; not a valid file name (cpp, c++, hxx, cuh, hpp, c, cu, h++, cc, h, cxx, hh)
-Ignoring python/paddle/fluid/dygraph/io.py; not a valid file name (h++, hxx, cu, cxx, hh, hpp, cuh, c++, cpp, cc, h, c)
-Ignoring python/paddle/fluid/dygraph/layers.py; not a valid file name (cu, hxx, c, cuh, hh, h, hpp, h++, cpp, cc, cxx, c++)
-Ignoring python/paddle/fluid/dygraph/varbase_patch_methods.py; not a valid file name (hxx, hh, hpp, cuh, cc, cpp, cxx, c, c++, cu, h, h++)
-Ignoring python/paddle/fluid/eager/eager_tensor_patch_methods.py; not a valid file name (cpp, c, cu, cc, hxx, hpp, hh, h++, cuh, h, c++, cxx)
-Ignoring python/paddle/fluid/executor.py; not a valid file name (hpp, hxx, hh, cc, cxx, h++, cuh, cu, h, c++, c, cpp)
-Ignoring python/paddle/fluid/framework.py; not a valid file name (c++, hpp, cu, h++, hh, hxx, cuh, cc, c, h, cpp, cxx)
-Ignoring python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py; not a valid file name (cc, c++, cpp, cu, c, hxx, cuh, hh, cxx, hpp, h, h++)
-Ignoring python/paddle/fluid/layers/control_flow.py; not a valid file name (hpp, hh, h++, h, cu, c, c++, cuh, cpp, cxx, hxx, cc)
-Ignoring python/paddle/fluid/layers/ops.py; not a valid file name (hh, cc, h, c, h++, hxx, cu, hpp, cuh, c++, cxx, cpp)
-Ignoring python/paddle/fluid/layers/tensor.py; not a valid file name (c, c++, cc, cpp, hxx, cxx, h, hh, hpp, cuh, h++, cu)
-Ignoring python/paddle/fluid/optimizer.py; not a valid file name (c++, cc, cpp, c, cxx, h++, cuh, hpp, hh, h, cu, hxx)
-Ignoring python/paddle/fluid/reader.py; not a valid file name (c++, c, h++, cuh, cc, hxx, h, hpp, cu, cxx, hh, cpp)
-Ignoring python/paddle/fluid/tests/unittests/CMakeLists.txt; not a valid file name (hh, cu, c, cxx, cuh, c++, hpp, hxx, h++, h, cc, cpp)
-Ignoring python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt; not a valid file name (cc, c++, cpp, cuh, hxx, h++, hpp, h, cxx, cu, hh, c)
-Ignoring python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py; not a valid file name (cuh, h, cc, cpp, hh, cxx, h++, c, hpp, c++, cu, hxx)
-Ignoring python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_planner.py; not a valid file name (hh, hpp, cc, cu, cuh, c, c++, h, cpp, hxx, cxx, h++)
-Ignoring python/paddle/fluid/tests/unittests/auto_parallel/launch.py; not a valid file name (h, cxx, cu, c++, hh, c, cc, cuh, hxx, hpp, h++, cpp)
-Ignoring python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py; not a valid file name (cuh, cxx, c, h++, h, hh, cu, hxx, cpp, c++, cc, hpp)
-Ignoring python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py; not a valid file name (hxx, hh, c++, hpp, cpp, c, cuh, cxx, h, cu, cc, h++)
-Ignoring python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py; not a valid file name (cpp, h++, hpp, hxx, h, c, c++, cxx, cc, cu, hh, cuh)
-Ignoring python/paddle/fluid/tests/unittests/distributed_passes/check_pass_conflict_example.py; not a valid file name (h, hxx, c, c++, cc, h++, hpp, cu, cuh, hh, cpp, cxx)
-Ignoring python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py; not a valid file name (cc, hh, c++, cu, cxx, hxx, hpp, cuh, cpp, h, c, h++)
-Ignoring python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py; not a valid file name (hpp, c++, h, hxx, cu, c, cpp, hh, h++, cxx, cuh, cc)
-Ignoring python/paddle/fluid/tests/unittests/distributed_passes/pass_run_main.py; not a valid file name (cu, hpp, cpp, hxx, cc, c++, cxx, c, h++, h, hh, cuh)
-Ignoring python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_all_reduce_pass.py; not a valid file name (cc, hxx, h++, cpp, cxx, hh, cu, c++, hpp, cuh, c, h)
-Ignoring python/paddle/fluid/tests/unittests/distributed_passes/test_white_lists.py; not a valid file name (hpp, cpp, cc, c, cxx, hh, h, h++, c++, cu, hxx, cuh)
-Ignoring python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py; not a valid file name (hh, cxx, h++, h, cuh, hpp, cpp, hxx, c, cc, cu, c++)
-Ignoring python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py; not a valid file name (c, h++, cc, hpp, h, cu, hxx, cuh, c++, hh, cpp, cxx)
-Ignoring python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py; not a valid file name (cc, h++, h, cxx, hxx, c++, hh, cuh, c, hpp, cpp, cu)
-Ignoring python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py; not a valid file name (hxx, cpp, cu, hh, cuh, cc, hpp, h, c, cxx, c++, h++)
-Ignoring python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py; not a valid file name (cxx, cc, hxx, hh, cuh, hpp, cpp, h++, c, cu, c++, h)
-Ignoring python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py; not a valid file name (hpp, cpp, hh, c, h, cuh, cxx, cc, c++, h++, hxx, cu)
-Ignoring python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py; not a valid file name (h, h++, hh, hxx, c, cu, c++, cxx, hpp, cpp, cc, cuh)
-Ignoring python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py; not a valid file name (hpp, hxx, hh, cpp, h++, c++, cc, h, cu, cxx, c, cuh)
-Ignoring python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py; not a valid file name (hxx, h++, cuh, cc, h, c++, cxx, cu, cpp, hpp, c, hh)
-Ignoring python/paddle/fluid/tests/unittests/ipu/ernie_training.py; not a valid file name (cu, c++, h, hh, h++, hpp, cc, c, cuh, hxx, cpp, cxx)
-Ignoring python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py; not a valid file name (cxx, h++, hh, cuh, c, hpp, c++, cpp, hxx, cu, h, cc)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py; not a valid file name (h++, h, cuh, c++, c, cu, cxx, cpp, cc, hxx, hpp, hh)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py; not a valid file name (hh, cu, c, c++, cxx, h++, hpp, cpp, cc, hxx, h, cuh)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py; not a valid file name (cuh, cu, h++, hxx, h, c, c++, hh, cc, cpp, hpp, cxx)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py; not a valid file name (hpp, c, c++, hxx, cc, h++, hh, cuh, cxx, cu, cpp, h)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py; not a valid file name (c++, cpp, cxx, cu, h++, hpp, hxx, h, cc, c, cuh, hh)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py; not a valid file name (c, cpp, h, hpp, cxx, cuh, hxx, hh, h++, cc, c++, cu)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py; not a valid file name (cxx, cc, hh, cuh, hxx, cpp, cu, hpp, h++, c++, h, c)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py; not a valid file name (cxx, c++, h, cpp, h++, cuh, c, cu, hpp, hh, cc, hxx)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py; not a valid file name (cxx, cuh, c++, hpp, hh, c, h, h++, cu, cc, hxx, cpp)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py; not a valid file name (c++, h, cc, hxx, cu, hh, cxx, cpp, h++, hpp, c, cuh)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py; not a valid file name (hxx, cpp, hpp, cc, c++, hh, h, cuh, c, cu, cxx, h++)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py; not a valid file name (h, hh, hpp, cuh, h++, c, cxx, cc, hxx, cpp, c++, cu)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py; not a valid file name (c++, cxx, cc, cpp, cuh, hh, h++, c, hpp, hxx, cu, h)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py; not a valid file name (hpp, cuh, hh, cpp, cxx, cu, hxx, h, h++, c++, c, cc)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py; not a valid file name (cpp, hpp, cxx, h, hxx, cuh, cu, cc, hh, c++, h++, c)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py; not a valid file name (cxx, cu, cc, hxx, h, c, cpp, cuh, h++, hpp, c++, hh)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py; not a valid file name (c, cu, h, hh, cpp, hpp, hxx, c++, cuh, h++, cc, cxx)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py; not a valid file name (h++, hh, cuh, cpp, h, c, cu, cc, hxx, c++, hpp, cxx)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py; not a valid file name (hh, cc, h++, h, cpp, c++, c, hpp, cuh, hxx, cu, cxx)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py; not a valid file name (cuh, cc, hh, cxx, h, c++, hpp, c, cpp, h++, hxx, cu)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py; not a valid file name (cc, h++, hxx, cuh, hpp, cpp, c, cu, cxx, c++, h, hh)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py; not a valid file name (cpp, h++, hh, cuh, cxx, hpp, h, hxx, cc, c, c++, cu)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py; not a valid file name (c++, cu, h++, cuh, c, cpp, hpp, cc, hxx, h, cxx, hh)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py; not a valid file name (hh, h++, hxx, c++, hpp, cc, cpp, cuh, cu, cxx, c, h)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py; not a valid file name (cc, hh, h++, cxx, c++, hxx, c, h, cu, cpp, cuh, hpp)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py; not a valid file name (cuh, hxx, cpp, cu, cxx, c, c++, h, hh, hpp, h++, cc)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py; not a valid file name (hpp, h, cc, cpp, cuh, hxx, c++, h++, cu, hh, c, cxx)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py; not a valid file name (cpp, hpp, c, cu, c++, cxx, h++, hh, cuh, cc, h, hxx)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py; not a valid file name (hxx, h++, cxx, hpp, cu, c++, cuh, cc, h, c, cpp, hh)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py; not a valid file name (h++, h, hh, cpp, hxx, c++, c, cuh, hpp, cc, cu, cxx)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py; not a valid file name (cxx, hh, c++, hpp, h, cpp, hxx, h++, c, cc, cuh, cu)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py; not a valid file name (cuh, hh, c++, cc, cxx, c, hpp, cpp, h++, cu, hxx, h)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py; not a valid file name (hh, cc, cu, cxx, cpp, c, c++, h++, cuh, hpp, hxx, h)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py; not a valid file name (cc, h++, cuh, cu, cpp, cxx, hxx, h, c, hpp, hh, c++)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py; not a valid file name (cc, h, hxx, hh, h++, cpp, hpp, c++, cuh, cu, c, cxx)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py; not a valid file name (hpp, cxx, h++, h, hh, cu, c++, c, hxx, cc, cpp, cuh)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py; not a valid file name (hpp, hh, c++, cc, c, h++, hxx, cu, cuh, cpp, h, cxx)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_save_load.py; not a valid file name (cuh, hxx, c, h, cu, cxx, cc, hh, h++, c++, hpp, cpp)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py; not a valid file name (hpp, c++, c, hh, cc, hxx, h++, cu, h, cxx, cpp, cuh)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py; not a valid file name (c++, cuh, hpp, cc, cu, h++, h, cxx, hxx, c, hh, cpp)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py; not a valid file name (cc, hh, c++, hpp, cxx, cuh, cu, hxx, h++, cpp, h, c)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py; not a valid file name (c, hh, cpp, cc, hxx, c++, hpp, cu, cxx, h++, cuh, h)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py; not a valid file name (hpp, cc, h++, cpp, cu, cxx, hxx, h, cuh, c, hh, c++)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py; not a valid file name (hxx, c++, cu, cuh, h++, cxx, cpp, hpp, hh, cc, c, h)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py; not a valid file name (h++, c, hxx, cxx, cuh, cpp, c++, hh, cc, hpp, h, cu)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py; not a valid file name (hpp, c, cxx, hxx, cu, h, cc, h++, hh, cpp, c++, cuh)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py; not a valid file name (cu, cuh, h, c++, hpp, cxx, hh, hxx, h++, c, cpp, cc)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py; not a valid file name (h++, cpp, cuh, hh, c, hpp, cu, cc, cxx, c++, h, hxx)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py; not a valid file name (hh, c++, cpp, c, h, cxx, cc, hxx, cuh, cu, hpp, h++)
-Ignoring python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py; not a valid file name (cpp, hxx, hh, cc, cxx, cu, h++, c++, hpp, h, c, cuh)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt; not a valid file name (c, hpp, hh, h++, cxx, cu, cpp, h, cc, cuh, c++, hxx)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py; not a valid file name (hh, hxx, cu, hpp, h, cc, c++, cpp, c, h++, cxx, cuh)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/program_config.py; not a valid file name (c, cc, cu, cuh, hpp, cpp, hh, h, hxx, c++, cxx, h++)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py; not a valid file name (cuh, cc, c, c++, cu, h, hxx, hpp, cpp, h++, cxx, hh)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py; not a valid file name (h, cuh, cpp, c, c++, h++, hpp, hxx, cu, cc, hh, cxx)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py; not a valid file name (c, hh, hxx, hpp, cxx, c++, h, cu, cc, cpp, h++, cuh)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py; not a valid file name (hh, cxx, cpp, c++, hxx, h++, cu, cc, cuh, c, hpp, h)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py; not a valid file name (c++, h, c, cpp, hpp, cc, cu, hxx, hh, cuh, cxx, h++)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py; not a valid file name (cpp, cu, c++, h++, hpp, c, cuh, cxx, hxx, cc, h, hh)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py; not a valid file name (h, hxx, c, cc, cu, cuh, hpp, h++, cpp, cxx, c++, hh)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py; not a valid file name (h, cxx, hh, cuh, cu, c++, hpp, cpp, h++, cc, hxx, c)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py; not a valid file name (hh, c++, cu, cuh, hpp, hxx, cpp, h, h++, c, cc, cxx)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py; not a valid file name (hxx, h++, cuh, cxx, c, cc, hh, c++, cpp, h, cu, hpp)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_bn_fuse_pass.py; not a valid file name (cu, hh, c, cxx, c++, h++, cpp, hpp, h, hxx, cc, cuh)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py; not a valid file name (cuh, cxx, cpp, cc, h++, hxx, h, c++, c, cu, hh, hpp)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py; not a valid file name (hpp, cxx, hxx, cu, h, c, cuh, cc, c++, h++, cpp, hh)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_layer_norm_fuse_pass.py; not a valid file name (hpp, hh, h++, cu, hxx, c, cxx, c++, cpp, cuh, h, cc)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_matmul_scale_fuse_pass.py; not a valid file name (cuh, cc, h++, c++, h, cu, hh, hxx, cxx, hpp, cpp, c)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py; not a valid file name (c++, c, hpp, cu, cpp, hxx, h++, cuh, cc, hh, h, cxx)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py; not a valid file name (c, cc, cu, hh, cxx, h++, cpp, hxx, hpp, c++, h, cuh)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py; not a valid file name (cu, cc, h++, hxx, c++, cxx, hh, h, c, cuh, hpp, cpp)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py; not a valid file name (h++, hh, cu, cuh, cpp, c, hpp, c++, cxx, h, cc, hxx)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py; not a valid file name (cxx, cpp, hpp, cu, c++, cc, h, cuh, hxx, hh, c, h++)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_depthwise_conv_pass.py; not a valid file name (h++, c, cuh, hxx, cpp, hh, cc, hpp, c++, h, cxx, cu)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py; not a valid file name (cpp, cuh, hpp, hxx, hh, cc, h++, h, cxx, cu, c++, c)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py; not a valid file name (hpp, cc, cu, hh, cxx, cuh, hxx, cpp, c++, h++, h, c)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py; not a valid file name (cpp, cc, h++, cuh, c, c++, hpp, cu, h, cxx, hxx, hh)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_simplify_with_basic_ops_pass_autoscan.py; not a valid file name (h, h++, hh, cu, cxx, cuh, cpp, cc, c++, hpp, hxx, c)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py; not a valid file name (cu, cc, hpp, h, hh, cpp, c++, cuh, h++, hxx, cxx, c)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py; not a valid file name (hpp, cxx, hxx, c++, h++, h, hh, cpp, cu, c, cuh, cc)
-Ignoring python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py; not a valid file name (c++, hxx, h, cpp, cc, hh, cxx, c, h++, cu, cuh, hpp)
-Ignoring python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py; not a valid file name (hh, cpp, hpp, c, cuh, c++, h++, hxx, h, cc, cu, cxx)
-Ignoring python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py; not a valid file name (cpp, cxx, h, c++, cu, hpp, hxx, cc, cuh, h++, c, hh)
-Ignoring python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py; not a valid file name (h++, h, cu, c++, cxx, cuh, c, cc, hpp, cpp, hxx, hh)
-Ignoring python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_base_npu.py; not a valid file name (cc, cpp, hpp, c++, h++, cxx, c, h, hh, hxx, cu, cuh)
-Ignoring python/paddle/fluid/tests/unittests/test_activation_op.py; not a valid file name (cuh, hxx, cxx, cc, c++, c, hpp, h++, cu, h, cpp, hh)
-Ignoring python/paddle/fluid/tests/unittests/test_assign_op.py; not a valid file name (c++, cu, hh, hxx, h, cuh, cc, cpp, c, hpp, h++, cxx)
-Ignoring python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py; not a valid file name (hpp, hh, h++, cpp, cuh, cu, cc, h, c, cxx, c++, hxx)
-Ignoring python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py; not a valid file name (h++, cc, cxx, h, hh, cpp, hxx, c++, cuh, c, cu, hpp)
-Ignoring python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py; not a valid file name (c, hpp, cuh, h++, c++, h, hxx, cxx, cpp, hh, cu, cc)
-Ignoring python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py; not a valid file name (cxx, cc, hh, hpp, c++, cu, h++, cuh, hxx, h, cpp, c)
-Ignoring python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py; not a valid file name (h, c, cc, h++, hh, cuh, c++, cu, cxx, cpp, hxx, hpp)
-Ignoring python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py; not a valid file name (c++, cxx, hpp, hh, cuh, cc, h++, cu, hxx, c, cpp, h)
-Ignoring python/paddle/fluid/tests/unittests/test_base_layer.py; not a valid file name (h++, cxx, c, hh, cpp, c++, hpp, h, hxx, cu, cuh, cc)
-Ignoring python/paddle/fluid/tests/unittests/test_collective_base.py; not a valid file name (c++, cc, h, c, hh, cpp, cu, h++, cxx, cuh, hpp, hxx)
-Ignoring python/paddle/fluid/tests/unittests/test_complex_op.py; not a valid file name (c, cxx, cc, cuh, cpp, c++, hpp, h, hxx, h++, cu, hh)
-Ignoring python/paddle/fluid/tests/unittests/test_complex_view_op.py; not a valid file name (hxx, cc, cxx, hh, h, c, cpp, cuh, c++, hpp, h++, cu)
-Ignoring python/paddle/fluid/tests/unittests/test_cuda_graph.py; not a valid file name (cc, hh, hpp, cpp, h++, hxx, cxx, c, cuh, h, c++, cu)
-Ignoring python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py; not a valid file name (cpp, cuh, hpp, hh, h, hxx, c, cc, h++, cu, c++, cxx)
-Ignoring python/paddle/fluid/tests/unittests/test_distribution.py; not a valid file name (hxx, cpp, cxx, cc, hpp, h, hh, h++, c, cu, cuh, c++)
-Ignoring python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py; not a valid file name (cc, cpp, hh, h++, cu, cuh, c, h, hpp, hxx, c++, cxx)
-Ignoring python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py; not a valid file name (c, hh, c++, hpp, hxx, cc, h, h++, cxx, cu, cpp, cuh)
-Ignoring python/paddle/fluid/tests/unittests/test_egr_python_api.py; not a valid file name (hh, cc, hxx, cpp, h, c++, h++, hpp, cuh, cxx, cu, c)
-Ignoring python/paddle/fluid/tests/unittests/test_elementwise_min_op.py; not a valid file name (hpp, cpp, h++, cuh, cu, h, c, cc, cxx, hxx, hh, c++)
-Ignoring python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py; not a valid file name (c, cuh, hpp, cpp, h++, hh, h, hxx, cc, c++, cxx, cu)
-Ignoring python/paddle/fluid/tests/unittests/test_fleet_executor.py; not a valid file name (hxx, c++, hh, cu, h, cc, cxx, cpp, h++, hpp, c, cuh)
-Ignoring python/paddle/fluid/tests/unittests/test_fleet_executor_multi_devices.py; not a valid file name (h++, c, cu, h, hxx, cpp, c++, hpp, cuh, cc, cxx, hh)
-Ignoring python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py; not a valid file name (cc, hh, hxx, hpp, c, h, cuh, c++, cxx, cpp, cu, h++)
-Ignoring python/paddle/fluid/tests/unittests/test_fmax_op.py; not a valid file name (hpp, hh, cuh, h, cpp, cc, cxx, hxx, cu, c++, h++, c)
-Ignoring python/paddle/fluid/tests/unittests/test_fmin_op.py; not a valid file name (hh, cu, cc, hxx, h++, cuh, cxx, c, cpp, c++, hpp, h)
-Ignoring python/paddle/fluid/tests/unittests/test_gcd.py; not a valid file name (cc, h++, hxx, cu, c++, cpp, hh, h, hpp, cxx, c, cuh)
-Ignoring python/paddle/fluid/tests/unittests/test_gradient_clip.py; not a valid file name (h, hpp, c++, hh, cu, h++, c, cpp, cxx, cc, cuh, hxx)
-Ignoring python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py; not a valid file name (cc, hxx, cxx, h, h++, cpp, c, c++, cu, hpp, hh, cuh)
-Ignoring python/paddle/fluid/tests/unittests/test_imperative_optimizer.py; not a valid file name (cxx, cc, hh, cu, cuh, c++, hpp, hxx, h++, cpp, h, c)
-Ignoring python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py; not a valid file name (cuh, cu, hxx, cpp, h, hh, c++, cxx, cc, c, h++, hpp)
-Ignoring python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py; not a valid file name (h, cu, hxx, hh, c++, cuh, cc, c, h++, cpp, hpp, cxx)
-Ignoring python/paddle/fluid/tests/unittests/test_is_complex.py; not a valid file name (cpp, c, hxx, cxx, h, cc, cuh, hpp, h++, c++, hh, cu)
-Ignoring python/paddle/fluid/tests/unittests/test_is_integer.py; not a valid file name (cu, c, hxx, cuh, hh, cc, hpp, h++, cpp, h, c++, cxx)
-Ignoring python/paddle/fluid/tests/unittests/test_lambv2_op.py; not a valid file name (hxx, c++, hpp, cxx, h, c, cc, cpp, h++, cu, cuh, hh)
-Ignoring python/paddle/fluid/tests/unittests/test_layer_norm_op.py; not a valid file name (hh, c++, h, h++, cxx, cu, hpp, cpp, cc, c, hxx, cuh)
-Ignoring python/paddle/fluid/tests/unittests/test_lcm.py; not a valid file name (c, hh, hpp, h++, cxx, cpp, h, cc, hxx, cu, c++, cuh)
-Ignoring python/paddle/fluid/tests/unittests/test_lerp_op.py; not a valid file name (c++, cc, cuh, hxx, cu, c, h, cpp, cxx, hpp, hh, h++)
-Ignoring python/paddle/fluid/tests/unittests/test_logit_op.py; not a valid file name (hpp, c++, cpp, c, h++, h, cuh, cxx, cc, cu, hxx, hh)
-Ignoring python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py; not a valid file name (cuh, hxx, cpp, c, hpp, cxx, h++, cc, hh, cu, h, c++)
-Ignoring python/paddle/fluid/tests/unittests/test_max_op.py; not a valid file name (cu, cc, hh, h, cpp, cuh, h++, c, hpp, c++, hxx, cxx)
-Ignoring python/paddle/fluid/tests/unittests/test_mean_op.py; not a valid file name (cxx, hh, cuh, hpp, h++, cc, hxx, c, cpp, h, c++, cu)
-Ignoring python/paddle/fluid/tests/unittests/test_momentum_op.py; not a valid file name (cxx, h++, cc, h, cpp, hh, cu, c++, hxx, hpp, cuh, c)
-Ignoring python/paddle/fluid/tests/unittests/test_pylayer_op.py; not a valid file name (hxx, hh, c, cuh, cxx, hpp, h++, h, c++, cu, cc, cpp)
-Ignoring python/paddle/fluid/tests/unittests/test_repeat_interleave_op.py; not a valid file name (cpp, cxx, hxx, hpp, h++, cu, c++, h, hh, c, cc, cuh)
-Ignoring python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py; not a valid file name (hh, cc, h++, cuh, cpp, hxx, cu, cxx, c++, h, c, hpp)
-Ignoring python/paddle/fluid/tests/unittests/test_rot90_op.py; not a valid file name (cc, c++, h++, h, hpp, hxx, cuh, cu, cxx, cpp, c, hh)
-Ignoring python/paddle/fluid/tests/unittests/test_scatter_op.py; not a valid file name (hh, hpp, h, hxx, c++, cc, cxx, h++, cu, cuh, c, cpp)
-Ignoring python/paddle/fluid/tests/unittests/test_set_value_op.py; not a valid file name (cxx, h++, cu, cpp, c++, hh, hpp, cc, h, cuh, hxx, c)
-Ignoring python/paddle/fluid/tests/unittests/test_sparse_attention_op.py; not a valid file name (cc, hxx, h, cuh, hh, cpp, cu, c++, c, hpp, cxx, h++)
-Ignoring python/paddle/fluid/tests/unittests/test_translated_layer.py; not a valid file name (cu, hh, cpp, cxx, h++, hpp, c++, cuh, c, h, hxx, cc)
-Ignoring python/paddle/fluid/tests/unittests/test_transpose_op.py; not a valid file name (cc, c, hpp, cuh, cu, h, cpp, hh, cxx, c++, h++, hxx)
-Ignoring python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py; not a valid file name (cxx, cu, c++, hpp, h, cpp, hh, cuh, c, h++, hxx, cc)
-Ignoring python/paddle/fluid/transpiler/details/checkport.py; not a valid file name (cpp, cuh, hxx, c, cc, hh, cxx, c++, cu, h, h++, hpp)
-Ignoring python/paddle/framework/__init__.py; not a valid file name (cuh, c++, hpp, cu, h++, h, hh, cc, c, cpp, cxx, hxx)
-Ignoring python/paddle/nn/__init__.py; not a valid file name (cu, h, c, h++, cpp, hh, hxx, hpp, cc, c++, cuh, cxx)
-Ignoring python/paddle/nn/functional/__init__.py; not a valid file name (cxx, cu, hh, c++, cuh, c, cpp, hxx, h++, hpp, cc, h)
-Ignoring python/paddle/nn/functional/loss.py; not a valid file name (cxx, cuh, hxx, cc, hpp, c, cpp, c++, h++, h, hh, cu)
-Ignoring python/paddle/nn/functional/sparse_attention.py; not a valid file name (hxx, cuh, cxx, hpp, h++, h, cu, cpp, cc, hh, c++, c)
-Ignoring python/paddle/nn/layer/__init__.py; not a valid file name (h++, cxx, hxx, hpp, c, cu, c++, h, hh, cc, cpp, cuh)
-Ignoring python/paddle/nn/layer/loss.py; not a valid file name (c++, hh, cpp, cc, hpp, cu, cxx, h++, hxx, c, cuh, h)
-Ignoring python/paddle/nn/layer/pooling.py; not a valid file name (hh, hxx, cuh, c++, hpp, h, cc, cxx, cpp, h++, cu, c)
-Ignoring python/paddle/nn/layer/transformer.py; not a valid file name (hpp, cuh, h++, hxx, cc, h, hh, c, cpp, c++, cu, cxx)
-Ignoring python/paddle/optimizer/lamb.py; not a valid file name (cuh, c, hh, h, h++, cxx, cpp, c++, cu, cc, hpp, hxx)
-Ignoring python/paddle/optimizer/momentum.py; not a valid file name (hxx, cc, cxx, c++, cuh, h++, h, cu, cpp, hh, hpp, c)
-Ignoring python/paddle/optimizer/optimizer.py; not a valid file name (h++, c, cc, h, hpp, cxx, cuh, cpp, hxx, cu, c++, hh)
-Ignoring python/paddle/tensor/__init__.py; not a valid file name (cuh, hh, cpp, c, cc, cu, h, hpp, h++, hxx, cxx, c++)
-Ignoring python/paddle/tensor/attribute.py; not a valid file name (c, cc, cxx, hh, cu, h++, hpp, cuh, cpp, hxx, c++, h)
-Ignoring python/paddle/tensor/creation.py; not a valid file name (hh, hxx, h, h++, cc, cxx, c++, cuh, c, hpp, cpp, cu)
-Ignoring python/paddle/tensor/manipulation.py; not a valid file name (cc, hh, cuh, cxx, cpp, h++, cu, c++, h, c, hpp, hxx)
-Ignoring python/paddle/tensor/math.py; not a valid file name (hpp, h, cuh, c++, cxx, cpp, hxx, hh, h++, c, cc, cu)
-Ignoring python/paddle/tensor/stat.py; not a valid file name (cc, hh, cu, h, c, c++, cpp, hxx, h++, cuh, hpp, cxx)
-Ignoring python/paddle/utils/code_gen/api.yaml; not a valid file name (hh, c, hxx, h++, cpp, cuh, cxx, h, c++, cu, hpp, cc)
-Ignoring python/paddle/utils/code_gen/api_gen.py; not a valid file name (cuh, hxx, cpp, hh, cxx, c, c++, h++, hpp, cu, cc, h)
-Ignoring python/paddle/utils/cpp_extension/cpp_extension.py; not a valid file name (cc, hxx, cu, hpp, cuh, h, hh, h++, c, cxx, c++, cpp)
-Ignoring python/paddle/utils/cpp_extension/extension_utils.py; not a valid file name (cxx, cuh, h++, hxx, cc, h, c++, hh, c, hpp, cu, cpp)
-Ignoring python/requirements.txt; not a valid file name (c, hh, cu, hpp, c++, cc, cxx, cpp, hxx, h, cuh, h++)
-Ignoring python/setup.py.in; not a valid file name (c, cu, cxx, cc, cuh, h++, hh, cpp, hxx, hpp, h, c++)
-Ignoring python/unittest_py/requirements.txt; not a valid file name (c++, hpp, cpp, hh, cuh, cxx, h++, hxx, h, cu, c, cc)
-Ignoring tools/check_file_diff_approvals.sh; not a valid file name (h, cpp, hxx, cc, hpp, cxx, h++, c, cu, hh, c++, cuh)
-Ignoring tools/coverage/paddle_coverage.sh; not a valid file name (cu, hpp, hh, hxx, cpp, h++, h, c, cc, cxx, c++, cuh)
-Ignoring tools/dockerfile/Dockerfile.ipu; not a valid file name (c++, c, cu, cc, hxx, cpp, cuh, h, hh, cxx, h++, hpp)
-Ignoring tools/dockerfile/ci_dockerfile.sh; not a valid file name (cc, cu, h++, h, cuh, cpp, c, cxx, hh, hpp, hxx, c++)
-Ignoring tools/parallel_UT_rule.py; not a valid file name (c, c++, cu, cpp, cuh, h, hh, hxx, hpp, cxx, cc, h++)
-Ignoring tools/static_mode_white_list.py; not a valid file name (h, cuh, c, cc, hh, cxx, cpp, hxx, cu, hpp, h++, c++)
-Ignoring tools/windows/run_unittests.sh; not a valid file name (h++, cuh, h, hpp, cc, hh, cxx, c++, hxx, c, cu, cpp)
-
-pylint...............................................(no files to check)Skipped
-copyright_checker........................................................Passed

From 14a0382ab566aaaac46e3660387c0024fe651efd Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Thu, 23 Dec 2021 12:01:09 +0000
Subject: [PATCH 20/41] update the code

---
 CMakeLists.txt                                | 20 ++---
 cmake/configure.cmake                         |  6 +-
 cmake/generic.cmake                           | 11 ++-
 cmake/operators.cmake                         | 35 ++-------
 cmake/xpu2.cmake                              | 77 +++++--------------
 .../elementwise/elementwise_add_op.h          |  7 ++
 .../elementwise/elementwise_add_op.xpu        |  9 ++-
 7 files changed, 53 insertions(+), 112 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a476c1745414..bc5baddb884fe 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,14 +37,14 @@ project(paddle CXX C)
 
 # enable language CUDA
 # TODO(Shibo Tao): remove find_package(CUDA) completely.
-# find_package(CUDA QUIET) (TODO:liuxiandong)
+find_package(CUDA QUIET)
 find_package(MKL CONFIG QUIET)
 option(WITH_ONEMKL      "Compile PaddlePaddle with oneMKL"              OFF)
-#option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
+option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          OFF)
 option(WITH_TENSORRT    "Compile PaddlePaddle with NVIDIA TensorRT"     OFF)
 option(WITH_XPU         "Compile PaddlePaddle with BAIDU KUNLUN XPU"    OFF)
-option(WITH_XPU2         "Compile PaddlePaddle with BAIDU KUNLUN XPU2"    OFF)
+option(WITH_XPU_KP         "Compile PaddlePaddle with BAIDU XPU compiler "    OFF)
 option(WITH_MLU    "Compile PaddlePaddle with CAMBRICON MLU"     OFF)
 option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode"    OFF)
 option(WITH_ASCEND         "Compile PaddlePaddle with ASCEND"        OFF)
@@ -61,7 +61,7 @@ include(generic)            # simplify cmake module
 if (WITH_GPU  AND WITH_XPU)
     message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
 endif()
-if (WITH_GPU  AND WITH_XPU2)
+if (WITH_GPU  AND WITH_XPU_KP)
     message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time")
 endif()
 if (WITH_GPU AND WITH_ASCEND)
@@ -276,20 +276,14 @@ if (NOT WITH_GPU AND WITH_NCCL)
         "Disable NCCL when compiling without GPU" FORCE)
 endif()
 
-# force XPU on when WITH_XPU2
-if (WITH_XPU2 AND NOT WITH_XPU)
+# force XPU on when WITH_XPU_KP
+if (WITH_XPU_KP AND NOT WITH_XPU)
     MESSAGE(WARNING
         "Enable XPU when compiling with XPU2. Force WITH_XPU=ON.")
     set(WITH_XPU ON CACHE STRING
         "Enable XPU when compiling with XPU2" FORCE)
 endif()
 
-if (WITH_XPU AND NOT WITH_XPU2)
-    set(WITH_XPU2 OFF CACHE STRING
-        "Disable XPU2 when compiling with XPU" FORCE)
-endif()
-
-#
 if (NOT WITH_XPU AND WITH_XPU_BKCL)
     MESSAGE(WARNING
         "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.")
@@ -334,7 +328,7 @@ if(WITH_ROCM)
     include(miopen) # set miopen libraries, must before configure
 endif(WITH_ROCM)
 
-if(WITH_XPU2)
+if(WITH_XPU_KP)
     include(xpu2)
 endif()
 
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 4f04a13467c0a..b87b91a97382a 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -97,9 +97,9 @@ if(WITH_XPU)
     add_definitions(-DPADDLE_WITH_XPU)
 endif()
 
-if(WITH_XPU2)
-    message(STATUS "Compile with XPU2!")
-    add_definitions(-DPADDLE_WITH_XPU2)
+if(WITH_XPU_KP)
+    message(STATUS "Compile with XPU_KP!")
+    add_definitions(-DPADDLE_WITH_XPU_KP)
 endif()
 
 if(WITH_IPU)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index cac92483136db..2d521d03d377e 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -400,7 +400,7 @@ function(cc_binary TARGET_NAME)
   if(WITH_ROCM)
     target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
   endif()
-  if(WITH_XPU2)
+  if(WITH_XPU_KP)
     # target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_API_LIB})
     # target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_RT_LIB})
   endif()
@@ -429,7 +429,7 @@ function(cc_test_build TARGET_NAME)
       target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
     endif()
     # added by lxd
-    if(WITH_XPU2)
+    if(WITH_XPU_KP)
       #target_link_libraries(${TARGET_NAME} ${XPU2_CLANGRTC_LIB})
     endif()
     check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS})
@@ -663,7 +663,7 @@ function(hip_test TARGET_NAME)
 endfunction(hip_test)
 
 function(xpu_library TARGET_NAME)
-  if (WITH_XPU2)
+  if (WITH_XPU_KP)
     set(options STATIC static SHARED shared)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
@@ -671,7 +671,6 @@ function(xpu_library TARGET_NAME)
 
     if(xpu_library_SRCS)
       if (xpu_library_SHARED OR xpu_library_shared) # build *.so
-        # xpu_add_library(${TARGET_NAME} SHARED ${xpu_library_SRCS})
         message(FATAL_ERROR "XPU kernel currently does not support dynamic links")
       else()
         xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS})
@@ -702,7 +701,7 @@ function(xpu_library TARGET_NAME)
 endfunction(xpu_library)
 
 function(xpu_binary TARGET_NAME)
-  if (WITH_XPU2)
+  if (WITH_XPU_KP)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
@@ -720,7 +719,7 @@ function(xpu_test TARGET_NAME)
   # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation
   # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files
   # other than *.py are modified.
-  if (WITH_XPU2 AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
+  if (WITH_XPU_KP AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(xpu_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 6ffee10a404df..70cb4190c85c8 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -65,7 +65,6 @@ function(op_library TARGET)
             if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu)
                 list(APPEND cudnn_cu_srcs ${CUDNN_FILE}.cu)
             endif()
-            # TODO(liuxiandong) add .kps file
         endif()
         if(WITH_ROCM)
             if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
@@ -99,8 +98,7 @@ function(op_library TARGET)
                 list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
             endif()
         endif()
-        if(WITH_XPU2)
-            # TODO(liuxiandong) xpu->kps
+        if(WITH_XPU_KP)
             if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu)
                 list(APPEND xpu2_cc_srcs ${TARGET}.xpu)
             endif()
@@ -143,9 +141,9 @@ function(op_library TARGET)
                 list(APPEND cu_cc_srcs ${src})
             elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
                 list(APPEND xpu_cc_srcs ${src})
-            elseif(WITH_XPU2 AND ${src} MATCHES ".*\\.xpu$")
+            elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$")
                 list(APPEND xpu2_cc_srcs ${src})
-            elseif(WITH_XPU2 AND ${src} MATCHES ".*_op_kps.cc$")
+            elseif(WITH_XPU_KP AND ${src} MATCHES ".*_op_kps.cc$")
                 list(APPEND xpu2_cc_srcs ${src})
             elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
                 list(APPEND npu_cc_srcs ${src})
@@ -161,25 +159,6 @@ function(op_library TARGET)
     
     list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
     list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
-
-    #TODO(liuxiandong) 
-    # if(WITH_XPU2 AND ${xpu2_cc_srcs_len})
-    #     foreach(src ${xpu2_cc_srcs})
-    #         #message(STATUS "lxd_debug src----------- ${src}")
-    #         get_filename_component(op_name ${src} NAME_WE)
-    #         #message(STATUS "lxd_debug op_name ${op_name}")
-    #         if(WITH_XPU)
-    #             if(xpu_cc_srcs MATCHES ".*_op_xpu.cc$") 
-    #                 #message(STATUS "the target is matched")
-    #                 list(REMOVE_ITEM xpu_cc_srcs "${op_name}_xpu.cc")
-    #             endif()
-    #         endif()
-    #     endforeach()
-    # endif()
-
-    list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
-    list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
-
     list(LENGTH cc_srcs cc_srcs_len)
     if (${cc_srcs_len} EQUAL 0)
         message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
@@ -242,7 +221,7 @@ function(op_library TARGET)
         list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
         hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
-    elseif (WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)
+    elseif (WITH_XPU_KP AND ${xpu2_cc_srcs_len} GREATER 0)
         xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu2_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps})
     else()
         # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
@@ -317,8 +296,8 @@ function(op_library TARGET)
     list(LENGTH npu_cc_srcs npu_cc_srcs_len)
     list(LENGTH mlu_cc_srcs mlu_cc_srcs_len)
     if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
-        ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND
-        ${npu_cc_srcs_len} EQUAL 0 AND ${mlu_cc_srcs_len} EQUAL 0)
+        ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu2_cc_srcs_len} EQUAL 0 AND
+        ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0 AND ${mlu_cc_srcs_len} EQUAL 0)
         file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
         set(pybind_flag 1)
     endif()
@@ -362,7 +341,7 @@ function(op_library TARGET)
 
     #message(STATUS "lxd_debug: cmake source dir is: ${CMAKE_SOURCE_DIR}")
 
-    if (WITH_XPU2 AND ${xpu2_cc_srcs_len} GREATER 0)
+    if (WITH_XPU_KP AND ${xpu2_cc_srcs_len} GREATER 0)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
index 550d64b7a33f8..27d67f65931b7 100644
--- a/cmake/xpu2.cmake
+++ b/cmake/xpu2.cmake
@@ -1,4 +1,4 @@
-if(NOT WITH_XPU2)
+if(NOT WITH_XPU_KP)
     return()
 endif()
 
@@ -77,9 +77,7 @@ macro(compile_kernel COMPILE_ARGS)
     )
 
   if(cc_depends)
-    message(STATUS "lxd_debug kernel dependencies: ${xpu_add_library_DEPENDS}")
     add_dependencies(${kernel_target} ${xpu_add_library_DEPENDS})
-    #target_link_libraries(${kernel_target} ${xpu_add_library_DEPENDS})
   endif()
 
   set(arg_device_o_extra_flags ${device_o_extra_flags})
@@ -89,12 +87,9 @@ macro(compile_kernel COMPILE_ARGS)
 
   set(XTDK_DIR ${XPU_TOOLCHAIN})
   set(CXX_DIR ${HOST_SYSROOT})
-  #-Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow
   set(XPU_CXX_INCLUDES  -I${CMAKE_SOURCE_DIR}/build -I${CMAKE_SOURCE_DIR}/paddle/fluid/framework/io -I${CMAKE_SOURCE_DIR}/build/third_party/install/zlib/include -I${CMAKE_SOURCE_DIR}/build/third_party/install -I${CMAKE_SOURCE_DIR}/build/third_party/install/gflags/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/glog/include -I${CMAKE_SOURCE_DIR}/build/third_party/boost/src/extern_boost -I${CMAKE_SOURCE_DIR}/build/third_party/eigen3/src/extern_eigen3 -I${CMAKE_SOURCE_DIR}/build/third_party/threadpool/src/extern_threadpool -I${CMAKE_SOURCE_DIR}/build/third_party/dlpack/src/extern_dlpack/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xxhash/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/warpctc/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/utf8proc/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/openblas/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/protobuf/include -I/usr/include/python3.7m -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I${CMAKE_SOURCE_DIR}/build/third_party/pybind/src/extern_pybind/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/gtest/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xpu/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/gloo/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xbyak/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xbyak/include/xbyak -I${CMAKE_SOURCE_DIR}/build/third_party/install/cryptopp/include -I${CMAKE_SOURCE_DIR}/build/third_party/pocketfft/src -I${CMAKE_SOURCE_DIR} -I${CMAKE_SOURCE_DIR}/paddle/fluid/platform)
-  #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -mavx -O3 -DNDEBUG )
   set(XPU_CXX_FLAGS  -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer  -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
-  #set(XPU_CXX_FLAGS -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=terminate -Wno-error=int-in-bool-context -Wimplicit-fallthrough=0 -Wno-error=maybe-uninitialized -Wno-format-truncation -Wno-error=cast-function-type -Wno-error=parentheses -Wno-error=catch-value -Wno-error=nonnull-compare -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
-  set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU2 -DPADDLE_WITH_XPU -DXBYAK64 -DXBYAK_NO_OP_NAMES)
+  set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU_KP -DPADDLE_WITH_XPU -DXBYAK64 -DXBYAK_NO_OP_NAMES)
 
   add_custom_command(
     OUTPUT
@@ -102,7 +97,6 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
       ${CMAKE_COMMAND} -E make_directory kernel_build
     COMMAND
-    # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11 
     ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
        -I.  -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu
         --xpu-device-only -c -v 
@@ -116,7 +110,6 @@ macro(compile_kernel COMPILE_ARGS)
       kernel_build/${kernel_name}.bin.o
     VERBATIM
     )
-    # TODO attention here
     list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o)
 
   add_custom_command(
@@ -125,7 +118,6 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
       ${CMAKE_COMMAND} -E make_directory kernel_build
     COMMAND
-    # TODO(liuxiandong) xpu->kps -I${XTDK_DIR}/include -std=c++11 
     ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
         -I.  -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu
         --xpu-host-only -c -v 
@@ -162,7 +154,6 @@ macro(xpu_add_library TARGET_NAME)
     # Distinguish .xpu file from other files
     foreach(cur_xpu_src IN LISTS xpu_srcs_lists)
       get_filename_component(language_type_name ${cur_xpu_src} EXT)
-      # TODO(liuxiandong) xpu->kps
       if(${language_type_name} STREQUAL ".xpu")
         list(APPEND xpu_kernel_lists ${cur_xpu_src})
       else()
@@ -179,10 +170,8 @@ macro(xpu_add_library TARGET_NAME)
             message(STATUS "Process ${xpu_kernel}")
             get_filename_component(kernel_name ${xpu_kernel} NAME_WE)
             get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY)
-            #message(STATUS "lxd_debug PATH ${kernel_dir}")
-            #TODO(liuxiandong set default rules)
             set(kernel_rules ${kernel_dir}/${kernel_name}.rules)
-            set(kernel_name ${kernel_name}) #DIRPATH ${kernel_dir}
+            set(kernel_name ${kernel_name})
             compile_kernel( KERNEL ${xpu_kernel} DIRPATH ${kernel_dir} XNAME ${kernel_name} DEVICE ${XPU1_DEVICE_O_EXTRA_FLAGS} HOST ${XPU1_HOST_O_EXTRA_FLAGS} XPU "xpu2" DEPENDS ${cc_srcs_depends})
         endforeach()
 
@@ -191,59 +180,29 @@ macro(xpu_add_library TARGET_NAME)
                 ${CMAKE_CURRENT_BINARY_DIR}
             DEPENDS
                 ${xpu_kernel_depends}
-                #${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+                ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
             COMMENT
                 ${xpu_target}_src
             VERBATIM
             )
 
-        # add_custom_command(
-        #     OUTPUT
-        #     ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
-        #     COMMAND
-        #         ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends}
-        #     WORKING_DIRECTORY
-        #         ${CMAKE_CURRENT_BINARY_DIR}
-        #     DEPENDS
-        #         ${xpu_kernel_depends}
-        #     COMMENT
-        #         ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
-        #     VERBATIM
-        #     ) 
+        add_custom_command(
+            OUTPUT
+            ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+            COMMAND
+                ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends}
+            WORKING_DIRECTORY
+                ${CMAKE_CURRENT_BINARY_DIR}
+            DEPENDS
+                ${xpu_kernel_depends}
+            COMMENT
+                ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+            VERBATIM
+            ) 
         
         add_library(${xpu_target} STATIC ${cc_kernel_lists})
         add_dependencies(${xpu_target} ${xpu_target}_src)
-        #target_link_libraries(${xpu_target} ${xpu_target}_src)
-        #target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
-
-        # TEST
-        # add_library(${xpu_target}_obj OBJECT ${cc_kernel_lists})
-        #add_dependencies(${xpu_target} ${xpu_target}_src)
-
-        # add_custom_target(${xpu_target} ALL
-        #   WORKING_DIRECTORY
-        #     ${CMAKE_CURRENT_BINARY_DIR}
-        #   DEPENDS
-        #     ${xpu_kernel_depends}
-        #     ${xpu_target}_obj
-        #     ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}.a
-        #   COMMENT
-        #     ${xpu_target}
-        #   VERBATIM
-        #   )
-        # add_custom_command(
-        #   OUTPUT
-        #     ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}.a
-        #   COMMAND
-        #     ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}.a ${xpu_kernel_depends} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${xpu_target}.dir/*.o
-        #   WORKING_DIRECTORY
-        #     ${CMAKE_CURRENT_BINARY_DIR}
-        #   DEPENDS
-        #     ${xpuapi_wrapper_a_depends}
-        #   COMMENT
-        #     ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}.a
-        #   VERBATIM
-        #   )
+        target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
     else()
         add_library(${xpu_target} STATIC ${cc_kernel_lists})
     endif()
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index baf0e2e0d9694..a4567beeb4f3d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -20,6 +20,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
+#include "paddle/fluid/framework/pten_utils.h"
+
+// only can include the headers in paddle/pten/include dirs
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/math.h"
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
index ceeedaeab1ab1..1a9a922132378 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
@@ -11,6 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
+/*
 #ifdef PADDLE_WITH_XPU
 
 #if defined(__CUDA_ARCH__)
@@ -29,9 +31,7 @@ limitations under the License. */
   #undef __NVCC__
 #endif
 
-#if defined(EIGEN_HAS_BUILTIN_INT128)
-  #undef EIGEN_HAS_BUILTIN_INT128
-#endif
+
 
 #include <xpu/runtime.h>
 #include "xpu/kernel/cluster_header.h"
@@ -47,6 +47,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+
 void ElementwiseAddXPU2Compute(const framework::ExecutionContext& ctx){
   std::cout<<"lxd_debug: XPU2 forward element_add !"<<std::endl;
 }
@@ -55,7 +56,9 @@ void ElementwiseAddGradXPU2Compute(const framework::ExecutionContext& ctx){
   std::cout<<"lxd_debug: XPU2 backward element_add !"<<std::endl;
 }
 
+
 }  // namespace operators
 }  // namespace paddle
 
 #endif
+*/
\ No newline at end of file

From 7a871a271c15c66abd9c744825818cb5be41c263 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Fri, 24 Dec 2021 03:12:17 +0000
Subject: [PATCH 21/41] update the code

---
 CMakeLists.txt        | 1 -
 cmake/generic.cmake   | 8 --------
 cmake/operators.cmake | 5 -----
 3 files changed, 14 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bc5baddb884fe..6de9b3550b9ff 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,6 @@ find_package(CUDA QUIET)
 find_package(MKL CONFIG QUIET)
 option(WITH_ONEMKL      "Compile PaddlePaddle with oneMKL"              OFF)
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
-option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          OFF)
 option(WITH_TENSORRT    "Compile PaddlePaddle with NVIDIA TensorRT"     OFF)
 option(WITH_XPU         "Compile PaddlePaddle with BAIDU KUNLUN XPU"    OFF)
 option(WITH_XPU_KP         "Compile PaddlePaddle with BAIDU XPU compiler "    OFF)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 2d521d03d377e..6655963e728f1 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -400,10 +400,6 @@ function(cc_binary TARGET_NAME)
   if(WITH_ROCM)
     target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
   endif()
-  if(WITH_XPU_KP)
-    # target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_API_LIB})
-    # target_link_libraries(${TARGET_NAME} ${XPU2_CLANG_RT_LIB})
-  endif()
 
   check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS})
 
@@ -428,10 +424,6 @@ function(cc_test_build TARGET_NAME)
     if(WITH_ROCM)
       target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
     endif()
-    # added by lxd
-    if(WITH_XPU_KP)
-      #target_link_libraries(${TARGET_NAME} ${XPU2_CLANGRTC_LIB})
-    endif()
     check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS})
   endif()
 endfunction()
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 70cb4190c85c8..b34b7e58f00af 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -290,8 +290,6 @@ function(op_library TARGET)
     list(LENGTH cu_cc_srcs cu_cc_srcs_len)
     list(LENGTH hip_cc_srcs hip_cc_srcs_len)
     list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
-    #list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
-    #list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
     list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
     list(LENGTH npu_cc_srcs npu_cc_srcs_len)
     list(LENGTH mlu_cc_srcs mlu_cc_srcs_len)
@@ -335,12 +333,9 @@ function(op_library TARGET)
     endif()
 
     if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0 AND ${xpu2_cc_srcs_len} EQUAL 0)
-        #message(STATUS "lxd_debug: ${TARGET} op in XPU1")
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
-    #message(STATUS "lxd_debug: cmake source dir is: ${CMAKE_SOURCE_DIR}")
-
     if (WITH_XPU_KP AND ${xpu2_cc_srcs_len} GREATER 0)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()

From 56e20520d2f49a87ee3053b0d8f559f8b5619f42 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Tue, 28 Dec 2021 06:55:05 +0000
Subject: [PATCH 22/41] add run_kp_kernel flag

---
 CMakeLists.txt                                |   2 +-
 cmake/operators.cmake                         |  29 +--
 cmake/xpu2.cmake                              | 209 ------------------
 paddle/fluid/framework/library_type.h         |   7 +-
 paddle/fluid/framework/operator.cc            |  24 ++
 paddle/fluid/framework/operator.h             |   1 +
 paddle/fluid/imperative/prepared_operator.cc  |   8 +
 paddle/fluid/imperative/prepared_operator.h   |   1 +
 .../fluid/platform/device/xpu/xpu_op_list.cc  |  42 ++++
 .../fluid/platform/device/xpu/xpu_op_list.h   |   3 +
 paddle/fluid/platform/flags.cc                |  12 +
 paddle/pten/common/data_type.h                |  16 +-
 12 files changed, 121 insertions(+), 233 deletions(-)
 delete mode 100644 cmake/xpu2.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6de9b3550b9ff..8706e3f6cacf1 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -328,7 +328,7 @@ if(WITH_ROCM)
 endif(WITH_ROCM)
 
 if(WITH_XPU_KP)
-    include(xpu2)
+    include(xpu_kp)
 endif()
 
 if (NOT WITH_ROCM AND WITH_RCCL)
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index b34b7e58f00af..60572f040cd75 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -11,7 +11,7 @@ function(op_library TARGET)
     set(cu_cc_srcs)
     set(hip_cc_srcs)
     set(xpu_cc_srcs)
-    set(xpu2_cc_srcs)
+    set(xpu_kp_cc_srcs)
     set(npu_cc_srcs)
     set(mlu_cc_srcs)
     set(cudnn_cu_cc_srcs)
@@ -100,11 +100,11 @@ function(op_library TARGET)
         endif()
         if(WITH_XPU_KP)
             if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu)
-                list(APPEND xpu2_cc_srcs ${TARGET}.xpu)
+                list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu)
             endif()
-            string(REPLACE "_op" "_op_kps" XPU2_FILE "${TARGET}")
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU2_FILE}.cc)
-                list(APPEND xpu2_cc_srcs ${XPU2_FILE}.cc)
+            string(REPLACE "_op" "_op_kps" XPU_KP_FILE "${TARGET}")
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_KP_FILE}.cc)
+                list(APPEND xpu_kp_cc_srcs ${XPU_KP_FILE}.cc)
             endif()
         endif()
         if(WITH_ASCEND_CL)
@@ -142,9 +142,9 @@ function(op_library TARGET)
             elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
                 list(APPEND xpu_cc_srcs ${src})
             elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$")
-                list(APPEND xpu2_cc_srcs ${src})
+                list(APPEND xpu_kp_cc_srcs ${src})
             elseif(WITH_XPU_KP AND ${src} MATCHES ".*_op_kps.cc$")
-                list(APPEND xpu2_cc_srcs ${src})
+                list(APPEND xpu_kp_cc_srcs ${src})
             elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
                 list(APPEND npu_cc_srcs ${src})
             elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
@@ -158,7 +158,7 @@ function(op_library TARGET)
     endif()
     
     list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
-    list(LENGTH xpu2_cc_srcs xpu2_cc_srcs_len)
+    list(LENGTH xpu_kp_cc_srcs xpu_kp_cc_srcs_len)
     list(LENGTH cc_srcs cc_srcs_len)
     if (${cc_srcs_len} EQUAL 0)
         message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
@@ -221,8 +221,8 @@ function(op_library TARGET)
         list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
         hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
-    elseif (WITH_XPU_KP AND ${xpu2_cc_srcs_len} GREATER 0)
-        xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu2_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps})
+    elseif (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
+        xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps})
     else()
         # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
         if(WITH_UNITY_BUILD AND op_library_UNITY)
@@ -294,7 +294,7 @@ function(op_library TARGET)
     list(LENGTH npu_cc_srcs npu_cc_srcs_len)
     list(LENGTH mlu_cc_srcs mlu_cc_srcs_len)
     if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
-        ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu2_cc_srcs_len} EQUAL 0 AND
+        ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_kp_cc_srcs_len} EQUAL 0 AND
         ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0 AND ${mlu_cc_srcs_len} EQUAL 0)
         file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
         set(pybind_flag 1)
@@ -332,12 +332,13 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
     endif()
 
-    if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0 AND ${xpu2_cc_srcs_len} EQUAL 0)
+    # pybind USE_OP_DEVICE_KERNEL for XPU
+    if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
-    if (WITH_XPU_KP AND ${xpu2_cc_srcs_len} GREATER 0)
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
+    if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, KP);\n")
     endif()
 
     if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
diff --git a/cmake/xpu2.cmake b/cmake/xpu2.cmake
deleted file mode 100644
index 27d67f65931b7..0000000000000
--- a/cmake/xpu2.cmake
+++ /dev/null
@@ -1,209 +0,0 @@
-if(NOT WITH_XPU_KP)
-    return()
-endif()
-
-if(NOT XPU_TOOLCHAIN)
-  set(XPU_TOOLCHAIN /workspace/paddle/xpu-demo/XTDK)
-  get_filename_component(XPU_TOOLCHAIN ${XPU_TOOLCHAIN} REALPATH)
-endif()
-if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN})
-  message(FATAL_ERROR "Directory ${XPU_TOOLCHAIN} not found!")
-endif()
-message(STATUS "Build with XPU_TOOLCHAIN=" ${XPU_TOOLCHAIN})
-set(XPU_CLANG ${XPU_TOOLCHAIN}/bin/clang++)
-message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG})
-
-if(NOT HOST_SYSROOT)
-  set(HOST_SYSROOT /opt/compiler/gcc-8.2)
-endif()
-
-if(NOT IS_DIRECTORY ${HOST_SYSROOT})
-  message(FATAL_ERROR "Directory ${HOST_SYSROOT} not found!")
-endif()
-
-if(NOT API_ARCH)
-  set(API_ARCH x86_64-baidu-linux-gnu)
-endif()
-
-if(API_ARCH MATCHES "x86_64")
-if(EXISTS ${HOST_SYSROOT}/bin/g++)
-  set(HOST_CXX ${HOST_SYSROOT}/bin/g++)
-  set(HOST_AR ${HOST_SYSROOT}/bin/ar)
-else()
-  set(HOST_CXX /usr/bin/g++)
-  set(HOST_AR /usr/bin/ar)
-endif()
-else()
-  set(HOST_CXX ${CMAKE_CXX_COMPILER})
-  set(HOST_AR ${CMAKE_AR})
-endif()
-
-set(TOOLCHAIN_ARGS )
-
-if(OPT_LEVEL)
-  set(OPT_LEVEL ${OPT_LEVEL})
-else()
-  set(OPT_LEVEL "-O2")
-endif()
-
-message(STATUS "Build with API_ARCH=" ${API_ARCH})
-message(STATUS "Build with TOOLCHAIN_ARGS=" ${TOOLCHAIN_ARGS})
-message(STATUS "Build with HOST_SYSROOT=" ${HOST_SYSROOT})
-message(STATUS "Build with HOST_CXX=" ${HOST_CXX})
-message(STATUS "Build with HOST_AR=" ${HOST_AR})
-
-macro(compile_kernel COMPILE_ARGS)
-  set(options "")
-  set(oneValueArgs "")
-  set(multiValueArgs KERNEL DIRPATH XNAME DEVICE HOST XPU DEPENDS)
-  cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  set(kernel_path ${xpu_add_library_DIRPATH})
-  set(kernel_name ${xpu_add_library_XNAME})
-  set(device_o_extra_flags ${xpu_add_library_DEVICE})
-  set(host_o_extra_flags ${xpu_add_library_HOST})
-  set(xpu_1_or_2 ${xpu_add_library_XPU})
-  set(cc_depends ${xpu_add_library_DEPENDS})
-
-  set(kernel_target ${kernel_name}_kernel)
-  add_custom_target(${kernel_target}
-    WORKING_DIRECTORY
-      ${CMAKE_CURRENT_BINARY_DIR}
-    DEPENDS
-      kernel_build/${kernel_name}.host.o
-      kernel_build/${kernel_name}.bin.o
-    COMMENT
-      ${kernel_target}
-    VERBATIM
-    )
-
-  if(cc_depends)
-    add_dependencies(${kernel_target} ${xpu_add_library_DEPENDS})
-  endif()
-
-  set(arg_device_o_extra_flags ${device_o_extra_flags})
-  separate_arguments(arg_device_o_extra_flags)
-  set(arg_host_o_extra_flags ${host_o_extra_flags})
-  separate_arguments(arg_host_o_extra_flags)
-
-  set(XTDK_DIR ${XPU_TOOLCHAIN})
-  set(CXX_DIR ${HOST_SYSROOT})
-  set(XPU_CXX_INCLUDES  -I${CMAKE_SOURCE_DIR}/build -I${CMAKE_SOURCE_DIR}/paddle/fluid/framework/io -I${CMAKE_SOURCE_DIR}/build/third_party/install/zlib/include -I${CMAKE_SOURCE_DIR}/build/third_party/install -I${CMAKE_SOURCE_DIR}/build/third_party/install/gflags/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/glog/include -I${CMAKE_SOURCE_DIR}/build/third_party/boost/src/extern_boost -I${CMAKE_SOURCE_DIR}/build/third_party/eigen3/src/extern_eigen3 -I${CMAKE_SOURCE_DIR}/build/third_party/threadpool/src/extern_threadpool -I${CMAKE_SOURCE_DIR}/build/third_party/dlpack/src/extern_dlpack/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xxhash/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/warpctc/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/utf8proc/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/openblas/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/protobuf/include -I/usr/include/python3.7m -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I${CMAKE_SOURCE_DIR}/build/third_party/pybind/src/extern_pybind/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/gtest/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xpu/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/gloo/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xbyak/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xbyak/include/xbyak -I${CMAKE_SOURCE_DIR}/build/third_party/install/cryptopp/include -I${CMAKE_SOURCE_DIR}/build/third_party/pocketfft/src -I${CMAKE_SOURCE_DIR} -I${CMAKE_SOURCE_DIR}/paddle/fluid/platform)
-  set(XPU_CXX_FLAGS  -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer  -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
-  set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU_KP -DPADDLE_WITH_XPU -DXBYAK64 -DXBYAK_NO_OP_NAMES)
-
-  add_custom_command(
-    OUTPUT
-      kernel_build/${kernel_name}.bin.o
-    COMMAND
-      ${CMAKE_COMMAND} -E make_directory kernel_build
-    COMMAND
-    ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
-       -I.  -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu
-        --xpu-device-only -c -v 
-    COMMAND
-      ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec  kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
-    WORKING_DIRECTORY
-      ${CMAKE_CURRENT_BINARY_DIR}
-    DEPENDS
-      ${xpu_add_library_DEPENDS}
-    COMMENT
-      kernel_build/${kernel_name}.bin.o
-    VERBATIM
-    )
-    list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o)
-
-  add_custom_command(
-    OUTPUT
-      kernel_build/${kernel_name}.host.o
-    COMMAND
-      ${CMAKE_COMMAND} -E make_directory kernel_build
-    COMMAND
-    ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
-        -I.  -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu
-        --xpu-host-only -c -v 
-    WORKING_DIRECTORY
-      ${CMAKE_CURRENT_BINARY_DIR}
-    DEPENDS
-      ${xpu_add_library_DEPENDS}
-    COMMENT
-      kernel_build/${kernel_name}.host.o
-    VERBATIM
-    )
-    list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o)
-endmacro()
-
-###############################################################################
-# XPU_ADD_LIBRARY
-###############################################################################
-macro(xpu_add_library TARGET_NAME)
-    # Separate the sources from the options
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs STATIC DEPENDS)
-    cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(xpu_srcs ${xpu_add_library_STATIC})
-    set(xpu_target ${TARGET_NAME})
-    set(cc_srcs_depends ${xpu_add_library_DEPENDS})
-    
-    file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs})
-    list(LENGTH xpu_srcs_lists xpu_srcs_lists_num)
-
-    set(XPU1_DEVICE_O_EXTRA_FLAGS " ")
-    set(XPU1_HOST_O_EXTRA_FLAGS " ")
-
-    # Distinguish .xpu file from other files
-    foreach(cur_xpu_src IN LISTS xpu_srcs_lists)
-      get_filename_component(language_type_name ${cur_xpu_src} EXT)
-      if(${language_type_name} STREQUAL ".xpu")
-        list(APPEND xpu_kernel_lists ${cur_xpu_src})
-      else()
-        list(APPEND cc_kernel_lists ${cur_xpu_src})
-      endif()
-    endforeach()
-
-    # Ensure that there is only one xpu kernel
-    list(LENGTH xpu_kernel_lists xpu_kernel_lists_num)
-    list(LENGTH cc_srcs_depends cc_srcs_depends_num)
-
-    if(${xpu_kernel_lists_num})
-        foreach(xpu_kernel IN LISTS xpu_kernel_lists)
-            message(STATUS "Process ${xpu_kernel}")
-            get_filename_component(kernel_name ${xpu_kernel} NAME_WE)
-            get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY)
-            set(kernel_rules ${kernel_dir}/${kernel_name}.rules)
-            set(kernel_name ${kernel_name})
-            compile_kernel( KERNEL ${xpu_kernel} DIRPATH ${kernel_dir} XNAME ${kernel_name} DEVICE ${XPU1_DEVICE_O_EXTRA_FLAGS} HOST ${XPU1_HOST_O_EXTRA_FLAGS} XPU "xpu2" DEPENDS ${cc_srcs_depends})
-        endforeach()
-
-        add_custom_target(${xpu_target}_src ALL
-            WORKING_DIRECTORY
-                ${CMAKE_CURRENT_BINARY_DIR}
-            DEPENDS
-                ${xpu_kernel_depends}
-                ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
-            COMMENT
-                ${xpu_target}_src
-            VERBATIM
-            )
-
-        add_custom_command(
-            OUTPUT
-            ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
-            COMMAND
-                ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends}
-            WORKING_DIRECTORY
-                ${CMAKE_CURRENT_BINARY_DIR}
-            DEPENDS
-                ${xpu_kernel_depends}
-            COMMENT
-                ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
-            VERBATIM
-            ) 
-        
-        add_library(${xpu_target} STATIC ${cc_kernel_lists})
-        add_dependencies(${xpu_target} ${xpu_target}_src)
-        target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
-    else()
-        add_library(${xpu_target} STATIC ${cc_kernel_lists})
-    endif()
-endmacro()
diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h
index 6fdd128b0d3bf..0cb6cac26a61c 100644
--- a/paddle/fluid/framework/library_type.h
+++ b/paddle/fluid/framework/library_type.h
@@ -26,6 +26,7 @@ enum class LibraryType {
   kPlain = 0,
   kMKLDNN = 1,
   kCUDNN = 2,
+  kKP = 3,
 };
 
 inline std::string LibraryTypeToString(const LibraryType& library_type) {
@@ -36,10 +37,12 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) {
       return "MKLDNN";
     case LibraryType::kCUDNN:
       return "CUDNN";
+    case LibraryType::kKP:
+      return "KP";
     default:
       PADDLE_THROW(platform::errors::Unimplemented(
           "Unknown LibraryType code (%d), only supports library type include "
-          "PLAIN(0), MKLDNN(1), CUDNN(2).",
+          "PLAIN(0), MKLDNN(1), CUDNN(2), KP(3).",
           static_cast<int>(library_type)));
   }
 }
@@ -57,6 +60,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
     return LibraryType::kCUDNN;
     // To be compatible with register macro.
     // CPU, CUDA, PLAIN are same library type.
+  } else if (s == std::string("KP")) {
+    return LibraryType::kKP;
   } else if (s == std::string("CPU")) {
     return LibraryType::kPlain;
   } else if (s == std::string("XPU")) {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 19cbb0991515b..b822a3b96d1d2 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -53,9 +53,11 @@ class LoDTensor;
 DECLARE_bool(benchmark);
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(enable_unused_var_check);
+DECLARE_bool(use_kp);
 PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0,
                              "number of threads for inner op");
 DECLARE_bool(run_pten_kernel);
+DECLARE_bool(run_kp_kernel);
 
 namespace paddle {
 namespace framework {
@@ -1320,6 +1322,12 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
   auto expected_kernel_key = InnerGetExpectedKernelType(ctx);
 
   auto kernel_iter = kernels.find(expected_kernel_key);
+
+  // added by liuxiandong for test
+  if (FLAGS_run_kp_kernel) {
+    std::cout << "****** FLAGS_use_kp  operator.cc *******";
+  }
+
 #ifdef PADDLE_WITH_MKLDNN
   // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
   if (kernel_iter == kernels.end() &&
@@ -1342,6 +1350,22 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
     kernel_iter = kernels.find(expected_kernel_key);
   }
 #endif
+
+#ifdef PADDLE_WITH_XPU_KP
+  bool use_xpu_kp_kernel_rt =
+      FLAGS_run_kp_kernel &&
+      paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key);
+  bool use_xpu_kp_kernel_debug =
+      paddle::platform::is_in_xpu_kpwhite_list(type_);
+  if (is_xpu_place(expected_kernel_key.place_) &&
+      (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
+    expected_kernel_key.library_type_ = LibraryType::kKP;
+    kernel_iter = kernels.find(expected_kernel_key);
+    VLOG(3) << "using XPU KP kernel: " << type_
+            << ", using_kernel_key:" << expected_kernel_key;
+  }
+#endif
+
 #ifdef PADDLE_WITH_ASCEND_CL
   if (kernel_iter == kernels.end() &&
       is_npu_place(expected_kernel_key.place_)) {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 842ef0457d7bd..f2c2def52d0ca 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -617,6 +617,7 @@ class OperatorWithKernel : public OperatorBase {
   // new pten kernel, if there is a better design in the future,
   // we may polish the implementation here
   mutable bool run_pten_kernel_ = false;
+  mutable bool run_kp_kernel = false;
   mutable std::unique_ptr<KernelSignature> pt_kernel_signature_;
   mutable std::unique_ptr<pten::Kernel> pt_kernel_;
   // In order to reduce the compatibility phase
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 80f7e7d4f4c5a..5d02438b4c8ed 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -24,11 +24,13 @@
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
 #endif
+#include "paddle/fluid/framework/library_type.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(run_pten_kernel);
 DECLARE_bool(benchmark);
+DECLARE_bool(run_kp_kernel);
 
 namespace paddle {
 namespace imperative {
@@ -195,6 +197,12 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 
   auto& kernels = kernels_iter->second;
   auto kernel_iter = kernels.find(expected_kernel_key);
+
+  // added by liuxiandong for test
+  if (FLAGS_run_kp_kernel) {
+    std::cout << "****** FLAGS_use_kp  prepared_operator.cc *******";
+  }
+
 #ifdef PADDLE_WITH_XPU
   if (is_xpu_place(expected_kernel_key.place_) &&
       (kernel_iter == kernels.end() ||
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 5262b265b1b53..770a52c630cc4 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -196,6 +196,7 @@ class PreparedOp {
   // new pten kernel, if there is a better design in the future,
   // we may polish the implementation here
   bool run_pten_kernel_{false};
+  bool run_kp_kernel_{false};
   framework::KernelSignature pt_kernel_signature_;
   pten::Kernel pt_kernel_;
   // In order to reduce the compatibility phase
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
index 7561830fc76c1..9066b84ce5c4c 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/xpu/xpu1_op_list.h"
 #include "paddle/fluid/platform/device/xpu/xpu2_op_list.h"
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
+#include "paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h"
 #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
 
 namespace paddle {
@@ -36,6 +37,22 @@ bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) {
   return false;
 }
 
+// Returns true when `op_name` has a kernel registered for `type` in the op
+// table selected by the XPU version of the device in `type.place_`.
+bool is_xpu_kp_support_op(const std::string& op_name,
+                          const pOpKernelType& type) {
+  auto v =
+      get_xpu_version(BOOST_GET_CONST(platform::XPUPlace, type.place_).device);
+  // Bind to the selected table. Assigning through a reference to the KL1
+  // table (`ops = get_kp_ops()`) would overwrite the KL1 table itself.
+  const auto& ops = (v == XPU2) ? get_kp_ops() : get_kl1_ops();
+  auto op_iter = ops.find(op_name);
+  if (op_iter == ops.end()) {
+    return false;
+  }
+  return op_iter->second.find(type) != op_iter->second.end();
+}
+
 // ops_string contains op_list(e.g., 'mul,mul_grad'), parse the op string and
 // insert op to op set
 static void tokenize(const std::string& ops, char delim,
@@ -75,6 +92,31 @@ bool is_in_xpu_black_list(const std::string& op_name) {
   return false;
 }
 
+// Returns true when `op_name` is listed in the XPU_KPWHITE_LIST environment
+// variable (a comma-separated list of op names).
+//
+// The variable is read once, on the first call. The set is a function-local
+// static built by a lambda, so C++11 guarantees it is initialized exactly
+// once even when several threads make the first call concurrently (checking
+// a plain `bool` flag outside a mutex, by contrast, would be a data race).
+// The parsed entries are logged at VLOG level 3.
+bool is_in_xpu_kpwhite_list(const std::string& op_name) {
+  static const std::unordered_set<std::string> xpu_kpwhite_list = [] {
+    std::unordered_set<std::string> op_set;
+    const char* env_ops = std::getenv("XPU_KPWHITE_LIST");
+    if (env_ops != nullptr) {
+      std::string ops(env_ops);
+      tokenize(ops, ',', &op_set);
+    }
+    VLOG(3) << "XPU kpwhite List: ";
+    for (auto iter = op_set.begin(); iter != op_set.end(); ++iter) {
+      VLOG(3) << *iter << " ";
+    }
+    return op_set;
+  }();
+  return xpu_kpwhite_list.find(op_name) != xpu_kpwhite_list.end();
+}
+
 }  // namespace platform
 }  // namespace paddle
 #endif
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h
index 705f701e13634..d532b610e8f9f 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h
@@ -21,7 +21,10 @@ namespace platform {
 using pOpKernelType = paddle::framework::OpKernelType;
 
 bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type);
+bool is_xpu_kp_support_op(const std::string& op_name,
+                          const pOpKernelType& type);
 bool is_in_xpu_black_list(const std::string& op_name);
+bool is_in_xpu_kpwhite_list(const std::string& op_name);
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 2df3d00dc924a..f2d4fcafd234e 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -694,6 +694,18 @@ PADDLE_DEFINE_EXPORTED_bool(
 PADDLE_DEFINE_EXPORTED_bool(run_pten_kernel, true,
                             "It controls whether to use pten kernel");
 
+/**
+ * KP kernel related FLAG
+ * Name: FLAGS_run_kp_kernel
+ * Since Version: 2.3.0
+ * Value Range: bool, default=false
+ * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in the
+ * Op.
+ * Note:
+ */
+PADDLE_DEFINE_EXPORTED_bool(run_kp_kernel, false,
+                            "It controls whether to run PaddlePaddle using KP");
+
 /**
  * Distributed related FLAG
  * Name: FLAGS_allreduce_record_one_event
diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h
index 438011d7b872a..252565385766d 100644
--- a/paddle/pten/common/data_type.h
+++ b/paddle/pten/common/data_type.h
@@ -14,14 +14,14 @@ limitations under the License. */
 
 #pragma once
 
-//#include "bfloat16.h"  // NOLINT
-//#include "complex.h"   // NOLINT
-//#include "float16.h"   // NOLINT
-
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
+#include "bfloat16.h"  // NOLINT
+#include "complex.h"   // NOLINT
+#include "float16.h"   // NOLINT
+
+// #include "paddle/fluid/platform/bfloat16.h"
+// #include "paddle/fluid/platform/complex.h"
+// #include "paddle/fluid/platform/enforce.h"
+// #include "paddle/fluid/platform/float16.h"
 #include "paddle/pten/api/ext/exception.h"
 
 namespace paddle {

From bfe52ab117553ad64362c328c9808e38817682c6 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Tue, 28 Dec 2021 07:19:25 +0000
Subject: [PATCH 23/41] update

---
 cmake/xpu_kp.cmake                            | 209 ++++++++++++++++++
 .../elementwise/elementwise_add_op_kps.cc     |  61 +++++
 .../platform/device/xpu/xpu_op_kpfirst_list.h |  46 ++++
 3 files changed, 316 insertions(+)
 create mode 100644 cmake/xpu_kp.cmake
 create mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
 create mode 100644 paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h

diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake
new file mode 100644
index 0000000000000..27d67f65931b7
--- /dev/null
+++ b/cmake/xpu_kp.cmake
@@ -0,0 +1,209 @@
+# Toolchain discovery for XPU KP kernels; skipped unless WITH_XPU_KP is set.
+if(NOT WITH_XPU_KP)
+    return()
+endif()
+
+# XTDK root: keep a caller-supplied XPU_TOOLCHAIN, else use the default path.
+if(NOT XPU_TOOLCHAIN)
+  get_filename_component(XPU_TOOLCHAIN /workspace/paddle/xpu-demo/XTDK REALPATH)
+endif()
+if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN})
+  message(FATAL_ERROR "Directory ${XPU_TOOLCHAIN} not found!")
+endif()
+message(STATUS "Build with XPU_TOOLCHAIN=" ${XPU_TOOLCHAIN})
+set(XPU_CLANG ${XPU_TOOLCHAIN}/bin/clang++)
+message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG})
+
+# Host sysroot: keep a caller-supplied HOST_SYSROOT, else use the default.
+if(NOT HOST_SYSROOT)
+  set(HOST_SYSROOT /opt/compiler/gcc-8.2)
+endif()
+if(NOT IS_DIRECTORY ${HOST_SYSROOT})
+  message(FATAL_ERROR "Directory ${HOST_SYSROOT} not found!")
+endif()
+
+if(NOT API_ARCH)
+  set(API_ARCH x86_64-baidu-linux-gnu)
+endif()
+
+# Host compiler/archiver: on x86_64 prefer the sysroot's g++/ar and fall back
+# to /usr/bin; on any other API_ARCH reuse the tools CMake already selected.
+if(NOT API_ARCH MATCHES "x86_64")
+  set(HOST_CXX ${CMAKE_CXX_COMPILER})
+  set(HOST_AR ${CMAKE_AR})
+elseif(EXISTS ${HOST_SYSROOT}/bin/g++)
+  set(HOST_CXX ${HOST_SYSROOT}/bin/g++)
+  set(HOST_AR ${HOST_SYSROOT}/bin/ar)
+else()
+  set(HOST_CXX /usr/bin/g++)
+  set(HOST_AR /usr/bin/ar)
+endif()
+
+set(TOOLCHAIN_ARGS )
+
+# Default the optimization level only when the caller has not chosen one.
+if(NOT OPT_LEVEL)
+  set(OPT_LEVEL "-O2")
+endif()
+
+message(STATUS "Build with API_ARCH=" ${API_ARCH})
+message(STATUS "Build with TOOLCHAIN_ARGS=" ${TOOLCHAIN_ARGS})
+message(STATUS "Build with HOST_SYSROOT=" ${HOST_SYSROOT})
+message(STATUS "Build with HOST_CXX=" ${HOST_CXX})
+message(STATUS "Build with HOST_AR=" ${HOST_AR})
+
+# compile_kernel(<first> DIRPATH <dir> XNAME <name> [DEVICE ...] [HOST ...]
+#                [XPU ...] [DEPENDS ...])
+# Adds the rules that compile <dir>/<name>.xpu twice with XPU_CLANG: into
+# kernel_build/<name>.bin.o (device code, post-processed by xpu2-elfconv) and
+# kernel_build/<name>.host.o (host code), plus a <name>_kernel target needing
+# both. Each output is appended to xpu_kernel_depends. As a macro it shares
+# the caller's variable scope. <first> binds to COMPILE_ARGS, so it is not in
+# ARGN and is never parsed.
+macro(compile_kernel COMPILE_ARGS)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs KERNEL DIRPATH XNAME DEVICE HOST XPU DEPENDS)
+  cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(kernel_path ${xpu_add_library_DIRPATH})
+  set(kernel_name ${xpu_add_library_XNAME})
+  set(device_o_extra_flags ${xpu_add_library_DEVICE})
+  set(host_o_extra_flags ${xpu_add_library_HOST})
+  set(xpu_1_or_2 ${xpu_add_library_XPU})
+  set(cc_depends ${xpu_add_library_DEPENDS})
+
+  # Aggregate target that requires both objects of this kernel.
+  set(kernel_target ${kernel_name}_kernel)
+  add_custom_target(${kernel_target}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.bin.o
+    COMMENT ${kernel_target}
+    VERBATIM)
+
+  if(cc_depends)
+    add_dependencies(${kernel_target} ${xpu_add_library_DEPENDS})
+  endif()
+
+  set(arg_device_o_extra_flags ${device_o_extra_flags})
+  separate_arguments(arg_device_o_extra_flags)
+  set(arg_host_o_extra_flags ${host_o_extra_flags})
+  separate_arguments(arg_host_o_extra_flags)
+
+  set(XTDK_DIR ${XPU_TOOLCHAIN})
+  set(CXX_DIR ${HOST_SYSROOT})
+  # Build-tree paths use CMAKE_BINARY_DIR; the tree need not be <src>/build.
+  set(XPU_CXX_INCLUDES  -I${CMAKE_BINARY_DIR} -I${CMAKE_SOURCE_DIR}/paddle/fluid/framework/io -I${CMAKE_BINARY_DIR}/third_party/install/zlib/include -I${CMAKE_BINARY_DIR}/third_party/install -I${CMAKE_BINARY_DIR}/third_party/install/gflags/include -I${CMAKE_BINARY_DIR}/third_party/install/glog/include -I${CMAKE_BINARY_DIR}/third_party/boost/src/extern_boost -I${CMAKE_BINARY_DIR}/third_party/eigen3/src/extern_eigen3 -I${CMAKE_BINARY_DIR}/third_party/threadpool/src/extern_threadpool -I${CMAKE_BINARY_DIR}/third_party/dlpack/src/extern_dlpack/include -I${CMAKE_BINARY_DIR}/third_party/install/xxhash/include -I${CMAKE_BINARY_DIR}/third_party/install/warpctc/include -I${CMAKE_BINARY_DIR}/third_party/install/utf8proc/include -I${CMAKE_BINARY_DIR}/third_party/install/openblas/include -I${CMAKE_BINARY_DIR}/third_party/install/protobuf/include -I/usr/include/python3.7m -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I${CMAKE_BINARY_DIR}/third_party/pybind/src/extern_pybind/include -I${CMAKE_BINARY_DIR}/third_party/install/gtest/include -I${CMAKE_BINARY_DIR}/third_party/install/xpu/include -I${CMAKE_BINARY_DIR}/third_party/install/gloo/include -I${CMAKE_BINARY_DIR}/third_party/install/xbyak/include -I${CMAKE_BINARY_DIR}/third_party/install/xbyak/include/xbyak -I${CMAKE_BINARY_DIR}/third_party/install/cryptopp/include -I${CMAKE_BINARY_DIR}/third_party/pocketfft/src -I${CMAKE_SOURCE_DIR} -I${CMAKE_SOURCE_DIR}/paddle/fluid/platform)
+  set(XPU_CXX_FLAGS  -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer  -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
+  set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU_KP -DPADDLE_WITH_XPU -DXBYAK64 -DXBYAK_NO_OP_NAMES)
+
+  # Device object: compile with --xpu-device-only, then run xpu2-elfconv.
+  add_custom_command(
+    OUTPUT kernel_build/${kernel_name}.bin.o
+    COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build
+    COMMAND
+    ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES}
+       -I.  -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu
+        --xpu-device-only -c -v
+    COMMAND
+      ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec  kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    # The .xpu source is a dependency so that editing it triggers a rebuild.
+    DEPENDS
+      ${kernel_path}/${kernel_name}.xpu ${xpu_add_library_DEPENDS}
+    COMMENT kernel_build/${kernel_name}.bin.o
+    VERBATIM
+    )
+    list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o)
+
+  # Host object: compile the same source with --xpu-host-only.
+  add_custom_command(
+    OUTPUT kernel_build/${kernel_name}.host.o
+    COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build
+    COMMAND
+    ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
+        -I.  -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu
+        --xpu-host-only -c -v
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS
+      ${kernel_path}/${kernel_name}.xpu ${xpu_add_library_DEPENDS}
+    COMMENT kernel_build/${kernel_name}.host.o
+    VERBATIM
+    )
+    list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o)
+endmacro()
+
+###############################################################################
+# xpu_add_library(<target> STATIC <source globs...> [DEPENDS <deps...>])
+# Builds <target> as a static library from the non-.xpu sources. Each .xpu
+# source is passed to compile_kernel(); the objects collected in
+# xpu_kernel_depends are archived into lib<target>_xpu.a and linked to <target>.
+# NOTE: this is a macro, so every variable set here lives in the caller's scope.
+###############################################################################
+macro(xpu_add_library TARGET_NAME)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs STATIC DEPENDS)
+    cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(xpu_srcs ${xpu_add_library_STATIC})
+    set(xpu_target ${TARGET_NAME})
+    set(cc_srcs_depends ${xpu_add_library_DEPENDS})
+
+    # The lists below are appended to, and a macro has no scope of its own:
+    # clear them so a second call does not pick up the previous call's files.
+    set(xpu_kernel_lists "")
+    set(cc_kernel_lists "")
+    set(xpu_kernel_depends "")
+
+    file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs})
+    list(LENGTH xpu_srcs_lists xpu_srcs_lists_num)
+    set(XPU1_DEVICE_O_EXTRA_FLAGS " ")
+    set(XPU1_HOST_O_EXTRA_FLAGS " ")
+
+    # Distinguish .xpu file from other files (quoted: EXT may be empty)
+    foreach(cur_xpu_src IN LISTS xpu_srcs_lists)
+      get_filename_component(language_type_name ${cur_xpu_src} EXT)
+      if("${language_type_name}" STREQUAL ".xpu")
+        list(APPEND xpu_kernel_lists ${cur_xpu_src})
+      else()
+        list(APPEND cc_kernel_lists ${cur_xpu_src})
+      endif()
+    endforeach()
+
+    list(LENGTH xpu_kernel_lists xpu_kernel_lists_num)
+    list(LENGTH cc_srcs_depends cc_srcs_depends_num)
+
+    if(xpu_kernel_lists_num GREATER 0)
+        foreach(xpu_kernel IN LISTS xpu_kernel_lists)
+            message(STATUS "Process ${xpu_kernel}")
+            get_filename_component(kernel_name ${xpu_kernel} NAME_WE)
+            get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY)
+            set(kernel_rules ${kernel_dir}/${kernel_name}.rules)
+            compile_kernel( KERNEL ${xpu_kernel} DIRPATH ${kernel_dir} XNAME ${kernel_name} DEVICE ${XPU1_DEVICE_O_EXTRA_FLAGS} HOST ${XPU1_HOST_O_EXTRA_FLAGS} XPU "xpu2" DEPENDS ${cc_srcs_depends})
+        endforeach()
+
+        add_custom_target(${xpu_target}_src ALL
+            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+            DEPENDS
+                ${xpu_kernel_depends}
+                ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+            COMMENT ${xpu_target}_src
+            VERBATIM
+            )
+
+        # Archive the kernel objects (paths are relative to the binary dir).
+        add_custom_command(
+            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+            COMMAND ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends}
+            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+            DEPENDS ${xpu_kernel_depends}
+            COMMENT ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+            VERBATIM
+            )
+
+        add_library(${xpu_target} STATIC ${cc_kernel_lists})
+        add_dependencies(${xpu_target} ${xpu_target}_src)
+        target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
+    else()
+        add_library(${xpu_target} STATIC ${cc_kernel_lists})
+    endif()
+endmacro()
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
new file mode 100644
index 0000000000000..ea68422aabbf9
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
@@ -0,0 +1,61 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include <memory>
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
+
+namespace paddle {
+namespace operators {
+
+// TODO(liuxiandong): add template
+void ElementwiseAddXPU2Compute(const framework::ExecutionContext& ctx);
+
+void ElementwiseAddGradXPU2Compute(const framework::ExecutionContext& ctx);
+
+template <typename T>
+class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // ElementwiseAddXPU2Compute(ctx);
+    std::cout << "lxd_debug: elementwise_add forward" << std::endl;
+  }
+};
+
+template <typename T>
+class ElementwiseAddGradXPUKernel
+    : public ::paddle::operators::ElemwiseGradKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // ElementwiseAddGradXPU2Compute(ctx);
+    std::cout << "lxd_debug: elementwise_add backward" << std::endl;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace,
+                   ops::ElementwiseAddXPUKernel<float>,
+                   ops::ElementwiseAddXPUKernel<paddle::platform::float16>);
+REGISTER_OP_KERNEL(elementwise_add_grad, KP, plat::XPUPlace,
+                   ops::ElementwiseAddGradXPUKernel<float>,
+                   ops::ElementwiseAddGradXPUKernel<paddle::platform::float16>);
+#endif
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
new file mode 100644
index 0000000000000..63f8814d13bd9
--- /dev/null
+++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_WITH_XPU_KP
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "paddle/fluid/framework/op_kernel_type.h"
+
+namespace paddle {
+namespace platform {
+
+using vartype = paddle::framework::proto::VarType;
+using pOpKernelType = paddle::framework::OpKernelType;
+using XPUKernelSet =
+    std::unordered_set<pOpKernelType, paddle::framework::OpKernelType::Hash>;
+using XPUOpMap = std::unordered_map<std::string, XPUKernelSet>;
+
+// Ops with an XPU KP kernel. `inline` because the definition is in a header.
+inline XPUOpMap& get_kp_ops() {
+  // Ops supported by KL1, indexed by op_name, data_type, place
+  static XPUOpMap s_xpu_kp_kernels{
+      {"elementwise_add_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_add",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      // AddMore
+  };
+  return s_xpu_kp_kernels;
+}
+
+}  // namespace platform
+}  // namespace paddle
+#endif

From 83da9f55538d5037e3155593e5d8a560796495c3 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Tue, 28 Dec 2021 12:02:29 +0000
Subject: [PATCH 24/41] update

---
 paddle/fluid/imperative/prepared_operator.cc | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 5d02438b4c8ed..754fcc221c5a6 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -213,10 +213,24 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
             << ", fallbacking to CPU one!";
     expected_kernel_key.place_ = platform::CPUPlace();
     kernel_iter = kernels.find(expected_kernel_key);
-  } else {
-    VLOG(3) << "This is XPU : " << op.Type();
   }
 #endif
+
+#ifdef PADDLE_WITH_XPU_KP
+  bool use_xpu_kp_kernel_rt =
+      FLAGS_run_kp_kernel &&
+      paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key);
+  bool use_xpu_kp_kernel_debug =
+      paddle::platform::is_in_xpu_kpwhite_list(type_);
+  if (is_xpu_place(expected_kernel_key.place_) &&
+      (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
+    expected_kernel_key.library_type_ = LibraryType::kKP;
+    kernel_iter = kernels.find(expected_kernel_key);
+    VLOG(3) << "using XPU KP kernel: " << type_
+            << ", using_kernel_key:" << expected_kernel_key;
+  }
+#endif
+
 #ifdef PADDLE_WITH_ASCEND_CL
   if (kernel_iter == kernels.end() &&
       is_npu_place(expected_kernel_key.place_)) {

From 8b9c96cdb46d3d5d7aa29fde1a58e1aa90b58098 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Wed, 29 Dec 2021 03:21:43 +0000
Subject: [PATCH 25/41] fix prepared type_ bug

---
 paddle/fluid/imperative/prepared_operator.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 754fcc221c5a6..f36b609d45258 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -219,14 +219,14 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 #ifdef PADDLE_WITH_XPU_KP
   bool use_xpu_kp_kernel_rt =
       FLAGS_run_kp_kernel &&
-      paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key);
+      paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key);
   bool use_xpu_kp_kernel_debug =
-      paddle::platform::is_in_xpu_kpwhite_list(type_);
+      paddle::platform::is_in_xpu_kpwhite_list(op.Type());
   if (is_xpu_place(expected_kernel_key.place_) &&
       (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
     expected_kernel_key.library_type_ = LibraryType::kKP;
     kernel_iter = kernels.find(expected_kernel_key);
-    VLOG(3) << "using XPU KP kernel: " << type_
+    VLOG(3) << "using XPU KP kernel: " << op.Type()
             << ", using_kernel_key:" << expected_kernel_key;
   }
 #endif

From 12522e10f939faab25a423b357e4294e3cbdc5df Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Wed, 5 Jan 2022 06:12:46 +0000
Subject: [PATCH 26/41] clean and update the code

---
 cmake/operators.cmake                         | 25 +++++++-
 paddle/fluid/imperative/prepared_operator.cc  | 29 +++++++--
 .../elementwise/elementwise_add_op.xpu        | 45 ++++++++------
 .../elementwise/elementwise_add_op_kps.cc     | 61 -------------------
 .../operators/elementwise/elementwise_op.h    |  1 +
 .../fluid/platform/device/xpu/xpu_op_list.cc  |  9 +++
 paddle/fluid/platform/enforce.h               |  3 +-
 7 files changed, 85 insertions(+), 88 deletions(-)
 delete mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 60572f040cd75..04835e76765f3 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -333,13 +333,32 @@ function(op_library TARGET)
     endif()
 
     # pybind USE_OP_DEVICE_KERNEL for XPU
+    if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, KP);\n")
+    endif()
+
     if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
-    if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, KP);\n")
-    endif()
+    # if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
+    #     file(READ ${ORIGINAL_TARGET}_xpu.cc TARGET_XPU_CONTENT)
+    #     # It is different from the logic above, becareful
+    #     string(REGEX MATCH "REGISTER_OP_XPU_KERNEL\\(.*" multi_xpu_register "${TARGET_XPU_CONTENT}")
+    #     # [ \t\r\n]* is used for blank characters
+    #     string(REGEX MATCH "REGISTER_OP_XPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_xpu_register "${multi_xpu_register}")
+
+    #     if (one_xpu_register STREQUAL "")
+    #         string(REPLACE "_op" "" XPU_TARGET "${TARGET}")
+    #     else ()
+    #         string(REPLACE "REGISTER_OP_XPU_KERNEL(" "" XPU_TARGET "${one_xpu_register}")
+    #         string(REPLACE "," "" XPU_TARGET "${XPU_TARGET}")
+    #         # [ \t\r\n]+ is used for blank characters.
+    #         # Here we use '+' instead of '*' since it is a REPLACE operation.
+    #         string(REGEX REPLACE "[ \t\r\n]+" "" XPU_TARGET "${XPU_TARGET}")
+    #     endif()
+    #     file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${XPU_TARGET}, XPU);\n")
+    # endif()
 
     if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
         file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT)
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index f36b609d45258..e59a67744cef1 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -161,6 +161,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
       op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs);
   auto expected_kernel_key = op.GetExpectedKernelType(dygraph_exe_ctx);
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+  VLOG(3) << "expected_kernel_key.place_:" << expected_kernel_key.place_;
 
   if (FLAGS_run_pten_kernel &&
       pten::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) {
@@ -198,11 +199,6 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
   auto& kernels = kernels_iter->second;
   auto kernel_iter = kernels.find(expected_kernel_key);
 
-  // added by liuxiandong for test
-  if (FLAGS_run_kp_kernel) {
-    std::cout << "****** FLAGS_use_kp  prepared_operator.cc *******";
-  }
-
 #ifdef PADDLE_WITH_XPU
   if (is_xpu_place(expected_kernel_key.place_) &&
       (kernel_iter == kernels.end() ||
@@ -213,7 +209,21 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
             << ", fallbacking to CPU one!";
     expected_kernel_key.place_ = platform::CPUPlace();
     kernel_iter = kernels.find(expected_kernel_key);
+
+    if (!is_xpu_place(expected_kernel_key.place_)) {
+      VLOG(3) << "lxd_debug: is not xpu_place";
+    }
+    if (kernel_iter == kernels.end()) {
+      VLOG(3) << "lxd_debug: can't find KP kernel";
+    }
+    if (!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key)) {
+      VLOG(3) << "lxd_debug: is not support current op";
+    }
+    if (paddle::platform::is_in_xpu_black_list(op.Type())) {
+      VLOG(3) << "lxd_debug: is in black list";
+    }
   }
+
 #endif
 
 #ifdef PADDLE_WITH_XPU_KP
@@ -222,9 +232,16 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
       paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key);
   bool use_xpu_kp_kernel_debug =
       paddle::platform::is_in_xpu_kpwhite_list(op.Type());
+  if (use_xpu_kp_kernel_rt) {
+    VLOG(3) << "lxd_debug: using rt mode ";
+  }
+  if (use_xpu_kp_kernel_debug) {
+    VLOG(3) << "lxd_debug: using debug mode ";
+  }
   if (is_xpu_place(expected_kernel_key.place_) &&
       (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
-    expected_kernel_key.library_type_ = LibraryType::kKP;
+    expected_kernel_key.place_ = platform::XPUPlace();
+    expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP;
     kernel_iter = kernels.find(expected_kernel_key);
     VLOG(3) << "using XPU KP kernel: " << op.Type()
             << ", using_kernel_key:" << expected_kernel_key;
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
index 1a9a922132378..43cfabe7ae96c 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-/*
-#ifdef PADDLE_WITH_XPU
 
 #if defined(__CUDA_ARCH__)
   #undef __CUDA_ARCH__
@@ -31,12 +29,9 @@ limitations under the License. */
   #undef __NVCC__
 #endif
 
-
-
+// Note(liuxiandong)
+#ifdef __xpu_on_host__
 #include <xpu/runtime.h>
-#include "xpu/kernel/cluster_header.h"
-#include "xpu/kernel/debug.h"
-#include "xpu/kernel/math.h"
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include <memory>
 #include <string>
@@ -47,18 +42,34 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-
-void ElementwiseAddXPU2Compute(const framework::ExecutionContext& ctx){
-  std::cout<<"lxd_debug: XPU2 forward element_add !"<<std::endl;
-}
-
-void ElementwiseAddGradXPU2Compute(const framework::ExecutionContext& ctx){
-  std::cout<<"lxd_debug: XPU2 backward element_add !"<<std::endl;
-}
-
+template <typename T>
+class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    std::cout << "lxd_debug: elementwise_add forward" << std::endl;
+  }
+};
+
+template <typename T>
+class ElementwiseAddGradXPUKernel
+    : public ::paddle::operators::ElemwiseGradKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    std::cout << "lxd_debug: elementwise_add backward" << std::endl;
+  }
+};
 
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace,
+                   ops::ElementwiseAddXPUKernel<float>,
+                   ops::ElementwiseAddXPUKernel<paddle::platform::float16>);
+
+REGISTER_OP_KERNEL(elementwise_add_grad, KP, plat::XPUPlace,
+                   ops::ElementwiseAddGradXPUKernel<float>,
+                   ops::ElementwiseAddGradXPUKernel<paddle::platform::float16>);
 #endif
-*/
\ No newline at end of file
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
deleted file mode 100644
index ea68422aabbf9..0000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef PADDLE_WITH_XPU
-
-#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
-#include <memory>
-#include <string>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
-
-namespace paddle {
-namespace operators {
-
-// TODO(liuxiandong): add template
-void ElementwiseAddXPU2Compute(const framework::ExecutionContext& ctx);
-
-void ElementwiseAddGradXPU2Compute(const framework::ExecutionContext& ctx);
-
-template <typename T>
-class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // ElementwiseAddXPU2Compute(ctx);
-    std::cout << "lxd_debug: elementwise_add forward" << std::endl;
-  }
-};
-
-template <typename T>
-class ElementwiseAddGradXPUKernel
-    : public ::paddle::operators::ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // ElementwiseAddGradXPU2Compute(ctx);
-    std::cout << "lxd_debug: elementwise_add backward" << std::endl;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace,
-                   ops::ElementwiseAddXPUKernel<float>,
-                   ops::ElementwiseAddXPUKernel<paddle::platform::float16>);
-REGISTER_OP_KERNEL(elementwise_add_grad, KP, plat::XPUPlace,
-                   ops::ElementwiseAddGradXPUKernel<float>,
-                   ops::ElementwiseAddGradXPUKernel<paddle::platform::float16>);
-#endif
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 91867c890d2c1..c455f53fc52a5 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -124,6 +124,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
                                      framework::LibraryType::kMKLDNN);
     }
 #endif
+
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
index 9066b84ce5c4c..78c19d518c304 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
@@ -28,6 +28,15 @@ bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) {
       get_xpu_version(BOOST_GET_CONST(platform::XPUPlace, type.place_).device);
   if (v == XPU2) {
     ops = get_kl2_ops();
+    VLOG(3) << "lxd_debug: op " << op_name << " get_kl2_ops";
+  }
+
+  // at() instead of operator[]: a diagnostic must not insert into the table.
+  if (ops.find(op_name) == ops.end()) {
+    VLOG(3) << "lxd_debug: can't find " << op_name << " in related list";
+  } else if (ops.at(op_name).find(type) == ops.at(op_name).end()) {
+    VLOG(3) << "lxd_debug: can't find type " << op_name
+            << " cause have no library_type";
+  }
 
   if (ops.find(op_name) != ops.end() &&
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index dcd90b27a3b88..f450156c77a0d 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -377,7 +377,8 @@ struct EnforceNotMet : public std::exception {
 
   EnforceNotMet(const ErrorSummary& error, const char* file, int line)
       : code_(error.code()),
-        err_str_(GetTraceBackString("lxd_debug error summy", file, line)) {
+        err_str_(GetTraceBackString(error.to_string(), file,
+                                    line)) {  // keep the summary's own text
     simple_err_str_ = SimplifyErrorTypeFormat(err_str_);
   }
 

From 90abd9c8a0bd6f54043ad02ff7eb2fd0afd58774 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Wed, 5 Jan 2022 06:29:19 +0000
Subject: [PATCH 27/41] reset the kernel_primitives

---
 .../compute_primitives_xpu2.h                 | 82 ++++++++++---------
 .../datamover_primitives_xpu2.h               | 68 +++++++--------
 .../kernel_primitives/functor_primitives.h    | 69 ++++++++--------
 3 files changed, 115 insertions(+), 104 deletions(-)

diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h b/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h
index 26678dd7d644c..3235591580916 100644
--- a/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h
+++ b/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h
@@ -48,31 +48,31 @@ static inline __device__ void sync_all() {
 #define ncores 64
 template <typename T, typename OpFunc, int VecSize>
 __device__ void BlockXReduce(T* data, OpFunc reducer) {
-  //  __shared__ T sum_array[ncores * VecSize];
-  //  int core_idx = core_id() * VecSize;
-  //  mfence();
-  //  sync_all();
-  //
-  // #pragma unroll
-  //  for (int i = 0; i < VecSize; i++) {
-  //    mfence();
-  //    sum_array[core_idx + i] = data[i];
-  //    mfence();
-  //    data[i] = 0;
-  //  }
-  //  sync_all();
-  // #pragma unroll
-  //  for (int i = 0; i < VecSize; i++) {
-  // #pragma unroll
-  //    for (int j = 0; j < ncores; j++) {
-  //      mfence();
-  //      T tmp = sum_array[j * VecSize + i];
-  //      mfence();
-  //      data[i] = reducer(data[i], tmp);
-  //      mfence();
-  //    }
-  //  }
-  //  sync_all();
+  __shared__ T sum_array[ncores * VecSize];
+  int core_idx = core_id() * VecSize;
+  mfence();
+  sync_all();
+
+#pragma unroll
+  for (int i = 0; i < VecSize; i++) {
+    mfence();
+    sum_array[core_idx + i] = data[i];
+    mfence();
+    data[i] = 0;
+  }
+  sync_all();
+#pragma unroll
+  for (int i = 0; i < VecSize; i++) {
+#pragma unroll
+    for (int j = 0; j < ncores; j++) {
+      mfence();
+      T tmp = sum_array[j * VecSize + i];
+      mfence();
+      data[i] = reducer(data[i], tmp);
+      mfence();
+    }
+  }
+  sync_all();
 }
 #undef ncores
 
@@ -104,7 +104,8 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) {
  */
 template <typename InT, typename OutT, int NX, int NY, int BlockSize,
           class OpFunc>
-__device__ void ElementwiseUnary(OutT* out, const InT* in, OpFunc compute) {
+__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
+                                                 OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; idx++) {
     out[idx] = static_cast<OutT>(compute(in[idx]));
@@ -138,8 +139,9 @@ __device__ void ElementwiseUnary(OutT* out, const InT* in, OpFunc compute) {
  */
 template <typename InT, typename OutT, int NX, int NY, int BlockSize,
           class OpFunc>
-__device__ void ElementwiseBinary(OutT* out, const InT* in1, const InT* in2,
-                                  OpFunc compute) {
+__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
+                                                  const InT* in2,
+                                                  OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; ++idx) {
     out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx]));
@@ -175,8 +177,10 @@ __device__ void ElementwiseBinary(OutT* out, const InT* in1, const InT* in2,
  */
 template <typename InT, typename OutT, int NX, int NY, int BlockSize,
           class OpFunc>
-__device__ void ElementwiseTernary(OutT* out, const InT* in1, const InT* in2,
-                                   const InT* in3, OpFunc compute) {
+__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
+                                                   const InT* in2,
+                                                   const InT* in3,
+                                                   OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; ++idx) {
     out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));
@@ -210,7 +214,8 @@ __device__ void ElementwiseTernary(OutT* out, const InT* in1, const InT* in2,
  */
 template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity,
           class OpFunc>
-__device__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], OpFunc compute) {
+__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
+                                               OpFunc compute) {
   __local__ InT args[Arity];
 #pragma unroll
   for (int idx = 0; idx < NX * NY; ++idx) {
@@ -250,8 +255,8 @@ __device__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], OpFunc compute) {
  */
 template <typename InT, typename OutT, int NX, int NY, int BlockSize,
           class OpFunc>
-__device__ void CycleBinary(OutT* out, const InT* in1, const InT* in2,
-                            OpFunc compute) {
+__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
+                                            const InT* in2, OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX; idx++) {
 #pragma unroll
@@ -289,11 +294,12 @@ __device__ void CycleBinary(OutT* out, const InT* in1, const InT* in2,
  * reducer: Compute function which was declared like ReduceFunctor<InT>().
  * reduce_last_dim: if the last dim gets involved in reduction.
  */
-template <typename T, int NX, int NY, int BlockSize, typename ReduceFunctor,
+template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor,
           details::ReduceMode Mode>
-__device__ void Reduce(T* out, const T* in, ReduceFunctor reducer,
-                       bool reduce_last_dim) {
-  if (Mode == details::kGlobalMode) {
+__device__ __forceinline__ void Reduce(T* out, const T* in,
+                                       ReduceFunctor reducer,
+                                       bool reduce_last_dim) {
+  if (Mode == details::kGlobalMode) {
 #pragma unroll
     for (int i = 0; i < NY; ++i) {
 #pragma unroll
@@ -301,7 +307,7 @@ __device__ void Reduce(T* out, const T* in, ReduceFunctor reducer,
         out[i] = reducer(out[i], in[i * NX + j]);
       }
     }
-    // BlockXReduce<T, ReduceFunctor, NY>(out, reducer);
+    BlockXReduce<T, ReduceFunctor, NY>(out, reducer);
   } else {  // else  kLocalMode
 #pragma unroll
     for (int i = 0; i < NY; ++i) {
diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h
index 6aef07d1bd463..b27ba27b3c6f1 100644
--- a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h
+++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h
@@ -56,11 +56,11 @@ struct BroadcastConfig {
     }
 
     for (int i = 1; i < dim_size - 1; ++i) {
-      strides_out[dim_size - i - 1] =
-          std::accumulate(out_dims.begin(), out_dims.begin() + i, 1,
-                          std::multiplies<int64_t>());
-      strides_in[dim_size - i - 1] = std::accumulate(
-          in_dims.begin(), in_dims.begin() + i, 1, std::multiplies<int64_t>());
+      strides_out[dim_size - i - 1] =
+          std::accumulate(out_dims.begin(), out_dims.begin() + i, 1,
+                          std::multiplies<int64_t>());
+      strides_in[dim_size - i - 1] = std::accumulate(
+          in_dims.begin(), in_dims.begin() + i, 1, std::multiplies<int64_t>());
     }
 
     memcpy(stride_in, strides_in.data(), kDims * sizeof(uint32_t));
@@ -99,11 +99,12 @@ struct BroadcastConfig {
  */
 template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
           bool IsBoundary = false>
-__device__ void ReadData(Ty* dst, const Tx _global_ptr_* src, int size_nx,
-                         int size_ny, int stride_nx, int stride_ny) {
+__device__ __forceinline__ void ReadData(Ty* dst, const Tx _global_ptr_* src,
+                                         int size_nx, int size_ny,
+                                         int stride_nx, int stride_ny) {
   int thread_offset = core_id();
   int left_size_nx = size_nx - thread_offset;
-  __local__ Tx in_temp[1];
+  __local__ Tx in_temp[1];
   // Each branch is added for better performance
   if (NX == 1 && NY == 1) {  // for NX == 1 and NY == 1
     if (IsBoundary) {
@@ -167,7 +168,7 @@ __device__ void ReadData(Ty* dst, const Tx _global_ptr_* src, int size_nx,
  * init_data: Initial value.
  */
 template <typename T, int NX>
-__device__ void Init(T* dst, T init_data) {
+__device__ __forceinline__ void Init(T* dst, T init_data) {
 #pragma unroll
   for (int i = 0; i < NX; i++) {
     dst[i] = init_data;
@@ -196,7 +197,8 @@ __device__ void Init(T* dst, T init_data) {
  * size: The current block needs to load size data continuously.
  */
 template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
-__device__ void ReadData(T* dst, const T _global_ptr_* src, int num) {
+__device__ __forceinline__ void ReadData(T* dst, const T _global_ptr_* src,
+                                         int num) {
   int thread_offset = core_id() * NX;
   __local__ T in_temp[1];
   if (IsBoundary) {  // core_num() * NX > num
@@ -239,10 +241,10 @@ __device__ void ReadData(T* dst, const T _global_ptr_* src, int num) {
  */
 template <typename T, int NX, int NY, int BlockSize, int Rank,
           bool IsBoundary = false>
-__device__ void ReadDataBc(T* dst, const T _global_ptr_* src,
-                           uint32_t block_offset,
-                           details::BroadcastConfig<Rank> config,
-                           int total_num_output, int stride_nx, int stride_ny) {
+__device__ __forceinline__ void ReadDataBc(
+    T* dst, const T _global_ptr_* src, uint32_t block_offset,
+    details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx,
+    int stride_ny) {
   uint32_t thread_offset = block_offset + core_id();
   uint32_t index_src = 0;
   __local__ T in_temp[1];
@@ -254,7 +256,7 @@ __device__ void ReadDataBc(T* dst, const T _global_ptr_* src,
       uint32_t index_output = thread_offset + ny * stride_ny + nx * stride_nx;
       index_src = 0;
       if (IsBoundary) {
-        if (index_output >= (uint32_t)total_num_output) {
+        if (index_output >= total_num_output) {
           break;
         }
       }
@@ -303,10 +305,10 @@ __device__ void ReadDataBc(T* dst, const T _global_ptr_* src,
  */
 template <typename T, int NX, int NY, int BlockSize, int Rank,
           typename IndexCal, bool IsBoundary = false>
-__device__ void ReadDataReduce(T* dst, const T _global_ptr_* src,
-                               int block_offset, const IndexCal& index_cal,
-                               int size_nx, int size_ny, int stride_nx,
-                               int stride_ny, bool reduce_last_dim) {
+__device__ __forceinline__ void ReadDataReduce(
+    T* dst, const T _global_ptr_* src, int block_offset,
+    const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
+    int stride_ny, bool reduce_last_dim) {
   __local__ T in_temp[1];
   int thread_offset = 0;
   int left_size_nx = size_nx;
@@ -419,8 +421,9 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
  */
 template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
           bool IsBoundary = false>
-__device__ void WriteData(Ty _global_ptr_* dst, const Tx* src, int size_nx,
-                          int size_ny, int stride_nx, int stride_ny) {
+__device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
+                                          int size_nx, int size_ny,
+                                          int stride_nx, int stride_ny) {
   int thread_offset = core_id();
   int left_size_nx = size_nx - thread_offset;
   __local__ Ty in_temp[1];
@@ -430,11 +433,11 @@ __device__ void WriteData(Ty _global_ptr_* dst, const Tx* src, int size_nx,
     if (IsBoundary) {
       if (left_size_nx > 0) {
         in_temp[0] = static_cast<Ty>(src[0]);
-        LM2GM(in_temp, dst + thread_offset, sizeof(Ty));
+        LM2GM(in_temp, dst + thread_offset, sizeof(T));
       }
     } else {
       in_temp[0] = static_cast<Ty>(src[0]);
-      LM2GM(in_temp, dst + thread_offset, sizeof(Ty));
+      LM2GM(in_temp, dst + thread_offset, sizeof(T));
     }
   } else if (NX == 1) {
 #pragma unroll
@@ -446,7 +449,7 @@ __device__ void WriteData(Ty _global_ptr_* dst, const Tx* src, int size_nx,
       }
 
       in_temp[0] = static_cast<Ty>(src[idy]);
-      LM2GM(in_temp, dst + thread_offset + idy * stride_ny, sizeof(Ty));
+      LM2GM(in_temp, dst + thread_offset + idy * stride_ny, sizeof(T));
     }
   } else if (NY == 1) {  // for NY == 1 and NX != 1
 #pragma unroll
@@ -458,7 +461,7 @@ __device__ void WriteData(Ty _global_ptr_* dst, const Tx* src, int size_nx,
       }
 
       in_temp[0] = static_cast<Ty>(src[idx]);
-      LM2GM(in_temp, dst + thread_offset + idx * stride_nx, sizeof(Ty));
+      LM2GM(in_temp, dst + thread_offset + idx * stride_nx, sizeof(T));
     }
   } else {  // for NX != 1 and NY != 1
 #pragma unroll
@@ -477,7 +480,7 @@ __device__ void WriteData(Ty _global_ptr_* dst, const Tx* src, int size_nx,
         }
         in_temp[0] = static_cast<Ty>(src[idx + idy * NX]);
         LM2GM(in_temp, dst + thread_offset + idx * stride_nx + idy * stride_ny,
-              sizeof(Ty));
+              sizeof(T));
       }
     }
   }
@@ -495,7 +498,7 @@ __device__ void WriteData(Ty _global_ptr_* dst, const Tx* src, int size_nx,
  * init_data: The register pointer of init data, the size is NX.
  */
 template <typename T, int NX, bool IsBoundary = false>
-__device__ void Init(T* dst, T* init_data, int num) {
+__device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
 #pragma unroll
   for (int i = 0; i < NX; i++) {
     if (IsBoundary) {
@@ -532,10 +535,9 @@ __device__ void Init(T* dst, T* init_data, int num) {
  */
 template <typename T, int NX, int NY, int BlockSize, int Rank,
           bool IsBoundary = false>
-__device__ void ReadDataBc(T* dst, const T _global_ptr_* src,
-                           uint32_t block_offset,
-                           details::BroadcastConfig<Rank> config,
-                           int total_num_output) {
+__device__ __forceinline__ void ReadDataBc(
+    T* dst, const T _global_ptr_* src, uint32_t block_offset,
+    details::BroadcastConfig<Rank> config, int total_num_output) {
   uint32_t thread_offset = block_offset + core_id() * NX;
   uint32_t index_src = 0;
   __local__ T in_temp[1];
@@ -545,7 +547,7 @@ __device__ void ReadDataBc(T* dst, const T _global_ptr_* src,
     uint32_t index_output = thread_offset + nx;
     index_src = 0;
     if (IsBoundary) {
-      if (index_output >= (uint32_t)total_num_output) {
+      if (index_output >= total_num_output) {
         break;
       }
     }
@@ -556,7 +558,7 @@ __device__ void ReadDataBc(T* dst, const T _global_ptr_* src,
       index_src += (tmp % config.shape_in[i]) * config.stride_in[i];
     }
     GM2LM(src + index_src, in_temp, sizeof(T));
-    dst[nx] = in_temp[0];
+    dst[nx + ny * NX] = in_temp[0];
   }
 }
 
diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h
index 7fd497bf394a7..2bd8721b82fa2 100644
--- a/paddle/fluid/operators/kernel_primitives/functor_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/functor_primitives.h
@@ -18,31 +18,27 @@
 #include "paddle/fluid/platform/eigen_ext.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
-#include "xpu/kernel/cluster_header.h"
-#include "xpu/kernel/debug.h"
-#include "xpu/kernel/math.h"
 
 namespace paddle {
 namespace operators {
 namespace kernel_primitives {
 namespace details {
 
-// static __device__ platform::float16 Exp(platform::float16 x) {
-//  // return 1;//::Eigen::numext::exp(x);
-//  //return static_cast<platform::float16>(1);//::Eigen::numext::log(x);
-//}
+static __device__ __forceinline__ platform::float16 Exp(platform::float16 x) {
+  return ::Eigen::numext::exp(x);
+}
 
-// static __device__  float Exp(float x) { return expf(x); }
+static __device__ __forceinline__ float Exp(float x) { return expf(x); }
 
-// static __device__  double Exp(double x) { return exp(x); }
+static __device__ __forceinline__ double Exp(double x) { return exp(x); }
 
-// static __device__  platform::float16 Log(platform::float16 x) {
-//  return static_cast<platform::float16>(1);//::Eigen::numext::log(x);
-//}
+static __device__ __forceinline__ platform::float16 Log(platform::float16 x) {
+  return ::Eigen::numext::log(x);
+}
 
-// static __device__  float Log(float x) { return logf(x); }
+static __device__ __forceinline__ float Log(float x) { return logf(x); }
 
-// static __device__  double Log(double x) { return log(x); }
+static __device__ __forceinline__ double Log(double x) { return log(x); }
 
 }  // namespace details
 
@@ -51,16 +47,16 @@ namespace details {
 /**
  * @brief Default unary exp functor
  */
-// template <typename Tx, typename Ty = Tx>
-// struct ExpFunctor {
-//  HOSTDEVICE inline ExpFunctor() {}
-//
-//  HOSTDEVICE explicit inline ExpFunctor(int n) {}
-//
-//  HOSTDEVICE inline Ty operator()(const Tx& x) const {
-//    return static_cast<Ty>(details::Exp(x));
-//  }
-//};
+template <typename Tx, typename Ty = Tx>
+struct ExpFunctor {
+  HOSTDEVICE inline ExpFunctor() {}
+
+  HOSTDEVICE explicit inline ExpFunctor(int n) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx& x) const {
+    return static_cast<Ty>(details::Exp(x));
+  }
+};
 
 /**
  * @brief Default unary identity functor
@@ -132,10 +128,9 @@ struct SquareFunctor {
  */
 template <typename T>
 struct MinFunctor {
-  inline T initial() { /*return static_cast<T>(std::numeric_limits<T>::max());*/
-  }
+  inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }
 
-  __device__ T operator()(const T& a, const T& b) const {
+  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
     return (b < a) ? b : a;
   }
 };
@@ -146,10 +141,10 @@ struct MinFunctor {
 template <typename T>
 struct MaxFunctor {
   inline T initial() {
-    // return static_cast<T>(std::numeric_limits<T>::lowest());
+    return static_cast<T>(std::numeric_limits<T>::lowest());
   }
 
-  __device__ T operator()(const T& a, const T& b) const {
+  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
     return (b > a) ? b : a;
   }
 };
@@ -161,7 +156,9 @@ template <typename T>
 struct AddFunctor {
   inline T initial() { return static_cast<T>(0.0f); }
 
-  __device__ T operator()(const T& a, const T& b) const { return b + a; }
+  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
+    return b + a;
+  }
 };
 
 /**
@@ -171,7 +168,9 @@ template <typename T>
 struct MulFunctor {
   inline T initial() { return static_cast<T>(1.0f); }
 
-  __device__ T operator()(const T& a, const T& b) const { return b * a; }
+  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
+    return b * a;
+  }
 };
 
 /**
@@ -181,7 +180,9 @@ template <typename T>
 struct LogicalOrFunctor {
   inline T initial() { return static_cast<T>(false); }
 
-  __device__ T operator()(const T& a, const T& b) const { return b || a; }
+  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
+    return b || a;
+  }
 };
 
 /**
@@ -191,7 +192,9 @@ template <typename T>
 struct LogicalAndFunctor {
   inline T initial() { return static_cast<T>(true); }
 
-  __device__ T operator()(const T& a, const T& b) const { return b && a; }
+  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
+    return b && a;
+  }
 };
 
 /**

From 50caa4830c00cf7519798615e9b5ce5f1bcdc323 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Wed, 5 Jan 2022 06:37:51 +0000
Subject: [PATCH 28/41] Remove stray blank line in elementwise_op.h

---
 paddle/fluid/operators/elementwise/elementwise_op.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index c455f53fc52a5..91867c890d2c1 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -124,7 +124,6 @@ class ElementwiseOp : public framework::OperatorWithKernel {
                                      framework::LibraryType::kMKLDNN);
     }
 #endif
-
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 

From 1970e17de3af60642852eb2fb7ccbac6b10ff141 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Wed, 5 Jan 2022 09:38:56 +0000
Subject: [PATCH 29/41] clean the code

---
 paddle/fluid/framework/tensor_impl.h | 8 ++++++--
 paddle/fluid/platform/enforce.h      | 3 +--
 paddle/pten/common/data_type.h       | 4 ----
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 08ade36afafce..a83b5d0662bb9 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -28,7 +28,9 @@ inline const T* Tensor::data() const {
   PADDLE_ENFORCE_EQ(
       valid, true,
       platform::errors::InvalidArgument(
-          "Tensor holds the wrong type, it holds, but desires to be."));
+          "Tensor holds the wrong type, it holds %s, but desires to be %s.",
+          DataTypeToString(type_),
+          DataTypeToString(DataTypeTrait<T>::DataType())));
 
   return reinterpret_cast<const T*>(
       reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
@@ -44,7 +46,9 @@ inline T* Tensor::data() {
   PADDLE_ENFORCE_EQ(
       valid, true,
       platform::errors::InvalidArgument(
-          "Tensor holds the wrong type, it holds, but desires to be"));
+          "Tensor holds the wrong type, it holds %s, but desires to be %s",
+          DataTypeToString(type_),
+          DataTypeToString(DataTypeTrait<T>::DataType())));
 
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                               offset_);
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index e50cf7ac566e0..30930897ea8ca 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -378,8 +378,7 @@ struct EnforceNotMet : public std::exception {
 
   EnforceNotMet(const ErrorSummary& error, const char* file, int line)
       : code_(error.code()),
-        err_str_(GetTraceBackString("lxd_debug: error here", file,
-                                    line)) {  // error.to_string()
+        err_str_(GetTraceBackString(error.to_string(), file, line)) {
     simple_err_str_ = SimplifyErrorTypeFormat(err_str_);
   }
 
diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h
index 252565385766d..a00d68c535415 100644
--- a/paddle/pten/common/data_type.h
+++ b/paddle/pten/common/data_type.h
@@ -18,10 +18,6 @@ limitations under the License. */
 #include "complex.h"   // NOLINT
 #include "float16.h"   // NOLINT
 
-// #include "paddle/fluid/platform/bfloat16.h"
-// #include "paddle/fluid/platform/complex.h"
-// #include "paddle/fluid/platform/enforce.h"
-// #include "paddle/fluid/platform/float16.h"
 #include "paddle/pten/api/ext/exception.h"
 
 namespace paddle {

From 51f32e16d6677c9673c08c850d5a4e48eae6e80c Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Wed, 5 Jan 2022 09:42:25 +0000
Subject: [PATCH 30/41] delete useless comment

---
 cmake/operators.cmake | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index ff57e0730a0bf..3342d8bc78802 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -343,25 +343,6 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
     endif()
 
-    # if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
-    #     file(READ ${ORIGINAL_TARGET}_xpu.cc TARGET_XPU_CONTENT)
-    #     # It is different from the logic above, becareful
-    #     string(REGEX MATCH "REGISTER_OP_XPU_KERNEL\\(.*" multi_xpu_register "${TARGET_XPU_CONTENT}")
-    #     # [ \t\r\n]* is used for blank characters
-    #     string(REGEX MATCH "REGISTER_OP_XPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_xpu_register "${multi_xpu_register}")
-
-    #     if (one_xpu_register STREQUAL "")
-    #         string(REPLACE "_op" "" XPU_TARGET "${TARGET}")
-    #     else ()
-    #         string(REPLACE "REGISTER_OP_XPU_KERNEL(" "" XPU_TARGET "${one_xpu_register}")
-    #         string(REPLACE "," "" XPU_TARGET "${XPU_TARGET}")
-    #         # [ \t\r\n]+ is used for blank characters.
-    #         # Here we use '+' instead of '*' since it is a REPLACE operation.
-    #         string(REGEX REPLACE "[ \t\r\n]+" "" XPU_TARGET "${XPU_TARGET}")
-    #     endif()
-    #     file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${XPU_TARGET}, XPU);\n")
-    # endif()
-
     if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
         file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT)
         # It is different from the logic above, becareful

From 5576ba9eff63bbd08fc121a95d930d53b465308d Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Thu, 6 Jan 2022 03:22:39 +0000
Subject: [PATCH 31/41] fix the bug in WITH_XPU

---
 ...e_add_op.xpu => elementwise_add_op_kps.cc} | 26 +++-----------
 .../fluid/platform/device/xpu/xpu_op_list.cc  | 34 ++++++++++---------
 .../fluid/platform/device/xpu/xpu_op_list.h   |  5 ++-
 3 files changed, 26 insertions(+), 39 deletions(-)
 rename paddle/fluid/operators/elementwise/{elementwise_add_op.xpu => elementwise_add_op_kps.cc} (88%)

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu b/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
similarity index 88%
rename from paddle/fluid/operators/elementwise/elementwise_add_op.xpu
rename to paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
index 43cfabe7ae96c..d7d0c3bc68ee7 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.xpu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
@@ -12,41 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
-#if defined(__CUDA_ARCH__)
-  #undef __CUDA_ARCH__
-#endif
-
-#if defined(__CUDACC__)
-  #undef __CUDACC__
-#endif
-
-#if defined(__CUDA__)
-  #undef __CUDA__
-#endif
-
-#if defined(__NVCC__)
-  #undef __NVCC__
-#endif
-
-// Note(liuxiandong)
-#ifdef __xpu_on_host__
-#include <xpu/runtime.h>
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include <xpu/runtime.h>
 #include <memory>
 #include <string>
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
 
-
 namespace paddle {
 namespace operators {
 
 template <typename T>
 class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     std::cout << "lxd_debug: elementwise_add forward" << std::endl;
+    XPUElementwise<T, XPUType>(ctx, xpu::broadcast_add<XPUType>);
   }
 };
 
@@ -72,4 +55,3 @@ REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace,
 REGISTER_OP_KERNEL(elementwise_add_grad, KP, plat::XPUPlace,
                    ops::ElementwiseAddGradXPUKernel<float>,
                    ops::ElementwiseAddGradXPUKernel<paddle::platform::float16>);
-#endif
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
index 78c19d518c304..fb26c96f4de1c 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
@@ -46,22 +46,6 @@ bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) {
   return false;
 }
 
-bool is_xpu_kp_support_op(const std::string& op_name,
-                          const pOpKernelType& type) {
-  auto& ops = get_kl1_ops();
-  auto v =
-      get_xpu_version(BOOST_GET_CONST(platform::XPUPlace, type.place_).device);
-  if (v == XPU2) {
-    ops = get_kp_ops();
-  }
-
-  if (ops.find(op_name) != ops.end() &&
-      ops[op_name].find(type) != ops[op_name].end()) {
-    return true;
-  }
-  return false;
-}
-
 // ops_string contains op_list(e.g., 'mul,mul_grad'), parse the op string and
 // insert op to op set
 static void tokenize(const std::string& ops, char delim,
@@ -101,6 +85,23 @@ bool is_in_xpu_black_list(const std::string& op_name) {
   return false;
 }
 
+#ifdef PADDLE_WITH_XPU_KP
+bool is_xpu_kp_support_op(const std::string& op_name,
+                          const pOpKernelType& type) {
+  auto& ops = get_kl1_ops();
+  auto v =
+      get_xpu_version(BOOST_GET_CONST(platform::XPUPlace, type.place_).device);
+  if (v == XPU2) {
+    ops = get_kp_ops();
+  }
+
+  if (ops.find(op_name) != ops.end() &&
+      ops[op_name].find(type) != ops[op_name].end()) {
+    return true;
+  }
+  return false;
+}
+
 bool is_in_xpu_kpwhite_list(const std::string& op_name) {
   static bool inited = false;
   static std::unordered_set<std::string> xpu_kpwhite_list;
@@ -125,6 +126,7 @@ bool is_in_xpu_kpwhite_list(const std::string& op_name) {
   }
   return false;
 }
+#endif
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h
index d532b610e8f9f..d0c1458b2d13b 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h
@@ -21,10 +21,13 @@ namespace platform {
 using pOpKernelType = paddle::framework::OpKernelType;
 
 bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type);
+bool is_in_xpu_black_list(const std::string& op_name);
+
+#ifdef PADDLE_WITH_XPU_KP
 bool is_xpu_kp_support_op(const std::string& op_name,
                           const pOpKernelType& type);
-bool is_in_xpu_black_list(const std::string& op_name);
 bool is_in_xpu_kpwhite_list(const std::string& op_name);
+#endif
 
 }  // namespace platform
 }  // namespace paddle

From cec1cb08752faa859b8c07a945f49dc348a10b2f Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Mon, 10 Jan 2022 12:49:41 +0000
Subject: [PATCH 32/41] Remove leftover lxd_debug logging from XPU KP kernel dispatch

---
 paddle/fluid/framework/operator.cc            |  6 ------
 paddle/fluid/imperative/prepared_operator.cc  | 19 ++-----------------
 .../fluid/platform/device/xpu/xpu_op_list.cc  |  9 ---------
 3 files changed, 2 insertions(+), 32 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b822a3b96d1d2..707aec3acb8f5 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -53,7 +53,6 @@ class LoDTensor;
 DECLARE_bool(benchmark);
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(enable_unused_var_check);
-DECLARE_bool(use_kp);
 PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0,
                              "number of threads for inner op");
 DECLARE_bool(run_pten_kernel);
@@ -1323,11 +1322,6 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
 
   auto kernel_iter = kernels.find(expected_kernel_key);
 
-  // added by liuxiandong for test
-  if (FLAGS_run_kp_kernel) {
-    std::cout << "****** FLAGS_use_kp  operator.cc *******";
-  }
-
 #ifdef PADDLE_WITH_MKLDNN
   // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
   if (kernel_iter == kernels.end() &&
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 1d5917ee6a467..47bcaf0735998 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -160,8 +160,6 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
   auto dygraph_exe_ctx = DygraphExecutionContext<VarType>(
       op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs);
   auto expected_kernel_key = op.GetExpectedKernelType(dygraph_exe_ctx);
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
-  VLOG(3) << "expected_kernel_key.place_:" << expected_kernel_key.place_;
 
   if (FLAGS_run_pten_kernel &&
       pten::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) {
@@ -209,19 +207,6 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
             << ", fallbacking to CPU one!";
     expected_kernel_key.place_ = platform::CPUPlace();
     kernel_iter = kernels.find(expected_kernel_key);
-
-    if (!is_xpu_place(expected_kernel_key.place_)) {
-      VLOG(3) << "lxd_debug: is not xpu_place";
-    }
-    if (kernel_iter == kernels.end()) {
-      VLOG(3) << "lxd_debug: can't find KP kernel";
-    }
-    if (!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key)) {
-      VLOG(3) << "lxd_debug: is not support current op";
-    }
-    if (paddle::platform::is_in_xpu_black_list(op.Type())) {
-      VLOG(3) << "lxd_debug: is in black list";
-    }
   }
 
 #endif
@@ -233,10 +218,10 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
   bool use_xpu_kp_kernel_debug =
       paddle::platform::is_in_xpu_kpwhite_list(op.Type());
   if (use_xpu_kp_kernel_rt) {
-    VLOG(3) << "lxd_debug: using rt mode ";
+    VLOG(3) << "xpu_kp using rt mode ";
   }
   if (use_xpu_kp_kernel_debug) {
-    VLOG(3) << "lxd_debug: using debug mode ";
+    VLOG(3) << "xpu_kp using debug mode ";
   }
   if (is_xpu_place(expected_kernel_key.place_) &&
       (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
index fb26c96f4de1c..0e4eec89945e2 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
@@ -28,15 +28,6 @@ bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) {
       get_xpu_version(BOOST_GET_CONST(platform::XPUPlace, type.place_).device);
   if (v == XPU2) {
     ops = get_kl2_ops();
-    VLOG(3) << "lxd_debug: op " << op_name << " get_kl2_ops";
-  }
-
-  if (ops.find(op_name) == ops.end()) {
-    VLOG(3) << "lxd_debug: can't find " << op_name << " in related list";
-  }
-  if (ops[op_name].find(type) == ops[op_name].end()) {
-    VLOG(3) << "lxd_debug: can't find type " << op_name
-            << " cause have no library_type";
   }
 
   if (ops.find(op_name) != ops.end() &&

From 4181edb17af8fd51dccbd5c9fedd5eb2646a84ec Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Thu, 13 Jan 2022 06:20:22 +0000
Subject: [PATCH 33/41] Add license header to xpu_kp.cmake and delete elementwise_add_op_kps.cc

---
 cmake/xpu_kp.cmake                            | 14 +++++
 .../elementwise/elementwise_add_op_kps.cc     | 57 -------------------
 2 files changed, 14 insertions(+), 57 deletions(-)
 delete mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc

diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake
index 27d67f65931b7..93ce771051020 100644
--- a/cmake/xpu_kp.cmake
+++ b/cmake/xpu_kp.cmake
@@ -1,3 +1,17 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 if(NOT WITH_XPU_KP)
     return()
 endif()
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
deleted file mode 100644
index d7d0c3bc68ee7..0000000000000
--- a/paddle/fluid/operators/elementwise/elementwise_add_op_kps.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
-#include <xpu/runtime.h>
-#include <memory>
-#include <string>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
-  using XPUType = typename XPUTypeTrait<T>::Type;
-
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    std::cout << "lxd_debug: elementwise_add forward" << std::endl;
-    XPUElementwise<T, XPUType>(ctx, xpu::broadcast_add<XPUType>);
-  }
-};
-
-template <typename T>
-class ElementwiseAddGradXPUKernel
-    : public ::paddle::operators::ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    std::cout << "lxd_debug: elementwise_add backward" << std::endl;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace,
-                   ops::ElementwiseAddXPUKernel<float>,
-                   ops::ElementwiseAddXPUKernel<paddle::platform::float16>);
-
-REGISTER_OP_KERNEL(elementwise_add_grad, KP, plat::XPUPlace,
-                   ops::ElementwiseAddGradXPUKernel<float>,
-                   ops::ElementwiseAddGradXPUKernel<paddle::platform::float16>);

From 03896bc62e45a6e90836eeb67221a31c08c44426 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Tue, 25 Jan 2022 03:04:01 +0000
Subject: [PATCH 34/41] modify the abi

---
 cmake/xpu_kp.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake
index 93ce771051020..0039c85bd4145 100644
--- a/cmake/xpu_kp.cmake
+++ b/cmake/xpu_kp.cmake
@@ -111,7 +111,7 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
       ${CMAKE_COMMAND} -E make_directory kernel_build
     COMMAND
-    ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
+    ${XPU_CLANG} --sysroot=${CXX_DIR}  -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
        -I.  -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu
         --xpu-device-only -c -v 
     COMMAND
@@ -132,7 +132,7 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
       ${CMAKE_COMMAND} -E make_directory kernel_build
     COMMAND
-    ${XPU_CLANG} --sysroot=${CXX_DIR}  -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
+    ${XPU_CLANG} --sysroot=${CXX_DIR}  -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
         -I.  -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu
         --xpu-host-only -c -v 
     WORKING_DIRECTORY

From 0c4d0972010487b1278742a20deda02f482ff9b4 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Tue, 25 Jan 2022 06:12:02 +0000
Subject: [PATCH 35/41] delete some useless code

---
 paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h | 9 +--------
 paddle/fluid/platform/device/xpu/xpu_op_list.h         | 4 +---
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
index 63f8814d13bd9..0cb0bd1fc03d4 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
@@ -27,15 +27,8 @@ using XPUKernelSet =
 using XPUOpMap = std::unordered_map<std::string, XPUKernelSet>;
 
 XPUOpMap& get_kp_ops() {
-  // KL1支持的op，通过op_name, data_type, place来索引
   static XPUOpMap s_xpu_kp_kernels{
-      {"elementwise_add_grad",
-       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
-                     pOpKernelType(vartype::FP16, XPUPlace())})},
-      {"elementwise_add",
-       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
-                     pOpKernelType(vartype::FP16, XPUPlace())})},
-      // AddMore
+      // TODO(Liu-xiandong)
   };
 
   return s_xpu_kp_kernels;
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h
index 6ecec4c53a804..a51dfac1892f3 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h
@@ -32,9 +32,7 @@ bool is_xpu_kp_support_op(const std::string& op_name,
                           const pOpKernelType& type);
 bool is_in_xpu_kpwhite_list(const std::string& op_name);
 #endif
-std::vector<vartype::Type> get_xpu_op_support_type(const std::string& op_name,
-                                                   XPUVersion version);
-XPUOpListMap get_xpu_op_list(XPUVersion version);
+
 std::vector<vartype::Type> get_xpu_op_support_type(
     const std::string& op_name, pten::backends::xpu::XPUVersion version);
 XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version);

From ebe3313a94b9efe7bd1f82471aff14c26ec11862 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Thu, 27 Jan 2022 09:10:29 +0000
Subject: [PATCH 36/41] Derive XPU kernel include paths and defines from CMake directory properties

---
 cmake/xpu_kp.cmake                            | 32 ++++++++++++++-----
 paddle/fluid/framework/operator.cc            |  2 +-
 paddle/fluid/imperative/prepared_operator.cc  |  2 +-
 .../fluid/platform/device/xpu/xpu_op_list.cc  |  5 ++-
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake
index 0039c85bd4145..a32f547d5da5b 100644
--- a/cmake/xpu_kp.cmake
+++ b/cmake/xpu_kp.cmake
@@ -27,8 +27,9 @@ message(STATUS "Build with XPU_TOOLCHAIN=" ${XPU_TOOLCHAIN})
 set(XPU_CLANG ${XPU_TOOLCHAIN}/bin/clang++)
 message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG})
 
+# The host sysroot of XPU compiler is gcc-8.2 
 if(NOT HOST_SYSROOT)
-  set(HOST_SYSROOT /opt/compiler/gcc-8.2)
+  set(HOST_SYSROOT /opt/compiler/gcc-8.2) #/opt/compiler/gcc-8.2
 endif()
 
 if(NOT IS_DIRECTORY ${HOST_SYSROOT})
@@ -57,7 +58,7 @@ set(TOOLCHAIN_ARGS )
 if(OPT_LEVEL)
   set(OPT_LEVEL ${OPT_LEVEL})
 else()
-  set(OPT_LEVEL "-O2")
+  set(OPT_LEVEL "-O3")
 endif()
 
 message(STATUS "Build with API_ARCH=" ${API_ARCH})
@@ -101,9 +102,25 @@ macro(compile_kernel COMPILE_ARGS)
 
   set(XTDK_DIR ${XPU_TOOLCHAIN})
   set(CXX_DIR ${HOST_SYSROOT})
-  set(XPU_CXX_INCLUDES  -I${CMAKE_SOURCE_DIR}/build -I${CMAKE_SOURCE_DIR}/paddle/fluid/framework/io -I${CMAKE_SOURCE_DIR}/build/third_party/install/zlib/include -I${CMAKE_SOURCE_DIR}/build/third_party/install -I${CMAKE_SOURCE_DIR}/build/third_party/install/gflags/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/glog/include -I${CMAKE_SOURCE_DIR}/build/third_party/boost/src/extern_boost -I${CMAKE_SOURCE_DIR}/build/third_party/eigen3/src/extern_eigen3 -I${CMAKE_SOURCE_DIR}/build/third_party/threadpool/src/extern_threadpool -I${CMAKE_SOURCE_DIR}/build/third_party/dlpack/src/extern_dlpack/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xxhash/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/warpctc/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/utf8proc/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/openblas/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/protobuf/include -I/usr/include/python3.7m -I/usr/local/lib/python3.7/dist-packages/numpy/core/include -I${CMAKE_SOURCE_DIR}/build/third_party/pybind/src/extern_pybind/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/gtest/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xpu/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/gloo/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xbyak/include -I${CMAKE_SOURCE_DIR}/build/third_party/install/xbyak/include/xbyak -I${CMAKE_SOURCE_DIR}/build/third_party/install/cryptopp/include -I${CMAKE_SOURCE_DIR}/build/third_party/pocketfft/src -I${CMAKE_SOURCE_DIR} -I${CMAKE_SOURCE_DIR}/paddle/fluid/platform)
-  set(XPU_CXX_FLAGS  -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer  -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -O3 -DNDEBUG )
-  set(XPU_CXX_DEFINES -DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_DLL_EXPORT -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_VERSION_INTEGER=0 -DPADDLE_WITH_AVX -DPADDLE_WITH_CRYPTO -DPADDLE_WITH_POCKETFFT -DPADDLE_WITH_SSE3 -DPADDLE_WITH_TESTING -DPADDLE_WITH_XBYAK -DPADDLE_WITH_XPU_KP -DPADDLE_WITH_XPU -DXBYAK64 -DXBYAK_NO_OP_NAMES)
+  set(XPU_CXX_FLAGS  -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer  -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG )
+
+  #include path
+  get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
+  set(XPU_CXX_INCLUDES "")
+  foreach(dir IN LISTS dirs)
+    list(APPEND XPU_CXX_INCLUDES "-I${dir}")
+  endforeach()
+  string(REPLACE ";" " " XPU_CXX_INCLUDES "${XPU_CXX_INCLUDES}" )
+  separate_arguments(XPU_CXX_INCLUDES UNIX_COMMAND "${XPU_CXX_INCLUDES}")
+
+  #related flags
+  get_directory_property( DirDefs DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS )
+  set(XPU_CXX_DEFINES "")
+  foreach(def IN LISTS DirDefs)
+    list(APPEND XPU_CXX_DEFINES "-D${def}")
+  endforeach()
+  string(REPLACE ";" " " XPU_CXX_DEFINES "${XPU_CXX_DEFINES}" )
+  separate_arguments(XPU_CXX_DEFINES UNIX_COMMAND "${XPU_CXX_DEFINES}")
 
   add_custom_command(
     OUTPUT
@@ -111,7 +128,7 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
       ${CMAKE_COMMAND} -E make_directory kernel_build
     COMMAND
-    ${XPU_CLANG} --sysroot=${CXX_DIR}  -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
+    ${XPU_CLANG} --sysroot=${CXX_DIR}  -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
        -I.  -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu
         --xpu-device-only -c -v 
     COMMAND
@@ -132,7 +149,7 @@ macro(compile_kernel COMPILE_ARGS)
     COMMAND
       ${CMAKE_COMMAND} -E make_directory kernel_build
     COMMAND
-    ${XPU_CLANG} --sysroot=${CXX_DIR}  -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 -O2 -fno-builtin -g -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
+    ${XPU_CLANG} --sysroot=${CXX_DIR}  -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
         -I.  -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu
         --xpu-host-only -c -v 
     WORKING_DIRECTORY
@@ -181,7 +198,6 @@ macro(xpu_add_library TARGET_NAME)
 
     if(${xpu_kernel_lists_num})
         foreach(xpu_kernel IN LISTS xpu_kernel_lists)
-            message(STATUS "Process ${xpu_kernel}")
             get_filename_component(kernel_name ${xpu_kernel} NAME_WE)
             get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY)
             set(kernel_rules ${kernel_dir}/${kernel_name}.rules)
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index fb389029357ab..e7a22d561f41f 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1416,7 +1416,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
       paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key);
   bool use_xpu_kp_kernel_debug =
       paddle::platform::is_in_xpu_kpwhite_list(type_);
-  if (is_xpu_place(expected_kernel_key.place_) &&
+  if (platform::is_xpu_place(expected_kernel_key.place_) &&
       (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
     expected_kernel_key.library_type_ = LibraryType::kKP;
     kernel_iter = kernels.find(expected_kernel_key);
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index be15ab0150603..8eca35c5cec02 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -243,7 +243,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
   if (use_xpu_kp_kernel_debug) {
     VLOG(3) << "xpu_kp using debug mode ";
   }
-  if (is_xpu_place(expected_kernel_key.place_) &&
+  if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
       (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
     expected_kernel_key.place_ = platform::XPUPlace();
     expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP;
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
index 072884c32ad24..88d803bdf1873 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
@@ -79,9 +79,8 @@ bool is_in_xpu_black_list(const std::string& op_name) {
 bool is_xpu_kp_support_op(const std::string& op_name,
                           const pOpKernelType& type) {
   auto& ops = get_kl1_ops();
-  auto v =
-      get_xpu_version(BOOST_GET_CONST(platform::XPUPlace, type.place_).device);
-  if (v == XPU2) {
+  auto v = get_xpu_version(type.place_.device);
+  if (v == pten::backends::xpu::XPUVersion::XPU2) {
     ops = get_kp_ops();
   }
 

From 16914fa299c8e1d85eaf4a64ffc53c341043b851 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Thu, 27 Jan 2022 09:11:13 +0000
Subject: [PATCH 37/41] Remove leftover trailing comment on the HOST_SYSROOT default in xpu_kp.cmake

---
 cmake/xpu_kp.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake
index a32f547d5da5b..f8ab9693db0c9 100644
--- a/cmake/xpu_kp.cmake
+++ b/cmake/xpu_kp.cmake
@@ -29,7 +29,7 @@ message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG})
 
 # The host sysroot of XPU compiler is gcc-8.2 
 if(NOT HOST_SYSROOT)
-  set(HOST_SYSROOT /opt/compiler/gcc-8.2) #/opt/compiler/gcc-8.2
+  set(HOST_SYSROOT /opt/compiler/gcc-8.2)
 endif()
 
 if(NOT IS_DIRECTORY ${HOST_SYSROOT})

From f1bb460e4873eb92ac6c56f47b0fbca1267a2601 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Thu, 27 Jan 2022 09:17:51 +0000
Subject: [PATCH 38/41] Stop collecting *_op_kps.cc sources in op_library

---
 cmake/operators.cmake | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 198c5f5c3b4c0..e58dbf77b4c9c 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -125,10 +125,6 @@ function(op_library TARGET)
             if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu)
                 list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu)
             endif()
-            string(REPLACE "_op" "_op_kps" XPU_KP_FILE "${TARGET}")
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_KP_FILE}.cc)
-                list(APPEND xpu_kp_cc_srcs ${XPU_KP_FILE}.cc)
-            endif()
         endif()
         if(WITH_ASCEND_CL)
             string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
@@ -166,8 +162,6 @@ function(op_library TARGET)
                 list(APPEND xpu_cc_srcs ${src})
             elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$")
                 list(APPEND xpu_kp_cc_srcs ${src})
-            elseif(WITH_XPU_KP AND ${src} MATCHES ".*_op_kps.cc$")
-                list(APPEND xpu_kp_cc_srcs ${src})
             elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
                 list(APPEND npu_cc_srcs ${src})
             elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")

From 720af784fc7f2de3ad86f1ac334a594f3f649b9a Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Fri, 28 Jan 2022 02:09:22 +0000
Subject: [PATCH 39/41] delete useless comment

---
 paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
index 0cb0bd1fc03d4..aa020593454f8 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
@@ -27,9 +27,7 @@ using XPUKernelSet =
 using XPUOpMap = std::unordered_map<std::string, XPUKernelSet>;
 
 XPUOpMap& get_kp_ops() {
-  static XPUOpMap s_xpu_kp_kernels{
-      // TODO(Liu-xiandong)
-  };
+  static XPUOpMap s_xpu_kp_kernels{};
 
   return s_xpu_kp_kernels;
 }

From b55d391dd422d0806d1bff5464bfeb5cfcabb8db Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Fri, 28 Jan 2022 06:46:10 +0000
Subject: [PATCH 40/41] Align WITH_XPU_KP option text and force-on messages in CMakeLists.txt

---
 CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0312c348a5256..0c8b8f1ce28b8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,7 +43,7 @@ option(WITH_ONEMKL      "Compile PaddlePaddle with oneMKL"              OFF)
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_TENSORRT    "Compile PaddlePaddle with NVIDIA TensorRT"     OFF)
 option(WITH_XPU         "Compile PaddlePaddle with BAIDU KUNLUN XPU"    OFF)
-option(WITH_XPU_KP         "Compile PaddlePaddle with BAIDU XPU compiler "    OFF)
+option(WITH_XPU_KP      "Compile PaddlePaddle with BAIDU XPU compiler " OFF)
 option(WITH_MLU    "Compile PaddlePaddle with CAMBRICON MLU"     OFF)
 option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode"    OFF)
 option(WITH_ASCEND         "Compile PaddlePaddle with ASCEND"        OFF)
@@ -277,12 +277,12 @@ if (NOT WITH_GPU AND WITH_NCCL)
         "Disable NCCL when compiling without GPU" FORCE)
 endif()
 
-# force XPU on when WITH_XPU_KP
+# force WITH_XPU on when WITH_XPU_KP
 if (WITH_XPU_KP AND NOT WITH_XPU)
     MESSAGE(WARNING
-        "Enable XPU when compiling with XPU2. Force WITH_XPU=ON.")
+        "Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.")
     set(WITH_XPU ON CACHE STRING
-        "Enable XPU when compiling with XPU2" FORCE)
+        "Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE)
 endif()
 
 if (NOT WITH_XPU AND WITH_XPU_BKCL)

From 2541c1ba531721a316aaf1854cd5491e983a7c94 Mon Sep 17 00:00:00 2001
From: Liu-xiandong <liuxiandong@ncic.ac.cn>
Date: Fri, 28 Jan 2022 06:48:29 +0000
Subject: [PATCH 41/41] Fix spacing in the WITH_GPU / WITH_XPU_KP conflict check

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0c8b8f1ce28b8..549ed9d8543c2 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,7 +60,7 @@ include(generic)            # simplify cmake module
 if (WITH_GPU  AND WITH_XPU)
     message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
 endif()
-if (WITH_GPU  AND WITH_XPU_KP)
+if (WITH_GPU AND WITH_XPU_KP)
     message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time")
 endif()
 if (WITH_GPU AND WITH_ASCEND)