diff --git a/CMakeLists.txt b/CMakeLists.txt index 047db58cfdfd4..549ed9d8543c2 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) +option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF) option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) @@ -59,6 +60,9 @@ include(generic) # simplify cmake module if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() +if (WITH_GPU AND WITH_XPU_KP) + message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time") +endif() if (WITH_GPU AND WITH_ASCEND) message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") endif() @@ -273,6 +277,14 @@ if (NOT WITH_GPU AND WITH_NCCL) "Disable NCCL when compiling without GPU" FORCE) endif() +# force WITH_XPU on when WITH_XPU_KP +if (WITH_XPU_KP AND NOT WITH_XPU) + MESSAGE(WARNING + "Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.") + set(WITH_XPU ON CACHE STRING + "Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE) +endif() + if (NOT WITH_XPU AND WITH_XPU_BKCL) MESSAGE(WARNING "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.") @@ -317,6 +329,10 @@ if(WITH_ROCM) include(miopen) # set miopen libraries, must before configure endif(WITH_ROCM) +if(WITH_XPU_KP) + include(xpu_kp) +endif() + if (NOT WITH_ROCM AND WITH_RCCL) MESSAGE(WARNING "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 88e8dde8addbc..9ebde06bd01ab 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -99,6 +99,11 @@ if(WITH_XPU) add_definitions(-DPADDLE_WITH_XPU) endif() +if(WITH_XPU_KP) + message(STATUS "Compile with XPU_KP!") + add_definitions(-DPADDLE_WITH_XPU_KP) +endif() + if(WITH_IPU) message(STATUS "Compile with IPU!") add_definitions(-DPADDLE_WITH_IPU) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 2004abcbfa1f2..6655963e728f1 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -654,6 +654,81 @@ function(hip_test TARGET_NAME) endif() endfunction(hip_test) +function(xpu_library TARGET_NAME) + if (WITH_XPU_KP) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(xpu_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(xpu_library_SRCS) + if (xpu_library_SHARED OR xpu_library_shared) # build *.so + message(FATAL_ERROR "XPU kernel currently does not support dynamic links") + else() + xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS}) + find_fluid_modules(${TARGET_NAME}) + endif() + if (xpu_library_DEPS) + add_dependencies(${TARGET_NAME} ${xpu_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${xpu_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND xpu_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else(xpu_library_SRCS) + if (xpu_library_DEPS) + list(REMOVE_DUPLICATES xpu_library_DEPS) + generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:xpu_library") + target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS}) + add_dependencies(${TARGET_NAME} ${xpu_library_DEPS}) + else() + message(FATAL "Please specify source file or library in xpu_library.") + endif() + endif(xpu_library_SRCS) + endif() +endfunction(xpu_library) + +function(xpu_binary TARGET_NAME) + if (WITH_XPU_KP) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(xpu_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${xpu_binary_SRCS}) + if(xpu_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${xpu_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${xpu_binary_DEPS}) + common_link(${TARGET_NAME}) + endif() + endif() +endfunction(xpu_binary) + +function(xpu_test TARGET_NAME) + # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation + # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files + # other than *.py are modified. + if (WITH_XPU_KP AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(xpu_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${xpu_test_SRCS}) + # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE + target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules}) + add_dependencies(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + common_link(${TARGET_NAME}) + add_test(${TARGET_NAME} ${TARGET_NAME}) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) + endif() +endfunction(xpu_test) + function(go_library TARGET_NAME) set(options STATIC static SHARED shared) set(oneValueArgs "") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index d7742c3473724..e58dbf77b4c9c 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -34,6 +34,7 @@ function(op_library TARGET) set(cu_cc_srcs) set(hip_cc_srcs) set(xpu_cc_srcs) + set(xpu_kp_cc_srcs) set(npu_cc_srcs) set(mlu_cc_srcs) set(cudnn_cu_cc_srcs) @@ -120,6 +121,11 @@ function(op_library TARGET) list(APPEND xpu_cc_srcs ${XPU_FILE}.cc) endif() endif() + if(WITH_XPU_KP) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu) + list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu) + endif() + endif() if(WITH_ASCEND_CL) string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc) @@ -154,6 +160,8 @@ function(op_library TARGET) list(APPEND mkldnn_cc_srcs ${src}) elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$") list(APPEND xpu_cc_srcs ${src}) + elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$") + list(APPEND xpu_kp_cc_srcs ${src}) elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$") list(APPEND npu_cc_srcs ${src}) elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$") @@ -161,11 +169,13 @@ function(op_library TARGET) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) else() - message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu or .xpu") endif() endforeach() endif() - + + list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) + list(LENGTH xpu_kp_cc_srcs xpu_kp_cc_srcs_len) list(LENGTH cc_srcs cc_srcs_len) if (${cc_srcs_len} EQUAL 0) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") @@ -231,6 +241,8 @@ function(op_library TARGET) list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) + elseif (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) + xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) @@ -359,6 +371,11 @@ function(op_library TARGET) endif() endif() + # pybind USE_OP_DEVICE_KERNEL for XPU KP + if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, KP);\n") + endif() + # pybind USE_OP_DEVICE_KERNEL for NPU if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0) foreach(npu_src ${npu_cc_srcs}) @@ -438,7 +455,6 @@ function(op_library TARGET) endif() endfunction() - function(register_operators) set(options "") set(oneValueArgs "") diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake new file mode 100644 index 0000000000000..f8ab9693db0c9 --- /dev/null +++ b/cmake/xpu_kp.cmake @@ -0,0 +1,239 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_XPU_KP) + return() +endif() + +if(NOT XPU_TOOLCHAIN) + set(XPU_TOOLCHAIN /workspace/paddle/xpu-demo/XTDK) + get_filename_component(XPU_TOOLCHAIN ${XPU_TOOLCHAIN} REALPATH) +endif() +if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN}) + message(FATAL_ERROR "Directory ${XPU_TOOLCHAIN} not found!") +endif() +message(STATUS "Build with XPU_TOOLCHAIN=" ${XPU_TOOLCHAIN}) +set(XPU_CLANG ${XPU_TOOLCHAIN}/bin/clang++) +message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG}) + +# The host sysroot of XPU compiler is gcc-8.2 +if(NOT HOST_SYSROOT) + set(HOST_SYSROOT /opt/compiler/gcc-8.2) +endif() + +if(NOT IS_DIRECTORY ${HOST_SYSROOT}) + message(FATAL_ERROR "Directory ${HOST_SYSROOT} not found!") +endif() + +if(NOT API_ARCH) + set(API_ARCH x86_64-baidu-linux-gnu) +endif() + +if(API_ARCH MATCHES "x86_64") +if(EXISTS ${HOST_SYSROOT}/bin/g++) + set(HOST_CXX ${HOST_SYSROOT}/bin/g++) + set(HOST_AR ${HOST_SYSROOT}/bin/ar) +else() + set(HOST_CXX /usr/bin/g++) + set(HOST_AR /usr/bin/ar) +endif() +else() + set(HOST_CXX ${CMAKE_CXX_COMPILER}) + set(HOST_AR ${CMAKE_AR}) +endif() + +set(TOOLCHAIN_ARGS ) + +if(OPT_LEVEL) + set(OPT_LEVEL ${OPT_LEVEL}) +else() + set(OPT_LEVEL "-O3") +endif() + +message(STATUS "Build with API_ARCH=" ${API_ARCH}) +message(STATUS "Build with TOOLCHAIN_ARGS=" ${TOOLCHAIN_ARGS}) +message(STATUS "Build with HOST_SYSROOT=" ${HOST_SYSROOT}) +message(STATUS "Build with HOST_CXX=" ${HOST_CXX}) +message(STATUS "Build with HOST_AR=" ${HOST_AR}) + +macro(compile_kernel COMPILE_ARGS) + set(options "") + set(oneValueArgs "") + set(multiValueArgs KERNEL DIRPATH XNAME DEVICE HOST XPU DEPENDS) + cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(kernel_path ${xpu_add_library_DIRPATH}) + set(kernel_name ${xpu_add_library_XNAME}) + set(device_o_extra_flags ${xpu_add_library_DEVICE}) + set(host_o_extra_flags ${xpu_add_library_HOST}) + set(xpu_1_or_2 ${xpu_add_library_XPU}) + set(cc_depends ${xpu_add_library_DEPENDS}) + + set(kernel_target ${kernel_name}_kernel) + add_custom_target(${kernel_target} + WORKING_DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS + kernel_build/${kernel_name}.host.o + kernel_build/${kernel_name}.bin.o + COMMENT + ${kernel_target} + VERBATIM + ) + + if(cc_depends) + add_dependencies(${kernel_target} ${xpu_add_library_DEPENDS}) + endif() + + set(arg_device_o_extra_flags ${device_o_extra_flags}) + separate_arguments(arg_device_o_extra_flags) + set(arg_host_o_extra_flags ${host_o_extra_flags}) + separate_arguments(arg_host_o_extra_flags) + + set(XTDK_DIR ${XPU_TOOLCHAIN}) + set(CXX_DIR ${HOST_SYSROOT}) + set(XPU_CXX_FLAGS -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG ) + + #include path + get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) + set(XPU_CXX_INCLUDES "") + foreach(dir IN LISTS dirs) + list(APPEND XPU_CXX_INCLUDES "-I${dir}") + endforeach() + string(REPLACE ";" " " XPU_CXX_INCLUDES "${XPU_CXX_INCLUDES}" ) + separate_arguments(XPU_CXX_INCLUDES UNIX_COMMAND "${XPU_CXX_INCLUDES}") + + #related flags + get_directory_property( DirDefs DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS ) + set(XPU_CXX_DEFINES "") + foreach(def IN LISTS DirDefs) + list(APPEND XPU_CXX_DEFINES "-D${def}") + endforeach() + string(REPLACE ";" " " XPU_CXX_DEFINES "${XPU_CXX_DEFINES}" ) + separate_arguments(XPU_CXX_DEFINES UNIX_COMMAND "${XPU_CXX_DEFINES}") + + add_custom_command( + OUTPUT + kernel_build/${kernel_name}.bin.o + COMMAND + ${CMAKE_COMMAND} -E make_directory kernel_build + COMMAND + ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} + -I. -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu + --xpu-device-only -c -v + COMMAND + ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR} + WORKING_DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS + ${xpu_add_library_DEPENDS} + COMMENT + kernel_build/${kernel_name}.bin.o + VERBATIM + ) + list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o) + + add_custom_command( + OUTPUT + kernel_build/${kernel_name}.host.o + COMMAND + ${CMAKE_COMMAND} -E make_directory kernel_build + COMMAND + ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} + -I. -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu + --xpu-host-only -c -v + WORKING_DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS + ${xpu_add_library_DEPENDS} + COMMENT + kernel_build/${kernel_name}.host.o + VERBATIM + ) + list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o) +endmacro() + +############################################################################### +# XPU_ADD_LIBRARY +############################################################################### +macro(xpu_add_library TARGET_NAME) + # Separate the sources from the options + set(options "") + set(oneValueArgs "") + set(multiValueArgs STATIC DEPENDS) + cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(xpu_srcs ${xpu_add_library_STATIC}) + set(xpu_target ${TARGET_NAME}) + set(cc_srcs_depends ${xpu_add_library_DEPENDS}) + + file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs}) + list(LENGTH xpu_srcs_lists xpu_srcs_lists_num) + + set(XPU1_DEVICE_O_EXTRA_FLAGS " ") + set(XPU1_HOST_O_EXTRA_FLAGS " ") + + # Distinguish .xpu file from other files + foreach(cur_xpu_src IN LISTS xpu_srcs_lists) + get_filename_component(language_type_name ${cur_xpu_src} EXT) + if(${language_type_name} STREQUAL ".xpu") + list(APPEND xpu_kernel_lists ${cur_xpu_src}) + else() + list(APPEND cc_kernel_lists ${cur_xpu_src}) + endif() + endforeach() + + # Ensure that there is only one xpu kernel + list(LENGTH xpu_kernel_lists xpu_kernel_lists_num) + list(LENGTH cc_srcs_depends cc_srcs_depends_num) + + if(${xpu_kernel_lists_num}) + foreach(xpu_kernel IN LISTS xpu_kernel_lists) + get_filename_component(kernel_name ${xpu_kernel} NAME_WE) + get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY) + set(kernel_rules ${kernel_dir}/${kernel_name}.rules) + set(kernel_name ${kernel_name}) + compile_kernel( KERNEL ${xpu_kernel} DIRPATH ${kernel_dir} XNAME ${kernel_name} DEVICE ${XPU1_DEVICE_O_EXTRA_FLAGS} HOST ${XPU1_HOST_O_EXTRA_FLAGS} XPU "xpu2" DEPENDS ${cc_srcs_depends}) + endforeach() + + add_custom_target(${xpu_target}_src ALL + WORKING_DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS + ${xpu_kernel_depends} + ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a + COMMENT + ${xpu_target}_src + VERBATIM + ) + + add_custom_command( + OUTPUT + ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a + COMMAND + ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends} + WORKING_DIRECTORY + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS + ${xpu_kernel_depends} + COMMENT + ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a + VERBATIM + ) + + add_library(${xpu_target} STATIC ${cc_kernel_lists}) + add_dependencies(${xpu_target} ${xpu_target}_src) + target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a) + else() + add_library(${xpu_target} STATIC ${cc_kernel_lists}) + endif() +endmacro() diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index 6fdd128b0d3bf..0cb6cac26a61c 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -26,6 +26,7 @@ enum class LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2, + kKP = 3, }; inline std::string LibraryTypeToString(const LibraryType& library_type) { @@ -36,10 +37,12 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) { return "MKLDNN"; case LibraryType::kCUDNN: return "CUDNN"; + case LibraryType::kKP: + return "KP"; default: PADDLE_THROW(platform::errors::Unimplemented( "Unknown LibraryType code (%d), only supports library type include " - "PLAIN(0), MKLDNN(1), CUDNN(2).", + "PLAIN(0), MKLDNN(1), CUDNN(2), KP(3).", static_cast(library_type))); } } @@ -57,6 +60,8 @@ inline LibraryType StringToLibraryType(const char* ctype) { return LibraryType::kCUDNN; // To be compatible with register macro. // CPU, CUDA, PLAIN are same library type. + } else if (s == std::string("KP")) { + return LibraryType::kKP; } else if (s == std::string("CPU")) { return LibraryType::kPlain; } else if (s == std::string("XPU")) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 426b5ac8ffdde..e7a22d561f41f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1386,6 +1386,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { auto expected_kernel_key = InnerGetExpectedKernelType(ctx); auto kernel_iter = kernels.find(expected_kernel_key); + #ifdef PADDLE_WITH_MKLDNN // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set if (kernel_iter == kernels.end() && @@ -1408,6 +1409,22 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif + +#ifdef PADDLE_WITH_XPU_KP + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + if (platform::is_xpu_place(expected_kernel_key.place_) && + (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { + expected_kernel_key.library_type_ = LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << type_ + << ", using_kernel_key:" << expected_kernel_key; + } +#endif + #ifdef PADDLE_WITH_IPU if (kernel_iter == kernels.end() && platform::is_ipu_place(expected_kernel_key.place_)) { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 9ad13299a3773..991a448386dd7 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -685,6 +685,7 @@ class OperatorWithKernel : public OperatorBase { // new pten kernel, if there is a better design in the future, // we may polish the implementation here mutable bool run_pten_kernel_ = false; + mutable bool run_kp_kernel = false; mutable std::unique_ptr pt_kernel_signature_; mutable std::unique_ptr pt_kernel_; }; diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 5d6df145ab356..8eca35c5cec02 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -24,6 +24,7 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif +#include "paddle/fluid/framework/library_type.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/profiler.h" @@ -145,7 +146,6 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto dygraph_exe_ctx = DygraphExecutionContext( op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs); auto expected_kernel_key = op.GetExpectedKernelType(dygraph_exe_ctx); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; framework::KernelSignature pt_kernel_signature; pten::KernelKey pt_kernel_key; @@ -228,7 +228,31 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } + #endif + +#ifdef PADDLE_WITH_XPU_KP + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(op.Type()); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && + (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { + expected_kernel_key.place_ = platform::XPUPlace(); + expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << op.Type() + << ", using_kernel_key:" << expected_kernel_key; + } +#endif + #ifdef PADDLE_WITH_ASCEND_CL if (kernel_iter == kernels.end() && paddle::platform::is_npu_place(expected_kernel_key.place_)) { diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index f9165e8ee23e3..10c5a2cf46e20 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -190,6 +190,7 @@ class PreparedOp { // new pten kernel, if there is a better design in the future, // we may polish the implementation here bool run_pten_kernel_{false}; + bool run_kp_kernel_{false}; framework::KernelSignature pt_kernel_signature_; pten::Kernel pt_kernel_; }; diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h new file mode 100644 index 0000000000000..aa020593454f8 --- /dev/null +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_XPU_KP +#include +#include +#include + +#include "paddle/fluid/framework/op_kernel_type.h" + +namespace paddle { +namespace platform { + +using vartype = paddle::framework::proto::VarType; +using pOpKernelType = paddle::framework::OpKernelType; +using XPUKernelSet = + std::unordered_set; +using XPUOpMap = std::unordered_map; + +XPUOpMap& get_kp_ops() { + static XPUOpMap s_xpu_kp_kernels{}; + + return s_xpu_kp_kernels; +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index e9b494024bd69..88d803bdf1873 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/xpu/xpu1_op_list.h" #include "paddle/fluid/platform/device/xpu/xpu2_op_list.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" namespace paddle { @@ -74,6 +75,48 @@ bool is_in_xpu_black_list(const std::string& op_name) { return false; } +#ifdef PADDLE_WITH_XPU_KP +bool is_xpu_kp_support_op(const std::string& op_name, + const pOpKernelType& type) { + auto& ops = get_kl1_ops(); + auto v = get_xpu_version(type.place_.device); + if (v == pten::backends::xpu::XPUVersion::XPU2) { + ops = get_kp_ops(); + } + + if (ops.find(op_name) != ops.end() && + ops[op_name].find(type) != ops[op_name].end()) { + return true; + } + return false; +} + +bool is_in_xpu_kpwhite_list(const std::string& op_name) { + static bool inited = false; + static std::unordered_set xpu_kpwhite_list; + static std::mutex s_mtx; + if (!inited) { + std::lock_guard guard(s_mtx); + if (!inited) { + if (std::getenv("XPU_KPWHITE_LIST") != nullptr) { + std::string ops(std::getenv("XPU_KPWHITE_LIST")); + tokenize(ops, ',', &xpu_kpwhite_list); + } + inited = true; + VLOG(3) << "XPU kpwhite List: "; + for (auto iter = xpu_kpwhite_list.begin(); iter != xpu_kpwhite_list.end(); + ++iter) { + VLOG(3) << *iter << " "; + } + } + } + if (xpu_kpwhite_list.find(op_name) != xpu_kpwhite_list.end()) { + return true; + } + return false; +} +#endif + std::vector get_xpu_op_support_type( const std::string& op_name, pten::backends::xpu::XPUVersion version) { std::vector res; @@ -101,7 +144,6 @@ XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version) { } return res; } - } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h index 4c3eb097a147e..a51dfac1892f3 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h @@ -27,6 +27,12 @@ using XPUOpListMap = bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type); bool is_in_xpu_black_list(const std::string& op_name); +#ifdef PADDLE_WITH_XPU_KP +bool is_xpu_kp_support_op(const std::string& op_name, + const pOpKernelType& type); +bool is_in_xpu_kpwhite_list(const std::string& op_name); +#endif + std::vector get_xpu_op_support_type( const std::string& op_name, pten::backends::xpu::XPUVersion version); XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version); diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 4a6bfe67ba597..d195b5c04ace7 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -700,16 +700,16 @@ PADDLE_DEFINE_EXPORTED_bool(run_pten_kernel, true, "It controls whether to use pten kernel"); /** - * Pt kernel related FLAG + * KP kernel related FLAG * Name: FLAGS_run_kp_kernel * Since Version: 2.3.0 * Value Range: bool, default=false - * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in - * the Op for XPU2. + * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in the + * Op. * Note: */ -PADDLE_DEFINE_EXPORTED_bool(run_kp_kernel, true, - "It controls whether to use kp kernel for xpu2"); +PADDLE_DEFINE_EXPORTED_bool(run_kp_kernel, false, + "It controls whether to run PaddlePaddle using KP"); /** * Distributed related FLAG